FastAPI Model Serving — Production REST API
FastAPI is the de-facto standard for deploying ML models as REST APIs: async by default, automatic OpenAPI docs, Pydantic validation, and sub-millisecond overhead. The patterns you learn here apply to serving any model — sentiment classifiers, object detectors, LLMs, or embedding models.
Production FastAPI Model Server
from fastapi import FastAPI, HTTPException, BackgroundTasks
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field
from contextlib import asynccontextmanager
from typing import Optional
import torch
from transformers import pipeline
import time, uuid, logging
from prometheus_fastapi_instrumentator import Instrumentator
# Root logger config for the whole process; module logger carries the module name.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# MODEL LOADING -- lazy on startup, not on each request
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# Process-wide model registry, populated by the lifespan hook below and
# read by the endpoints; cleared again on shutdown.
models = {}
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Startup/shutdown hook: load every pipeline once, release on exit.

    Loading here (rather than per request or at import time) means the
    server only starts accepting traffic after all models are in memory,
    and the weights are shared by every request in this worker.
    """
    logger.info("Loading models...")
    gpu_index = 0 if torch.cuda.is_available() else -1  # -1 selects CPU for HF pipelines
    models["sentiment"] = pipeline(
        "sentiment-analysis",
        model="distilbert-base-uncased-finetuned-sst-2-english",
        device=gpu_index,
        batch_size=32,  # batch multiple requests together
        truncation=True,
        max_length=512,
    )
    models["ner"] = pipeline(
        "ner",
        model="dslim/bert-base-NER",
        device=gpu_index,
        aggregation_strategy="simple",
    )
    logger.info("All models loaded successfully.")
    yield  # server is running
    models.clear()
    logger.info("Models released.")
# Application object; metadata feeds the auto-generated OpenAPI docs at /docs.
app = FastAPI(
title="AI Model Serving API",
description="Production-grade model inference API",
version="1.0.0",
lifespan=lifespan,
)
# CORS -- required for browser-based clients
app.add_middleware(CORSMiddleware,
allow_origins=["*"], # in production: specify exact domains
allow_credentials=True, allow_methods=["*"], allow_headers=["*"],
)
# Prometheus metrics endpoint (/metrics)
Instrumentator().instrument(app).expose(app)
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# API MODELS
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
class SentimentRequest(BaseModel):
    """Request body for POST /sentiment: a bounded batch of raw texts."""

    # List-level min/max_length caps the batch at 1-32 texts per call.
    texts: list[str] = Field(..., min_length=1, max_length=32, description="List of texts to classify")
class SentimentResponse(BaseModel):
    """Response body for POST /sentiment."""

    # Short correlation id, also emitted in the server log lines.
    request_id: str
    # One {"label": ..., "score": ...} dict per input text, in input order.
    results: list[dict]
    # Handler wall time in milliseconds.
    latency_ms: float
class NERRequest(BaseModel):
    """Request body for POST /ner: a single non-empty text, capped at 5000 chars."""

    text: str = Field(..., min_length=1, max_length=5000)
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# ENDPOINTS
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
@app.get("/health")
async def health_check() -> dict:
    """Liveness probe: report loaded model names and the compute device."""
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    return {
        "status": "healthy",
        "models_loaded": list(models.keys()),
        "device": device_name,
    }
@app.post("/sentiment", response_model=SentimentResponse)
def predict_sentiment(request: SentimentRequest) -> SentimentResponse:
    """Classify a batch of texts with the shared sentiment pipeline.

    Declared as a plain ``def`` (not ``async def``): model inference is
    CPU/GPU-bound and would block the event loop inside a coroutine, so
    we let FastAPI run this handler in its threadpool instead.

    Raises:
        HTTPException: 500 if the pipeline fails on the given batch.
    """
    request_id = uuid.uuid4().hex[:8]  # short correlation id for log lines
    start = time.perf_counter()  # monotonic clock; immune to wall-clock jumps
    try:
        results = models["sentiment"](request.texts)  # batched inference
        # Normalize pipeline output: lowercase labels, plain-float scores
        # (float() guards against numpy scalars, which the JSON encoder rejects).
        normalized = [
            {"label": r["label"].lower(), "score": round(float(r["score"]), 4)}
            for r in results
        ]
    except Exception as e:
        logger.error(f"[{request_id}] Sentiment error: {e}")
        # Keep internals out of the client response; details live in the log.
        raise HTTPException(status_code=500, detail="Sentiment inference failed")
    latency = (time.perf_counter() - start) * 1000
    logger.info(f"[{request_id}] Sentiment batch={len(request.texts)} latency={latency:.1f}ms")
    return SentimentResponse(request_id=request_id, results=normalized, latency_ms=round(latency, 2))
@app.post("/ner")
def predict_ner(request: NERRequest) -> dict:
    """Extract named entities from a single text.

    Sync ``def`` so the blocking model call runs in FastAPI's threadpool
    rather than stalling the event loop (consistent with /sentiment).

    Raises:
        HTTPException: 500 if the pipeline fails on the given text.
    """
    start = time.perf_counter()
    try:
        entities = models["ner"](request.text)
    except Exception as e:
        logger.error(f"NER error: {e}")
        raise HTTPException(status_code=500, detail="NER inference failed")
    return {
        # float(...) is required: aggregated pipeline scores are
        # numpy.float32, which the default JSON encoder cannot serialize.
        "entities": [
            {"text": e["word"], "type": e["entity_group"], "score": round(float(e["score"]), 4)}
            for e in entities
        ],
        "latency_ms": round((time.perf_counter() - start) * 1000, 2),
    }
# Run: uvicorn main:app --host 0.0.0.0 --port 8000 --workers 4
# Test: curl -X POST http://localhost:8000/sentiment -H "Content-Type: application/json" \
# -d '{"texts": ["Great product!", "Terrible experience."]}'
Tip
Practice FastAPI Model Serving Production REST API in small, isolated examples before integrating into larger projects. Breaking concepts into small experiments builds genuine understanding faster than reading alone.
REST is the standard for modern web APIs
Practice Task
Note
Practice Task — (1) Write a working example of FastAPI Model Serving Production REST API from scratch without looking at notes. (2) Modify it to handle an edge case (empty input, null value, or error state). (3) Share your solution in the Priygop community for feedback.
Quick Quiz
Common Mistake
Warning
A common mistake with FastAPI Model Serving Production REST API is skipping edge case testing — empty inputs, null values, and unexpected data types. Always validate boundary conditions to write robust, production-ready AI code.