FastAPI Model Serving — Production REST API
FastAPI is the de-facto standard for deploying ML models as REST APIs: async by default, automatic OpenAPI docs, Pydantic validation, and sub-millisecond overhead. The patterns you learn here apply to serving any model — sentiment classifiers, object detectors, LLMs, or embedding models.
Production FastAPI Model Server
from fastapi import FastAPI, HTTPException, BackgroundTasks
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field
from contextlib import asynccontextmanager
from typing import Optional
import torch
from transformers import pipeline
import time, uuid, logging
from prometheus_fastapi_instrumentator import Instrumentator
# Root logger config for the whole process; module logger carries the module name.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# MODEL LOADING -- lazy on startup, not on each request
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# Process-wide model registry, populated by the lifespan hook below and
# read by the endpoints; cleared again on shutdown.
models = {}
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Startup/shutdown hook: load every pipeline once, release on exit.

    Loading here (rather than per request or at import time) means the
    server only starts accepting traffic after all models are in memory,
    and the weights are shared by every request in this worker.
    """
    logger.info("Loading models...")
    gpu_index = 0 if torch.cuda.is_available() else -1  # -1 selects CPU for HF pipelines
    models["sentiment"] = pipeline(
        "sentiment-analysis",
        model="distilbert-base-uncased-finetuned-sst-2-english",
        device=gpu_index,
        batch_size=32,  # batch multiple requests together
        truncation=True,
        max_length=512,
    )
    models["ner"] = pipeline(
        "ner",
        model="dslim/bert-base-NER",
        device=gpu_index,
        aggregation_strategy="simple",
    )
    logger.info("All models loaded successfully.")
    yield  # server is running
    models.clear()
    logger.info("Models released.")
# Application object; metadata feeds the auto-generated OpenAPI docs at /docs.
app = FastAPI(
title="AI Model Serving API",
description="Production-grade model inference API",
version="1.0.0",
lifespan=lifespan,
)
# CORS -- required for browser-based clients
app.add_middleware(CORSMiddleware,
allow_origins=["*"], # in production: specify exact domains
allow_credentials=True, allow_methods=["*"], allow_headers=["*"],
)
# Prometheus metrics endpoint (/metrics)
Instrumentator().instrument(app).expose(app)
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# API MODELS
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
class SentimentRequest(BaseModel):
    """Request body for POST /sentiment: a bounded batch of raw texts."""

    # List-level min/max_length caps the batch at 1-32 texts per call.
    texts: list[str] = Field(..., min_length=1, max_length=32, description="List of texts to classify")
class SentimentResponse(BaseModel):
    """Response body for POST /sentiment."""

    # Short correlation id, also emitted in the server log lines.
    request_id: str
    # One {"label": ..., "score": ...} dict per input text, in input order.
    results: list[dict]
    # Handler wall time in milliseconds.
    latency_ms: float
class NERRequest(BaseModel):
    """Request body for POST /ner: a single non-empty text, capped at 5000 chars."""

    text: str = Field(..., min_length=1, max_length=5000)
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# ENDPOINTS
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
@app.get("/health")
async def health_check() -> dict:
    """Liveness probe: report loaded model names and the compute device."""
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    return {
        "status": "healthy",
        "models_loaded": list(models.keys()),
        "device": device_name,
    }
@app.post("/sentiment", response_model=SentimentResponse)
def predict_sentiment(request: SentimentRequest) -> SentimentResponse:
    """Classify a batch of texts with the shared sentiment pipeline.

    Declared as a plain ``def`` (not ``async def``): model inference is
    CPU/GPU-bound and would block the event loop inside a coroutine, so
    we let FastAPI run this handler in its threadpool instead.

    Raises:
        HTTPException: 500 if the pipeline fails on the given batch.
    """
    request_id = uuid.uuid4().hex[:8]  # short correlation id for log lines
    start = time.perf_counter()  # monotonic clock; immune to wall-clock jumps
    try:
        results = models["sentiment"](request.texts)  # batched inference
        # Normalize pipeline output: lowercase labels, plain-float scores
        # (float() guards against numpy scalars, which the JSON encoder rejects).
        normalized = [
            {"label": r["label"].lower(), "score": round(float(r["score"]), 4)}
            for r in results
        ]
    except Exception as e:
        logger.error(f"[{request_id}] Sentiment error: {e}")
        # Keep internals out of the client response; details live in the log.
        raise HTTPException(status_code=500, detail="Sentiment inference failed")
    latency = (time.perf_counter() - start) * 1000
    logger.info(f"[{request_id}] Sentiment batch={len(request.texts)} latency={latency:.1f}ms")
    return SentimentResponse(request_id=request_id, results=normalized, latency_ms=round(latency, 2))
@app.post("/ner")
def predict_ner(request: NERRequest) -> dict:
    """Extract named entities from a single text.

    Sync ``def`` so the blocking model call runs in FastAPI's threadpool
    rather than stalling the event loop (consistent with /sentiment).

    Raises:
        HTTPException: 500 if the pipeline fails on the given text.
    """
    start = time.perf_counter()
    try:
        entities = models["ner"](request.text)
    except Exception as e:
        logger.error(f"NER error: {e}")
        raise HTTPException(status_code=500, detail="NER inference failed")
    return {
        # float(...) is required: aggregated pipeline scores are
        # numpy.float32, which the default JSON encoder cannot serialize.
        "entities": [
            {"text": e["word"], "type": e["entity_group"], "score": round(float(e["score"]), 4)}
            for e in entities
        ],
        "latency_ms": round((time.perf_counter() - start) * 1000, 2),
    }
# Run: uvicorn main:app --host 0.0.0.0 --port 8000 --workers 4
# Test: curl -X POST http://localhost:8000/sentiment -H "Content-Type: application/json" \
# -d '{"texts": ["Great product!", "Terrible experience."]}'
Tip
Practice FastAPI Model Serving Production REST API in small, isolated examples before integrating into larger projects. Breaking concepts into small experiments builds genuine understanding faster than reading alone.
REST is the standard for modern web APIs
Practice Task
Note
Practice Task — (1) Write a working example of FastAPI Model Serving Production REST API from scratch without looking at notes. (2) Modify it to handle an edge case (empty input, null value, or error state). (3) Share your solution in the Priygop community for feedback.
Quick Quiz
Common Mistake
Warning
A common mistake with FastAPI Model Serving Production REST API is skipping edge case testing — empty inputs, null values, and unexpected data types. Always validate boundary conditions to write robust, production-ready AI code.