FastAPI — Serving ML Models as REST APIs
FastAPI is one of the most popular frameworks for serving Python ML models as REST APIs: it handles requests asynchronously, auto-generates Swagger docs, and validates requests using Pydantic models. The typical pattern is to load the model once at startup, define input/output Pydantic schemas, and return predictions with confidence scores. This is the standard architecture at many ML-heavy companies, Uber and Netflix among them.
FastAPI ML Inference Server
# COMPLETE FASTAPI ML SERVING EXAMPLE
# File: main.py (run with: uvicorn main:app --reload --port 8000)
# Dependencies: fastapi, uvicorn, pydantic v2, joblib, pandas,
# plus whatever library trained the model (e.g. scikit-learn).
import json
import logging
import time
from contextlib import asynccontextmanager

import joblib
import pandas as pd
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, ConfigDict, Field, field_validator
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
# STARTUP: LOAD MODEL ONCE (not on every request!)
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
MODEL = None
META = None
INFERENCE_COUNT = 0
TOTAL_LATENCY_MS = 0
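# NOTE: these counters are per-process and reset on restart; with multiple
# uvicorn workers each process tracks its own numbers. Production services
# typically export metrics to something like Prometheus instead.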
@asynccontextmanager
async def lifespan(app: FastAPI):
    global MODEL, META
    MODEL = joblib.load("credit_classifier_v1.0.0.joblib")
    with open("credit_classifier_v1.0.0.json") as f:
        META = json.load(f)
    logging.info(f"Model loaded: v{META['version']}, test AUC={META['metrics']['test_auc']}")
    yield  # startup done, server runs
    # Shutdown cleanup:
    MODEL = None
app = FastAPI(
    title="Credit Default Prediction API",
    version="1.0.0",
    description="Machine Learning API to predict loan default probability.",
    lifespan=lifespan,
)
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
# INPUT SCHEMA (Pydantic validates and documents automatically)
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
class CreditRequest(BaseModel):
    age: float = Field(..., ge=18, le=100, description="Age in years")
    income: float = Field(..., ge=0, description="Annual income in USD")
    credit: float = Field(..., ge=300, le=850, description="Credit score (FICO)")
    loan_amt: float = Field(..., ge=100, description="Loan amount in USD")
    educ: str = Field(..., description="Education level: hs, bachelor, master, phd")

    @field_validator("educ")  # Pydantic v2 (was `validator` in v1)
    @classmethod
    def validate_education(cls, v: str) -> str:
        allowed = {"hs", "bachelor", "master", "phd"}
        if v not in allowed:
            raise ValueError(f"educ must be one of {allowed}, got '{v}'")
        return v
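# Invalid payloads never reach the endpoint: FastAPI turns Pydantic
# validation failures into an HTTP 422 response with a body shaped
# roughly like {"detail": [{"loc": ["body", "age"], "msg": "...", "type": "..."}]}.
# (Exact messages vary by Pydantic version.)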
class CreditResponse(BaseModel):
    # Pydantic v2 reserves the "model_" prefix; opt out so `model_version` is allowed.
    model_config = ConfigDict(protected_namespaces=())

    default_probability: float = Field(..., description="Probability of default (0.0 to 1.0)")
    risk_tier: str = Field(..., description="LOW / MEDIUM / HIGH")
    recommendation: str = Field(..., description="Business action")
    model_version: str
    latency_ms: float

class BatchCreditRequest(BaseModel):
    applicants: list[CreditRequest]
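# Example /predict/batch payload (shape follows BatchCreditRequest; values illustrative):
# {"applicants": [
#     {"age": 32, "income": 48000, "credit": 640, "loan_amt": 15000, "educ": "bachelor"},
#     {"age": 55, "income": 91000, "credit": 780, "loan_amt": 30000, "educ": "master"}
# ]}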
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
# ENDPOINTS
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
@app.get("/health")
async def health_check():
"""Health check -- used by Kubernetes liveness probes."""
return {
"status": "healthy",
"model_version": META["version"] if META else "not loaded",
"inferences": INFERENCE_COUNT,
"avg_latency_ms": round(TOTAL_LATENCY_MS / max(INFERENCE_COUNT, 1), 2),
}
@app.post("/predict", response_model=CreditResponse)
async def predict(request: CreditRequest):
"""Single applicant credit default probability."""
global INFERENCE_COUNT, TOTAL_LATENCY_MS
if MODEL is None:
raise HTTPException(503, "Model not loaded")
t0 = time.perf_counter()
# Feature engineering (same as training!)
features = pd.DataFrame([{
"age": request.age, "income": request.income,
"credit": request.credit, "loan_amt": request.loan_amt,
"educ": request.educ,
}])
prob = float(MODEL.predict_proba(features)[0, 1])
latency = (time.perf_counter() - t0) * 1000
INFERENCE_COUNT += 1
TOTAL_LATENCY_MS += latency
# Business logic: map probability to tier
if prob > 0.6:
tier, action = "HIGH", "Decline application"
elif prob > 0.25:
tier, action = "MEDIUM", "Manual review required"
else:
tier, action = "LOW", "Approve with standard terms"
return CreditResponse(
default_probability=round(prob, 4),
risk_tier=tier,
recommendation=action,
model_version=META["version"],
latency_ms=round(latency, 2),
)
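# NOTE: predict_proba above is synchronous, CPU-bound work inside an `async def`
# endpoint, so it blocks the event loop while it runs. For slow models,
# declaring the endpoint with plain `def` (FastAPI runs sync endpoints in a
# threadpool) is usually the safer choice.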
@app.post("/predict/batch")
async def predict_batch(request: BatchCreditRequest):
"""Batch prediction for multiple applicants."""
features = pd.DataFrame([a.dict() for a in request.applicants])
probs = MODEL.predict_proba(features)[:, 1]
return {"predictions": [round(p, 4) for p in probs]}
@app.get("/model/info")
async def model_info():
"""Return model card metadata."""
return META
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
# RUN: uvicorn main:app --reload --port 8000
# DOCS: http://localhost:8000/docs
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
# EXAMPLE CLIENT:
# import requests
# resp = requests.post("http://localhost:8000/predict", json={
# "age": 32, "income": 48000, "credit": 640,
# "loan_amt": 15000, "educ": "bachelor"
# })
# print(resp.json())
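# Example response body (illustrative values; shape follows CreditResponse):
# {"default_probability": 0.1873, "risk_tier": "LOW",
#  "recommendation": "Approve with standard terms",
#  "model_version": "1.0.0", "latency_ms": 3.42}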
Tip
Practice serving ML models with FastAPI in small, isolated examples before integrating into larger projects. Breaking concepts into small experiments builds genuine understanding faster than reading alone. The test sketch below shows one way to start.

Tip
80% of ML work is data preparation: garbage in means garbage out.
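One low-friction way to practice is FastAPI's TestClient, which exercises the app in-process with no running server. A minimal sketch, assuming the main.py above is importable and the model artifacts exist on disk (test_api.py is a hypothetical file name):

# File: test_api.py (run with: pytest test_api.py)
from fastapi.testclient import TestClient

from main import app

def test_predict_valid():
    # Using TestClient as a context manager runs the lifespan startup,
    # so the model is loaded before any request is sent.
    with TestClient(app) as client:
        resp = client.post("/predict", json={
            "age": 32, "income": 48000, "credit": 640,
            "loan_amt": 15000, "educ": "bachelor",
        })
        assert resp.status_code == 200
        assert 0.0 <= resp.json()["default_probability"] <= 1.0

def test_predict_rejects_bad_education():
    with TestClient(app) as client:
        resp = client.post("/predict", json={
            "age": 32, "income": 48000, "credit": 640,
            "loan_amt": 15000, "educ": "kindergarten",
        })
        assert resp.status_code == 422  # rejected by the Pydantic validator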
Practice Task
(1) Rebuild the FastAPI model-serving example above from scratch without looking at notes. (2) Modify it to handle an edge case (empty input, null value, or error state). (3) Share your solution in the Priygop community for feedback.
Warning
A common mistake when serving ML models with FastAPI is skipping edge case testing: empty inputs, null values, and unexpected data types. Always validate boundary conditions to write robust, production-ready ML code.