FastAPI — Serving ML Models as REST APIs
FastAPI is one of the most popular frameworks for serving Python ML models as REST APIs: it handles requests asynchronously, auto-generates Swagger docs, and validates requests using Pydantic models. The typical pattern is to load the model once at startup, define input/output Pydantic schemas, and return predictions with confidence scores. This is the standard architecture at many ML-heavy companies, Uber and Netflix among them.
FastAPI ML Inference Server
# COMPLETE FASTAPI ML SERVING EXAMPLE
# File: main.py (run with: uvicorn main:app --reload --port 8000)
# Dependencies: fastapi, uvicorn, pydantic v2, joblib, pandas,
# plus whatever library trained the model (e.g. scikit-learn).
import json
import logging
import time
from contextlib import asynccontextmanager

import joblib
import pandas as pd
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, ConfigDict, Field, field_validator
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
# STARTUP: LOAD MODEL ONCE (not on every request!)
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
MODEL = None
META = None
INFERENCE_COUNT = 0
TOTAL_LATENCY_MS = 0
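# NOTE: these counters are per-process and reset on restart; with multiple
# uvicorn workers each process tracks its own numbers. Production services
# typically export metrics to something like Prometheus instead.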
@asynccontextmanager
async def lifespan(app: FastAPI):
    global MODEL, META
    MODEL = joblib.load("credit_classifier_v1.0.0.joblib")
    with open("credit_classifier_v1.0.0.json") as f:
        META = json.load(f)
    logging.info(f"Model loaded: v{META['version']}, test AUC={META['metrics']['test_auc']}")
    yield  # startup done, server runs
    # Shutdown cleanup:
    MODEL = None
app = FastAPI(
    title="Credit Default Prediction API",
    version="1.0.0",
    description="Machine Learning API to predict loan default probability.",
    lifespan=lifespan,
)
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
# INPUT SCHEMA (Pydantic validates and documents automatically)
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
class CreditRequest(BaseModel):
    age: float = Field(..., ge=18, le=100, description="Age in years")
    income: float = Field(..., ge=0, description="Annual income in USD")
    credit: float = Field(..., ge=300, le=850, description="Credit score (FICO)")
    loan_amt: float = Field(..., ge=100, description="Loan amount in USD")
    educ: str = Field(..., description="Education level: hs, bachelor, master, phd")

    @field_validator("educ")  # Pydantic v2 (was `validator` in v1)
    @classmethod
    def validate_education(cls, v: str) -> str:
        allowed = {"hs", "bachelor", "master", "phd"}
        if v not in allowed:
            raise ValueError(f"educ must be one of {allowed}, got '{v}'")
        return v
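# Invalid payloads never reach the endpoint: FastAPI turns Pydantic
# validation failures into an HTTP 422 response with a body shaped
# roughly like {"detail": [{"loc": ["body", "age"], "msg": "...", "type": "..."}]}.
# (Exact messages vary by Pydantic version.)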
class CreditResponse(BaseModel):
    # Pydantic v2 reserves the "model_" prefix; opt out so `model_version` is allowed.
    model_config = ConfigDict(protected_namespaces=())

    default_probability: float = Field(..., description="Probability of default (0.0 to 1.0)")
    risk_tier: str = Field(..., description="LOW / MEDIUM / HIGH")
    recommendation: str = Field(..., description="Business action")
    model_version: str
    latency_ms: float

class BatchCreditRequest(BaseModel):
    applicants: list[CreditRequest]
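# Example /predict/batch payload (shape follows BatchCreditRequest; values illustrative):
# {"applicants": [
#     {"age": 32, "income": 48000, "credit": 640, "loan_amt": 15000, "educ": "bachelor"},
#     {"age": 55, "income": 91000, "credit": 780, "loan_amt": 30000, "educ": "master"}
# ]}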
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
# ENDPOINTS
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
@app.get("/health")
async def health_check():
"""Health check -- used by Kubernetes liveness probes."""
return {
"status": "healthy",
"model_version": META["version"] if META else "not loaded",
"inferences": INFERENCE_COUNT,
"avg_latency_ms": round(TOTAL_LATENCY_MS / max(INFERENCE_COUNT, 1), 2),
}
@app.post("/predict", response_model=CreditResponse)
async def predict(request: CreditRequest):
"""Single applicant credit default probability."""
global INFERENCE_COUNT, TOTAL_LATENCY_MS
if MODEL is None:
raise HTTPException(503, "Model not loaded")
t0 = time.perf_counter()
# Feature engineering (same as training!)
features = pd.DataFrame([{
"age": request.age, "income": request.income,
"credit": request.credit, "loan_amt": request.loan_amt,
"educ": request.educ,
}])
prob = float(MODEL.predict_proba(features)[0, 1])
latency = (time.perf_counter() - t0) * 1000
INFERENCE_COUNT += 1
TOTAL_LATENCY_MS += latency
# Business logic: map probability to tier
if prob > 0.6:
tier, action = "HIGH", "Decline application"
elif prob > 0.25:
tier, action = "MEDIUM", "Manual review required"
else:
tier, action = "LOW", "Approve with standard terms"
return CreditResponse(
default_probability=round(prob, 4),
risk_tier=tier,
recommendation=action,
model_version=META["version"],
latency_ms=round(latency, 2),
)
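# NOTE: predict_proba above is synchronous, CPU-bound work inside an `async def`
# endpoint, so it blocks the event loop while it runs. For slow models,
# declaring the endpoint with plain `def` (FastAPI runs sync endpoints in a
# threadpool) is usually the safer choice.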
@app.post("/predict/batch")
async def predict_batch(request: BatchCreditRequest):
"""Batch prediction for multiple applicants."""
features = pd.DataFrame([a.dict() for a in request.applicants])
probs = MODEL.predict_proba(features)[:, 1]
return {"predictions": [round(p, 4) for p in probs]}
@app.get("/model/info")
async def model_info():
"""Return model card metadata."""
return META
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
# RUN: uvicorn main:app --reload --port 8000
# DOCS: http://localhost:8000/docs
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
# EXAMPLE CLIENT:
# import requests
# resp = requests.post("http://localhost:8000/predict", json={
# "age": 32, "income": 48000, "credit": 640,
# "loan_amt": 15000, "educ": "bachelor"
# })
# print(resp.json())
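# Example response body (illustrative values; shape follows CreditResponse):
# {"default_probability": 0.1873, "risk_tier": "LOW",
#  "recommendation": "Approve with standard terms",
#  "model_version": "1.0.0", "latency_ms": 3.42}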
Tip
Practice serving ML models with FastAPI in small, isolated examples before integrating into larger projects. Breaking concepts into small experiments builds genuine understanding faster than reading alone. The test sketch below shows one way to start.

Tip
80% of ML work is data preparation: garbage in means garbage out.
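One low-friction way to practice is FastAPI's TestClient, which exercises the app in-process with no running server. A minimal sketch, assuming the main.py above is importable and the model artifacts exist on disk (test_api.py is a hypothetical file name):

# File: test_api.py (run with: pytest test_api.py)
from fastapi.testclient import TestClient

from main import app

def test_predict_valid():
    # Using TestClient as a context manager runs the lifespan startup,
    # so the model is loaded before any request is sent.
    with TestClient(app) as client:
        resp = client.post("/predict", json={
            "age": 32, "income": 48000, "credit": 640,
            "loan_amt": 15000, "educ": "bachelor",
        })
        assert resp.status_code == 200
        assert 0.0 <= resp.json()["default_probability"] <= 1.0

def test_predict_rejects_bad_education():
    with TestClient(app) as client:
        resp = client.post("/predict", json={
            "age": 32, "income": 48000, "credit": 640,
            "loan_amt": 15000, "educ": "kindergarten",
        })
        assert resp.status_code == 422  # rejected by the Pydantic validator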
Practice Task
(1) Rebuild the FastAPI model-serving example above from scratch without looking at notes. (2) Modify it to handle an edge case (empty input, null value, or error state). (3) Share your solution in the Priygop community for feedback.
Warning
A common mistake when serving ML models with FastAPI is skipping edge case testing: empty inputs, null values, and unexpected data types. Always validate boundary conditions to write robust, production-ready ML code.