A/B Testing — Evaluating Models in Production
Before replacing a production model with a new version, you need to prove the new model is better on real traffic with statistical confidence. Shadow mode (run both models, log both predictions, only serve Model A's output) lets you evaluate without risk. A/B testing (route X% of traffic to each model) measures real business impact. The statistical test determines if the difference is significant or just noise.
Shadow Mode and A/B Model Testing
import numpy as np
import pandas as pd
from scipy.stats import mannwhitneyu
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import hashlib
np.random.seed(42)
N = 3000
X = pd.DataFrame({
"age": np.random.normal(38, 12, N).clip(18, 75),
"income": np.random.exponential(55000, N).clip(15000, 200000),
"credit": np.random.normal(680, 80, N).clip(300, 850),
})
# Labels need real signal tied to the features; with purely random labels both
# models would score AUC ~0.5 and the comparison below would be meaningless.
logit = -2.0 - (X["credit"] - 680) / 100 + (55000 - X["income"]) / 100000
y = (np.random.rand(N) < 1 / (1 + np.exp(-logit))).astype(int)  # pandas Series, so .iloc works later
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# MODEL A: existing production model
model_a = Pipeline([("sc", StandardScaler()), ("m", GradientBoostingClassifier(n_estimators=100, random_state=42))])
model_a.fit(X_train, y_train)
# MODEL B: new challenger model
model_b = Pipeline([("sc", StandardScaler()), ("m", GradientBoostingClassifier(n_estimators=200, learning_rate=0.05, random_state=42))])
model_b.fit(X_train, y_train)
probs_a = model_a.predict_proba(X_test)[:, 1]
probs_b = model_b.predict_proba(X_test)[:, 1]
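# Offline sanity check on the holdout before touching any traffic: if B cannot
# beat A here, there is little reason to run a live experiment at all
print(f"Offline AUC: A={roc_auc_score(y_test, probs_a):.4f}, B={roc_auc_score(y_test, probs_b):.4f}")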
# SHADOW MODE: route all traffic to A, but also run B silently
def shadow_inference(features: pd.DataFrame, model_a, model_b, shadow_log: list) -> float:
"""Serve A's prediction, log both."""
prob_a = model_a.predict_proba(features)[0, 1]
prob_b = model_b.predict_proba(features)[0, 1]
shadow_log.append({"prob_a": prob_a, "prob_b": prob_b})
return prob_a # only A goes to customer
shadow_log: list = []
for i in range(len(X_test)):
shadow_inference(X_test.iloc[[i]], model_a, model_b, shadow_log)
shadow_df = pd.DataFrame(shadow_log)
print(f"Shadow mode comparison ({len(shadow_df)} requests):")
print(f" Model A mean prob: {shadow_df['prob_a'].mean():.4f}")
print(f" Model B mean prob: {shadow_df['prob_b'].mean():.4f}")
print(f" Agreement rate: {(abs(shadow_df['prob_a'] - shadow_df['prob_b']) < 0.1).mean():.1%}")
# A/B TEST: route 50% of traffic to each
def ab_route(customer_id: str, split_pct: float = 0.5) -> str:
"""Deterministic routing: same customer always gets same model."""
h = int(hashlib.md5(customer_id.encode()).hexdigest(), 16)
return "A" if (h % 100) < (split_pct * 100) else "B"
# Simulate 2000 A/B test requests with real outcomes (after 30 days)
ab_results = []
for i in range(2000):
customer_id = f"cust_{i:05d}"
group = ab_route(customer_id)
model = model_a if group == "A" else model_b
row = X_test.iloc[i % len(X_test)]
prob = model.predict_proba(pd.DataFrame([row]))[0, 1]
actual = y_test.iloc[i % len(y_test)]
ab_results.append({"group": group, "pred_prob": prob, "actual": actual})
ab_df = pd.DataFrame(ab_results)
auc_a = roc_auc_score(ab_df[ab_df["group"]=="A"]["actual"], ab_df[ab_df["group"]=="A"]["pred_prob"])
auc_b = roc_auc_score(ab_df[ab_df["group"]=="B"]["actual"], ab_df[ab_df["group"]=="B"]["pred_prob"])
print(f"\nA/B Test Results:")
print(f" Group A (n={ab_df['group'].eq('A').sum()}): AUC = {auc_a:.4f}")
print(f" Group B (n={ab_df['group'].eq('B').sum()}): AUC = {auc_b:.4f}")
print(f" Lift: {(auc_b - auc_a)*100:.2f}% AUC points")
# STATISTICAL SIGNIFICANCE TEST: Mann-Whitney U on the two groups' predicted scores.
# Note: this tests whether A and B produce different score distributions, not whether
# their AUCs differ; for a direct AUC comparison use a bootstrap or DeLong test.
probs_a_ab = ab_df[ab_df["group"]=="A"]["pred_prob"].values
probs_b_ab = ab_df[ab_df["group"]=="B"]["pred_prob"].values
u_stat, p_value = mannwhitneyu(probs_b_ab, probs_a_ab, alternative="two-sided")
significant = p_value < 0.05
print(f" Mann-Whitney U p-value: {p_value:.4f}")
print(f" Statistically significant (alpha=0.05): {significant}")
if significant and auc_b > auc_a:
print(" DECISION: Promote Model B to production!")
else:
print(" DECISION: Keep Model A (insufficient evidence to switch)")Tip
Tip
Practice A/B testing and shadow-mode evaluation in small, isolated examples before integrating them into larger projects. Breaking concepts into small experiments builds genuine understanding faster than reading alone.
Practice Task
(1) Write a working example of A/B testing for evaluating models in production from scratch without looking at notes. (2) Modify it to handle an edge case (empty input, null value, or error state). (3) Share your solution in the Priygop community for feedback.
Warning
A common mistake with A/B testing in production is skipping edge case testing — empty inputs, null values, and unexpected data types. Always validate boundary conditions to write robust, production-ready ML code.
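As a concrete instance of that advice, here is a minimal, hypothetical guard around the ab_route function from the listing above; the fall-back-to-A behavior on bad input is an assumption, not a universal rule.
def ab_route_safe(customer_id: str, split_pct: float = 0.5) -> str:
    """Hypothetical wrapper: validate inputs before routing."""
    if not isinstance(customer_id, str) or not customer_id:
        return "A"  # assumed policy: serve the incumbent model on bad input
    if not 0.0 <= split_pct <= 1.0:
        raise ValueError(f"split_pct must be in [0, 1], got {split_pct}")
    return ab_route(customer_id, split_pct)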