A/B Testing — Evaluating Models in Production
Before replacing a production model with a new version, you need to prove the new model is better on real traffic with statistical confidence. Shadow mode (run both models, log both predictions, only serve Model A's output) lets you evaluate without risk. A/B testing (route X% of traffic to each model) measures real business impact. The statistical test determines if the difference is significant or just noise.
Shadow Mode and A/B Model Testing
import numpy as np
import pandas as pd
from scipy.stats import mannwhitneyu
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import hashlib
np.random.seed(42)
N = 3000
X = pd.DataFrame({
"age": np.random.normal(38, 12, N).clip(18, 75),
"income": np.random.exponential(55000, N).clip(15000, 200000),
"credit": np.random.normal(680, 80, N).clip(300, 850),
})
# Labels need real signal tied to the features; with purely random labels both
# models would score AUC ~0.5 and the comparison below would be meaningless.
logit = -2.0 - (X["credit"] - 680) / 100 + (55000 - X["income"]) / 100000
y = (np.random.rand(N) < 1 / (1 + np.exp(-logit))).astype(int)  # pandas Series, so .iloc works later
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# MODEL A: existing production model
model_a = Pipeline([("sc", StandardScaler()), ("m", GradientBoostingClassifier(n_estimators=100, random_state=42))])
model_a.fit(X_train, y_train)
# MODEL B: new challenger model
model_b = Pipeline([("sc", StandardScaler()), ("m", GradientBoostingClassifier(n_estimators=200, learning_rate=0.05, random_state=42))])
model_b.fit(X_train, y_train)
probs_a = model_a.predict_proba(X_test)[:, 1]
probs_b = model_b.predict_proba(X_test)[:, 1]
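# Offline sanity check on the holdout before touching any traffic: if B cannot
# beat A here, there is little reason to run a live experiment at all
print(f"Offline AUC: A={roc_auc_score(y_test, probs_a):.4f}, B={roc_auc_score(y_test, probs_b):.4f}")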
# SHADOW MODE: route all traffic to A, but also run B silently
def shadow_inference(features: pd.DataFrame, model_a, model_b, shadow_log: list) -> float:
"""Serve A's prediction, log both."""
prob_a = model_a.predict_proba(features)[0, 1]
prob_b = model_b.predict_proba(features)[0, 1]
shadow_log.append({"prob_a": prob_a, "prob_b": prob_b})
return prob_a # only A goes to customer
shadow_log: list = []
for i in range(len(X_test)):
shadow_inference(X_test.iloc[[i]], model_a, model_b, shadow_log)
shadow_df = pd.DataFrame(shadow_log)
print(f"Shadow mode comparison ({len(shadow_df)} requests):")
print(f" Model A mean prob: {shadow_df['prob_a'].mean():.4f}")
print(f" Model B mean prob: {shadow_df['prob_b'].mean():.4f}")
print(f" Agreement rate: {(abs(shadow_df['prob_a'] - shadow_df['prob_b']) < 0.1).mean():.1%}")
# A/B TEST: route 50% of traffic to each
def ab_route(customer_id: str, split_pct: float = 0.5) -> str:
"""Deterministic routing: same customer always gets same model."""
h = int(hashlib.md5(customer_id.encode()).hexdigest(), 16)
return "A" if (h % 100) < (split_pct * 100) else "B"
# Simulate 2000 A/B test requests with real outcomes (after 30 days)
ab_results = []
for i in range(2000):
customer_id = f"cust_{i:05d}"
group = ab_route(customer_id)
model = model_a if group == "A" else model_b
row = X_test.iloc[i % len(X_test)]
prob = model.predict_proba(pd.DataFrame([row]))[0, 1]
actual = y_test.iloc[i % len(y_test)]
ab_results.append({"group": group, "pred_prob": prob, "actual": actual})
ab_df = pd.DataFrame(ab_results)
auc_a = roc_auc_score(ab_df[ab_df["group"]=="A"]["actual"], ab_df[ab_df["group"]=="A"]["pred_prob"])
auc_b = roc_auc_score(ab_df[ab_df["group"]=="B"]["actual"], ab_df[ab_df["group"]=="B"]["pred_prob"])
print(f"\nA/B Test Results:")
print(f" Group A (n={ab_df['group'].eq('A').sum()}): AUC = {auc_a:.4f}")
print(f" Group B (n={ab_df['group'].eq('B').sum()}): AUC = {auc_b:.4f}")
print(f" Lift: {(auc_b - auc_a)*100:.2f}% AUC points")
# STATISTICAL SIGNIFICANCE TEST: Mann-Whitney U on the two groups' predicted scores.
# Note: this tests whether A and B produce different score distributions, not whether
# their AUCs differ; for a direct AUC comparison use a bootstrap or DeLong test.
probs_a_ab = ab_df[ab_df["group"]=="A"]["pred_prob"].values
probs_b_ab = ab_df[ab_df["group"]=="B"]["pred_prob"].values
u_stat, p_value = mannwhitneyu(probs_b_ab, probs_a_ab, alternative="two-sided")
significant = p_value < 0.05
print(f" Mann-Whitney U p-value: {p_value:.4f}")
print(f" Statistically significant (alpha=0.05): {significant}")
if significant and auc_b > auc_a:
print(" DECISION: Promote Model B to production!")
else:
print(" DECISION: Keep Model A (insufficient evidence to switch)")Tip
Tip
Practice A/B testing and shadow-mode evaluation in small, isolated examples before integrating them into larger projects. Breaking concepts into small experiments builds genuine understanding faster than reading alone.
Practice Task
(1) Write a working example of A/B testing for evaluating models in production from scratch without looking at notes. (2) Modify it to handle an edge case (empty input, null value, or error state). (3) Share your solution in the Priygop community for feedback.
Warning
A common mistake with A/B testing in production is skipping edge case testing — empty inputs, null values, and unexpected data types. Always validate boundary conditions to write robust, production-ready ML code.
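As a concrete instance of that advice, here is a minimal, hypothetical guard around the ab_route function from the listing above; the fall-back-to-A behavior on bad input is an assumption, not a universal rule.
def ab_route_safe(customer_id: str, split_pct: float = 0.5) -> str:
    """Hypothetical wrapper: validate inputs before routing."""
    if not isinstance(customer_id, str) or not customer_id:
        return "A"  # assumed policy: serve the incumbent model on bad input
    if not 0.0 <= split_pct <= 1.0:
        raise ValueError(f"split_pct must be in [0, 1], got {split_pct}")
    return ab_route(customer_id, split_pct)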