Mini Project: Building a Model Evaluation Dashboard
Build a comprehensive evaluation report for a credit default prediction model: the kind of artefact you'd present to a data science team lead or business stakeholder. The report covers cross-validated and test-set classification metrics, ROC and precision-recall curves, calibration, learning curves, threshold analysis, and a business impact simulation.
Complete Model Evaluation Report
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve, StratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import (confusion_matrix, roc_curve, roc_auc_score,
                             precision_recall_curve, average_precision_score,
                             brier_score_loss, f1_score, precision_score, recall_score)
from sklearn.calibration import CalibrationDisplay
np.random.seed(42)
X, y = make_classification(n_samples=10000, n_features=20, n_informative=10, weights=[0.88, 0.12], random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
MODEL_NAME = "Gradient Boosting Classifier"
model = GradientBoostingClassifier(n_estimators=200, learning_rate=0.05, max_depth=4, subsample=0.8, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]
print(f"===== EVALUATION REPORT: {MODEL_NAME} =====")
# --- SECTION 1: CROSS-VALIDATION ---
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
cv_auc = cross_val_score(model, X_train, y_train, cv=cv, scoring="roc_auc")
print(f"\n1. Cross-Validation (10-fold, training data only):")
print(f" AUC-ROC: {cv_auc.mean():.4f} +/- {cv_auc.std():.4f}")
# --- SECTION 2: TEST SET METRICS ---
print("\n2. Test Set Performance:")
metrics = {
    "AUC-ROC": roc_auc_score(y_test, y_prob),
    "Avg Precision": average_precision_score(y_test, y_prob),
    "Brier Score": brier_score_loss(y_test, y_prob),
    "F1 Score": f1_score(y_test, y_pred),
    "Precision": precision_score(y_test, y_pred),
    "Recall": recall_score(y_test, y_pred),
}
for name, val in metrics.items():
    print(f"   {name:18s}: {val:.4f}")
# --- SECTION 3: THRESHOLD ANALYSIS ---
thresholds = np.arange(0.1, 0.9, 0.05)
th_results = []
for t in thresholds:
    yp = (y_prob >= t).astype(int)
    th_results.append({
        "threshold": t,
        "precision": precision_score(y_test, yp, zero_division=0),
        "recall": recall_score(y_test, yp, zero_division=0),
        "f1": f1_score(y_test, yp, zero_division=0),
    })
th_df = pd.DataFrame(th_results)
best_row = th_df.loc[th_df["f1"].idxmax()]
print(f"\n3. Optimal Threshold (max F1): {best_row['threshold']:.2f}")
print(f" Precision: {best_row['precision']:.3f} | Recall: {best_row['recall']:.3f} | F1: {best_row['f1']:.3f}")
# --- SECTION 4: BUSINESS IMPACT SIMULATION ---
loan_value = 10000
false_neg_cost = loan_value * 0.8 # missed bad loan: lose 80% of loan
false_pos_cost = loan_value * 0.05 # rejected good customer: lose 5% in opportunity cost
print("\n4. Business Impact at optimal threshold:")
cm = confusion_matrix(y_test, (y_prob >= best_row["threshold"]).astype(int))
tn, fp, fn, tp = cm.ravel()
total_cost = fn * false_neg_cost + fp * false_pos_cost
print(f" False Negatives (missed defaults): {fn} x ${false_neg_cost:,.0f} = ${fn*false_neg_cost:,.0f}")
print(f" False Positives (rejected good): {fp} x ${false_pos_cost:,.0f} = ${fp*false_pos_cost:,.0f}")
print(f" Total estimated cost: ${total_cost:,.0f}")
print(f" Baseline (no model): ${y_test.sum() * false_neg_cost:,.0f}")
print(f" SAVINGS vs baseline: ${y_test.sum() * false_neg_cost - total_cost:,.0f}")
print("\nEvaluation report complete.")Tip
Tip
Practice building this evaluation dashboard in small, isolated examples before integrating it into larger projects. Breaking concepts into small experiments builds genuine understanding faster than reading alone.
Practice Task
(1) Write a working version of this evaluation dashboard from scratch, without looking at notes. (2) Modify it to handle an edge case (empty input, null value, or error state). (3) Share your solution in the Priygop community for feedback.
Common Mistake
A common mistake with this kind of evaluation dashboard is skipping edge-case testing: empty inputs, null values, and unexpected data types. Always validate boundary conditions to write robust, production-ready ML code.
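As a minimal illustration of such boundary checks, here is a sketch; the helper name safe_auc is hypothetical and not part of the project code:

def safe_auc(y_true, y_prob):
    # Hypothetical helper: guard against empty input, NaNs, and
    # single-class labels before computing AUC.
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)
    if y_true.size == 0:
        raise ValueError("y_true is empty; nothing to evaluate")
    if np.isnan(y_prob).any():
        raise ValueError("y_prob contains NaN values")
    if np.unique(y_true).size < 2:
        raise ValueError("y_true has a single class; AUC is undefined")
    return roc_auc_score(y_true, y_prob)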