Mini Project: Feature Engineering for Credit Risk
Apply feature engineering to a synthetic credit risk dataset: create a dozen engineered features (ratios, interactions, indicator flags, and a log transform), rank all features with SHAP, keep the top 12, and measure the AUC change relative to the raw feature baseline. This is the pattern used at banks and fintech companies for credit scoring models.
Credit Risk Feature Engineering Pipeline
import numpy as np
import pandas as pd
import shap
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import GradientBoostingClassifier
# Seed NumPy's global RNG so the synthetic dataset below is reproducible.
np.random.seed(42)
# Number of synthetic applicants (rows) to generate.
N = 5000
# RAW CREDIT DATASET
# Ten raw columns, each drawn independently from the global RNG and clipped to
# a bounded range. The draws happen in the order the columns are listed, so
# reordering them changes the values produced under the fixed seed.
df = pd.DataFrame({
"age": np.random.normal(40, 12, N).clip(18, 75),
"annual_income": np.random.exponential(55000, N).clip(15000, 300000),
"employment_years": np.random.exponential(8, N).clip(0, 40),
"num_credit_lines": np.random.poisson(3, N).clip(0, 15),
"credit_limit": np.random.exponential(10000, N).clip(500, 100000),
"credit_balance": np.random.exponential(3000, N).clip(0, 50000),
# Late-payment counts 0-9 sampled with the listed probabilities (0 has weight 0.5).
"num_late_payments":np.random.choice(range(10), N, p=[0.5,0.2,0.1,0.07,0.05,0.03,0.02,0.01,0.01,0.01]),
"loan_amount": np.random.exponential(20000, N).clip(1000, 100000),
"loan_term": np.random.choice([12, 24, 36, 48, 60], N),
"num_inquiries": np.random.poisson(1, N).clip(0, 10),
})
# Target: default driven by debt ratios and late payments
# A row is labelled 1 when ANY of these holds: credit utilization above 0.8,
# loan-to-income above 0.6, more than 3 late payments, or a uniform random
# draw below 0.05 (label noise unrelated to the features).
df["default"] = (
(df["credit_balance"] / df["credit_limit"].clip(1) > 0.8) |
(df["loan_amount"] / df["annual_income"] > 0.6) |
(df["num_late_payments"] > 3) |
(np.random.uniform(0, 1, N) < 0.05)
).astype(int)
# Separate the raw feature matrix from the binary label.
X_raw = df.drop("default", axis=1)
y = df["default"]
# 80/20 train/test split, stratified on the label so both parts keep the same default rate.
X_train, X_test, y_train, y_test = train_test_split(X_raw, y, test_size=0.2, random_state=42, stratify=y)
# BASELINE AUC (raw features)
# Mean ROC AUC over 5 cross-validation folds of the training split only.
base_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
base_auc = cross_val_score(base_model, X_train, y_train, cv=5, scoring="roc_auc").mean()
print(f"Baseline AUC (raw features): {base_auc:.4f}")
# FEATURE ENGINEERING
def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with twelve engineered credit-risk columns appended.

    The input frame is not modified. It must contain the raw columns
    ``credit_balance``, ``credit_limit``, ``loan_amount``, ``annual_income``,
    ``loan_term``, ``num_late_payments``, ``num_credit_lines``, ``age``,
    ``employment_years`` and ``num_inquiries``; a missing column raises
    ``KeyError``.

    Every column used as a divisor (``credit_limit``, ``annual_income``,
    ``loan_term``, ``age``) is floored at 1 before dividing, so a zero in the
    input yields a large finite ratio instead of ``inf``/``NaN``.

    Args:
        df: Raw applicant features, one row per applicant.

    Returns:
        A new DataFrame holding the original columns followed by the
        engineered ones, in the order they are created below.
    """
    df = df.copy()

    # Floor the denominators once, up front. Values >= 1 pass through
    # unchanged, so results for ordinary data are unaffected.
    credit_limit = df["credit_limit"].clip(lower=1)
    annual_income = df["annual_income"].clip(lower=1)
    loan_term = df["loan_term"].clip(lower=1)
    age = df["age"].clip(lower=1)
    # +1 keeps the per-line ratios defined for applicants with zero credit lines.
    lines_plus_one = df["num_credit_lines"] + 1

    # Credit utilization (key credit-risk metric)
    df["credit_utilization"] = df["credit_balance"] / credit_limit

    # Debt-to-income ratios
    df["loan_to_income"] = df["loan_amount"] / annual_income
    df["total_debt_to_income"] = (df["loan_amount"] + df["credit_balance"]) / annual_income

    # Monthly payment burden: instalment size relative to monthly income
    df["monthly_payment"] = df["loan_amount"] / loan_term
    df["payment_to_income"] = df["monthly_payment"] / (annual_income / 12)

    # Late payment indicators
    df["has_late_payment"] = (df["num_late_payments"] > 0).astype(int)
    df["chronic_late"] = (df["num_late_payments"] > 3).astype(int)

    # Credit profile
    df["credit_per_line"] = df["credit_limit"] / lines_plus_one
    df["avg_balance_per_line"] = df["credit_balance"] / lines_plus_one

    # Age-income interaction
    df["income_per_age"] = df["annual_income"] / age

    # Stability features: log1p compresses the long right tail and maps 0 years to 0
    df["employment_stability"] = np.log1p(df["employment_years"])

    # Inquiry flag
    df["high_inquiry"] = (df["num_inquiries"] > 3).astype(int)

    return df
# Apply the same transformation to both splits so their columns line up.
X_train_fe = engineer_features(X_train)
X_test_fe = engineer_features(X_test)

# AUC WITH ENGINEERED FEATURES
# Same estimator settings and 5-fold CV as the baseline, so the two scores are comparable.
fe_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
fe_auc = cross_val_score(fe_model, X_train_fe, y_train, cv=5, scoring="roc_auc").mean()
# The "+" format flag prints the sign for gains and losses alike; a literal "+"
# in front of the field would render a drop as "+-0.0010".
print(f"Engineered features AUC: {fe_auc:.4f} ({fe_auc - base_auc:+.4f})")

# SHAP-BASED FEATURE IMPORTANCE
xgb_fe = xgb.XGBClassifier(n_estimators=200, learning_rate=0.05, verbosity=0, random_state=42)
xgb_fe.fit(X_train_fe, y_train)
explainer = shap.TreeExplainer(xgb_fe)
# Rank features on the training rows only: using the test split here would
# let held-out data influence which features are selected.
shap_vals = explainer.shap_values(X_train_fe)
mean_shap = pd.DataFrame({"Feature": X_train_fe.columns, "Mean|SHAP|": np.abs(shap_vals).mean(axis=0)})
print("\nTop 10 features by SHAP importance:")
print(mean_shap.nlargest(10, "Mean|SHAP|").to_string(index=False))

# SELECT TOP-12 AND COMPARE
top12 = mean_shap.nlargest(12, "Mean|SHAP|")["Feature"].tolist()
final_auc = cross_val_score(fe_model, X_train_fe[top12], y_train, cv=5, scoring="roc_auc").mean()
print(f"\nFinal model (top 12 SHAP features): AUC = {final_auc:.4f}")
print(f"Total AUC improvement over baseline: {final_auc - base_auc:+.4f}")

# HELD-OUT CHECK
# The cross-validated score above reuses the rows that chose the features, so
# also score once on the test split, which took no part in fitting or selection.
final_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
final_model.fit(X_train_fe[top12], y_train)
test_auc = roc_auc_score(y_test, final_model.predict_proba(X_test_fe[top12])[:, 1])
print(f"Held-out test AUC (top 12 SHAP features): {test_auc:.4f}")
Tip
Practice the feature engineering techniques from this mini project in small, isolated examples before integrating them into larger projects. Breaking concepts into small experiments builds genuine understanding faster than reading alone.
Feature engineering = 80% of ML success.
Practice Task
Note
Practice Task — (1) Write a working version of this credit risk feature engineering mini project from scratch without looking at your notes. (2) Modify it to handle an edge case (empty input, null values, or an error state). (3) Share your solution in the Priygop community for feedback.
Quick Quiz
Common Mistake
Warning
A common mistake in feature engineering projects like this one is skipping edge-case testing — empty inputs, null values, and unexpected data types. Always validate boundary conditions to write robust, production-ready ML code.