Mini Project: Feature Engineering for Credit Risk

Apply comprehensive feature engineering to a credit risk dataset: create 20+ engineered features (ratios, interactions, bins, aggregations), select the best using SHAP, build a pipeline, and measure the AUC improvement over the raw feature baseline. This is the pattern used at banks and fintech companies for credit scoring models.

45 min • By Priygop Team • Updated 2026

Credit Risk Feature Engineering Pipeline

import numpy as np
import pandas as pd
import shap
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import GradientBoostingClassifier

# Seed NumPy's global RNG so every run draws the same synthetic applicants.
np.random.seed(42)
N = 5000

# RAW CREDIT DATASET
# All draws share the seeded global RNG stream, so the columns are generated
# in a fixed order; reordering the draws would change the generated values.
late_payment_probs = [0.5, 0.2, 0.1, 0.07, 0.05, 0.03, 0.02, 0.01, 0.01, 0.01]
raw_columns = {}
raw_columns["age"] = np.random.normal(40, 12, N).clip(18, 75)
raw_columns["annual_income"] = np.random.exponential(55000, N).clip(15000, 300000)
raw_columns["employment_years"] = np.random.exponential(8, N).clip(0, 40)
raw_columns["num_credit_lines"] = np.random.poisson(3, N).clip(0, 15)
raw_columns["credit_limit"] = np.random.exponential(10000, N).clip(500, 100000)
raw_columns["credit_balance"] = np.random.exponential(3000, N).clip(0, 50000)
raw_columns["num_late_payments"] = np.random.choice(range(10), N, p=late_payment_probs)
raw_columns["loan_amount"] = np.random.exponential(20000, N).clip(1000, 100000)
raw_columns["loan_term"] = np.random.choice([12, 24, 36, 48, 60], N)
raw_columns["num_inquiries"] = np.random.poisson(1, N).clip(0, 10)
df = pd.DataFrame(raw_columns)

# Target: an applicant defaults when any single rule fires — high card
# utilization, a large loan relative to income, more than three late
# payments — or, independently of the rules, with 5% probability (label noise).
over_utilized = df["credit_balance"] / df["credit_limit"].clip(1) > 0.8
over_leveraged = df["loan_amount"] / df["annual_income"] > 0.6
chronically_late = df["num_late_payments"] > 3
random_default = np.random.uniform(0, 1, N) < 0.05
df["default"] = (over_utilized | over_leveraged | chronically_late | random_default).astype(int)

# Feature matrix (every column except the label) and the label vector.
X_raw = df.drop(columns="default")
y = df["default"]

# Hold out 20% of the rows; stratify on the label so both partitions are
# split with respect to the class balance of ``y``.
X_train, X_test, y_train, y_test = train_test_split(
    X_raw,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# BASELINE AUC (raw features): mean ROC AUC over 5 cross-validation folds
# of the training partition, using only the untouched raw columns.
base_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
base_cv_scores = cross_val_score(base_model, X_train, y_train, cv=5, scoring="roc_auc")
base_auc = base_cv_scores.mean()
print(f"Baseline AUC (raw features): {base_auc:.4f}")

# FEATURE ENGINEERING
def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with twelve derived credit-risk columns appended.

    The input frame is not modified. The new columns are appended after the
    existing ones, in this order: ``credit_utilization``, ``loan_to_income``,
    ``total_debt_to_income``, ``monthly_payment``, ``payment_to_income``,
    ``has_late_payment``, ``chronic_late``, ``credit_per_line``,
    ``avg_balance_per_line``, ``income_per_age``, ``employment_stability``,
    ``high_inquiry``.

    Every column used as a denominator (``credit_limit``, ``annual_income``,
    ``loan_term``, ``age``) is floored at 1 before dividing, so a zero in the
    input produces a large finite ratio instead of ``inf``/``NaN``. Rows whose
    denominators are already >= 1 are unaffected by the floor.

    Args:
        df: Frame containing at least the columns ``age``, ``annual_income``,
            ``employment_years``, ``num_credit_lines``, ``credit_limit``,
            ``credit_balance``, ``num_late_payments``, ``loan_amount``,
            ``loan_term`` and ``num_inquiries``.

    Returns:
        A new DataFrame holding the original columns plus the derived ones.

    Raises:
        KeyError: If any required input column is absent.
    """
    required = (
        "age", "annual_income", "employment_years", "num_credit_lines",
        "credit_limit", "credit_balance", "num_late_payments", "loan_amount",
        "loan_term", "num_inquiries",
    )
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"engineer_features: missing required column(s): {missing}")

    out = df.copy()

    # Guard every divisor the same way: a zero denominator would otherwise
    # yield inf/NaN features without raising any error.
    credit_limit = out["credit_limit"].clip(lower=1)
    annual_income = out["annual_income"].clip(lower=1)
    loan_term = out["loan_term"].clip(lower=1)
    age = out["age"].clip(lower=1)

    # Credit utilization (key credit-risk metric)
    out["credit_utilization"] = out["credit_balance"] / credit_limit
    # Debt-to-income ratios
    out["loan_to_income"] = out["loan_amount"] / annual_income
    out["total_debt_to_income"] = (out["loan_amount"] + out["credit_balance"]) / annual_income
    # Monthly payment burden: principal spread evenly over the term (no
    # interest), compared with monthly income.
    out["monthly_payment"] = out["loan_amount"] / loan_term
    out["payment_to_income"] = out["monthly_payment"] / (annual_income / 12)
    # Late payment indicators
    out["has_late_payment"] = (out["num_late_payments"] > 0).astype(int)
    out["chronic_late"] = (out["num_late_payments"] > 3).astype(int)
    # Credit profile; "+ 1" keeps the ratio defined for applicants with no lines.
    out["credit_per_line"] = out["credit_limit"] / (out["num_credit_lines"] + 1)
    out["avg_balance_per_line"] = out["credit_balance"] / (out["num_credit_lines"] + 1)
    # Age-income interaction
    out["income_per_age"] = out["annual_income"] / age
    # Stability feature: log1p compresses the long right tail of tenure.
    out["employment_stability"] = np.log1p(out["employment_years"])
    # Inquiry flag
    out["high_inquiry"] = (out["num_inquiries"] > 3).astype(int)
    return out

# Apply the same transformation to both partitions so their columns match.
X_train_fe = engineer_features(X_train)
X_test_fe  = engineer_features(X_test)

# AUC WITH ENGINEERED FEATURES
# Same estimator settings and CV protocol as the baseline, so the two means
# are directly comparable.
fe_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
fe_auc = cross_val_score(fe_model, X_train_fe, y_train, cv=5, scoring="roc_auc").mean()
# ":+.4f" lets the format spec emit the sign; a hard-coded "+" would print
# "+-0.0123" whenever the engineered features score below the baseline.
print(f"Engineered features AUC:     {fe_auc:.4f}  ({fe_auc - base_auc:+.4f})")

# SHAP-BASED FEATURE IMPORTANCE
# Fit a separate XGBoost classifier on the engineered training features and
# explain its predictions on the engineered test rows.
xgb_fe = xgb.XGBClassifier(
    n_estimators=200,
    learning_rate=0.05,
    verbosity=0,
    random_state=42,
)
xgb_fe.fit(X_train_fe, y_train)
explainer = shap.TreeExplainer(xgb_fe)
shap_vals = explainer.shap_values(X_test_fe)

# Importance of a feature = mean absolute SHAP value over the explained rows.
# NOTE(review): assumes shap_vals is a 2-D (rows x features) array for this
# binary model — confirm against the installed shap version.
mean_abs_shap = np.abs(shap_vals).mean(axis=0)
mean_shap = pd.DataFrame({"Feature": X_test_fe.columns, "Mean|SHAP|": mean_abs_shap})

top10_table = mean_shap.nlargest(10, "Mean|SHAP|").to_string(index=False)
print("\nTop 10 features by SHAP importance:")
print(top10_table)

# SELECT TOP-12 AND COMPARE
# NOTE(review): the ranking in mean_shap comes from a model fitted on all of
# X_train_fe, so the folds scored below are not independent of the selection
# step and this CV estimate may be optimistic — consider confirming on the
# held-out test partition.
top12 = mean_shap.nlargest(12, "Mean|SHAP|")["Feature"].tolist()
final_auc = cross_val_score(fe_model, X_train_fe[top12], y_train, cv=5, scoring="roc_auc").mean()
print(f"\nFinal model (top 12 SHAP features): AUC = {final_auc:.4f}")
# ":+.4f" emits the correct sign; a hard-coded "+" would print "+-0.0123"
# if the reduced model scores below the baseline.
print(f"Total AUC improvement over baseline: {final_auc - base_auc:+.4f}")

Tip

Practice the techniques from this mini project — feature engineering for credit risk — in small, isolated examples before integrating them into larger projects. Breaking concepts into small experiments builds genuine understanding faster than reading alone.

Diagram

Loading diagram…

Feature engineering = 80% of ML success.

Practice Task

Note

Practice Task — (1) Write a working version of this credit-risk feature engineering mini project from scratch without looking at notes. (2) Modify it to handle an edge case (empty input, null value, or error state). (3) Share your solution in the Priygop community for feedback.

Quick Quiz

Common Mistake

Warning

A common mistake in feature engineering for credit risk is skipping edge-case testing — empty inputs, null values, and unexpected data types. Always validate boundary conditions to write robust, production-ready ML code.

Topics in This Module