Mini Project: Customer Churn Prediction
Build a production-ready customer churn prediction system for a telecom company using LightGBM. Includes: EDA, feature engineering, LightGBM with early stopping, Optuna hyperparameter tuning, SHAP-based business insights, and a real-time scoring function. Customer churn prediction is worth millions in recovered revenue for telecom companies.
Telecom Churn Prediction with LightGBM + SHAP
import numpy as np
import pandas as pd
import lightgbm as lgb
import shap
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, roc_auc_score, f1_score
import matplotlib.pyplot as plt
# Fix the global NumPy RNG so the synthetic dataset is reproducible.
# NOTE: every np.random.* call below consumes this stream in order, so
# the sequence of calls must not be reordered.
np.random.seed(42)
N = 5000  # number of simulated customers

# SIMULATE TELECOM CHURN DATASET
df = pd.DataFrame({
    # months with the company; exponential skew, clipped to 1-72
    "tenure": np.random.exponential(30, N).clip(1, 72).round(),
    # monthly bill in dollars, clipped to a realistic 20-120 range
    "monthly_charges": np.random.normal(65, 30, N).clip(20, 120).round(2),
    "total_charges": None,  # will compute
    # number of subscribed services, 1-8 (randint upper bound exclusive)
    "num_services": np.random.randint(1, 9, N),
    "contract": np.random.choice(["monthly", "one_year", "two_year"], N, p=[0.55, 0.24, 0.21]),
    "payment_method": np.random.choice(["electronic", "credit_card", "bank", "mailed"], N, p=[0.33, 0.22, 0.22, 0.23]),
    "senior_citizen": np.random.choice([0, 1], N, p=[0.84, 0.16]),
    # small counts of support contacts; mean 0.8 per customer
    "support_tickets": np.random.poisson(0.8, N),
})
# total_charges ~ tenure * monthly bill plus Gaussian noise, floored at 0
df["total_charges"] = (df["tenure"] * df["monthly_charges"] + np.random.normal(0, 50, N)).clip(0)
# Churn: more likely for monthly contract, high charges, low tenure
# (weighted sum of risk indicators plus uniform noise, clipped to [0, 1])
churn_prob = (
    0.4 * (df["contract"] == "monthly") +
    0.2 * (df["monthly_charges"] > 80) +
    0.3 * (df["tenure"] < 12) +
    0.1 * (df["support_tickets"] > 2) +
    np.random.uniform(0, 0.3, N) - 0.2
).clip(0, 1)
# Binarize at 0.45: scores above the threshold are labeled as churners.
df["Churn"] = (churn_prob > 0.45).astype(int)
print(f"Dataset: {df.shape} | Churn rate: {df['Churn'].mean():.1%}")
# FEATURE ENGINEERING
# Ratio features plus a binary flag for heavy support usage.
df["charges_per_service"] = df["monthly_charges"].div(df["num_services"])
df["avg_tenure_charge"] = df["total_charges"].div(df["tenure"] + 1)
df["high_support"] = df["support_tickets"].gt(2).astype(int)

# ENCODE CATEGORICALS (LightGBM handles this natively, but for demo)
_contract_order = {"monthly": 0, "one_year": 1, "two_year": 2}
df["contract_code"] = df["contract"].map(_contract_order)

feature_cols = [
    "tenure", "monthly_charges", "total_charges", "num_services", "senior_citizen",
    "support_tickets", "contract_code", "charges_per_service", "avg_tenure_charge", "high_support",
]
X, y = df[feature_cols], df["Churn"]

# Hold out 20% as the final test set, then carve 15% of the remainder for
# validation; stratify both splits so each partition keeps the churn rate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.15, random_state=42, stratify=y_train)
# LIGHTGBM WITH EARLY STOPPING
# 1000 trees is only an upper bound: early stopping on the validation set
# selects the actual number of boosting rounds.
_lgb_params = dict(
    n_estimators=1000,
    learning_rate=0.05,
    num_leaves=31,
    is_unbalance=True,  # handles class imbalance
    random_state=42,
    verbose=-1,
)
lgb_model = lgb.LGBMClassifier(**_lgb_params)
lgb_model.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    callbacks=[lgb.early_stopping(30, verbose=False)],  # stop after 30 stale rounds
)
# Positive-class probabilities on the held-out test set.
y_prob = lgb_model.predict_proba(X_test)[:, 1]
print(f"\nLightGBM Test AUC: {roc_auc_score(y_test, y_prob):.4f}")
print(f"Best iteration: {lgb_model.best_iteration_}")
# SHAP INSIGHTS
explainer = shap.TreeExplainer(lgb_model)
shap_values = explainer.shap_values(X_test)
# Older shap versions return one array per class for binary classifiers;
# keep the positive (churn) class in that case.
if isinstance(shap_values, list):
    shap_values = shap_values[1]

# Rank features by mean absolute SHAP contribution across the test set.
_mean_abs = np.abs(shap_values).mean(axis=0)
mean_shap = (
    pd.DataFrame({"Feature": feature_cols, "Mean |SHAP|": _mean_abs})
    .sort_values("Mean |SHAP|", ascending=False)
)
print("\nTop Churn Drivers (SHAP feature importance):")
for _, row in mean_shap.head(5).iterrows():
    print(f" {row['Feature']:28s}: {row['Mean |SHAP|']:.4f}")
# CHURN SCORING FUNCTION (for production)
def churn_score(customer: dict) -> dict:
    """Score a single customer and map churn probability to a retention action.

    Args:
        customer: Mapping with one value for every name in ``feature_cols``;
            missing keys raise ``KeyError``, extra keys are ignored.

    Returns:
        Dict with ``churn_probability`` (plain Python float rounded to 3
        decimals), ``risk_tier`` ("HIGH"/"MEDIUM"/"LOW"), and a recommended
        ``action`` string.
    """
    sample = pd.DataFrame([customer])[feature_cols]
    # float(...) converts the numpy.float64 scalar to a builtin float so the
    # result is JSON-serializable when returned from a scoring API.
    prob = float(lgb_model.predict_proba(sample)[0, 1])
    tier = "HIGH" if prob > 0.6 else "MEDIUM" if prob > 0.3 else "LOW"
    action = (
        "Call retention team" if tier == "HIGH"
        else "Send offer" if tier == "MEDIUM"
        else "Monitor"
    )
    return {"churn_probability": round(prob, 3), "risk_tier": tier, "action": action}
# Example: a high-risk profile — short tenure, monthly contract, elevated
# charges, and multiple support tickets.
# (Fix: removed stray "Tip" text fused onto the final print line, which made
# the file a SyntaxError.)
customer_example = {"tenure": 6, "monthly_charges": 85, "total_charges": 510, "num_services": 2,
                    "senior_citizen": 0, "support_tickets": 3, "contract_code": 0,
                    "charges_per_service": 42.5, "avg_tenure_charge": 73, "high_support": 1}
result = churn_score(customer_example)
print(f"\nChurn Score: {result}")
Tip: Practice the Customer Churn Prediction mini project in small, isolated examples before integrating it into larger projects. Breaking concepts into small experiments builds genuine understanding faster than reading alone.
Gradient-boosted models like LightGBM learn by sequentially adding decision trees, each one fitted to correct the residual errors of the current ensemble.
Practice Task — (1) Write a working example of the Customer Churn Prediction mini project from scratch without looking at notes. (2) Modify it to handle an edge case (empty input, null value, or error state). (3) Share your solution in the Priygop community for feedback.
Quick Quiz
Common Mistake (Warning): A common mistake with the Customer Churn Prediction mini project is skipping edge-case testing — empty inputs, null values, and unexpected data types. Always validate boundary conditions to write robust, production-ready ML code.