Mini Project: Customer Segmentation
Build a complete customer segmentation system for an e-commerce company using RFM (Recency, Frequency, Monetary) feature engineering and K-Means clustering. Derive business-interpretable segments (Champions, Loyal, Potential Loyalists, At Risk, Lost) and build a real-time segment scoring function.
RFM Customer Segmentation with K-Means
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
# Seed NumPy's global generator so every run simulates the same customers.
np.random.seed(42)
N = 2000

# SIMULATE E-COMMERCE TRANSACTION DATA
# The draws happen in a fixed order (purchase gaps, then order counts, then
# spend) so the seeded random stream always produces the same table.
today = pd.Timestamp("2024-01-01")
days_since_purchase = [np.random.exponential(90) for _ in range(N)]
order_counts = np.random.negative_binomial(3, 0.3, N).clip(1, 50)
spend_totals = np.random.exponential(250, N).clip(10, 5000).round(2)

df = pd.DataFrame(
    {
        "customer_id": range(1, N + 1),
        # Snapshot date minus a random gap gives each customer's last purchase.
        "last_purchase": [today - pd.Timedelta(days=gap) for gap in days_since_purchase],
        "num_orders": order_counts,
        "total_spent": spend_totals,
    }
)
# RFM FEATURE ENGINEERING
# Recency is the number of whole days between the snapshot date and the last
# purchase, so a lower value means a more recent customer.
time_since_purchase = today - df["last_purchase"]
df["recency"] = time_since_purchase.dt.days
# Frequency and monetary value are the order count and total spend as-is.
df["frequency"] = df["num_orders"]
df["monetary"] = df["total_spent"]

rfm_columns = ["recency", "frequency", "monetary"]
# Work on a copy so later segment columns do not touch the transaction table.
rfm = df[["customer_id", *rfm_columns]].copy()

print("RFM Summary:")
print(rfm[rfm_columns].describe().round(1))
# PREPARE FOR CLUSTERING
# Standardise the three features so that none of them dominates the distance
# computation merely because of its units.
rfm_features = rfm[["recency", "frequency", "monetary"]]
scaler = StandardScaler()
rfm_scaled = scaler.fit(rfm_features).transform(rfm_features)
# SELECT OPTIMAL K
# Fit one model per candidate cluster count (2 through 8) and record both the
# inertia and the silhouette score of the resulting assignment.
k_range = range(2, 9)
inertias = []
silhouette_vals = []
for n_clusters in k_range:
    candidate = KMeans(n_clusters=n_clusters, n_init=20, random_state=42).fit(rfm_scaled)
    inertias.append(candidate.inertia_)
    silhouette_vals.append(silhouette_score(rfm_scaled, candidate.labels_))

print("\nSilhouette scores by K:")
for n_clusters, score in zip(k_range, silhouette_vals):
    print(f" K={n_clusters}: {score:.4f}")

# The highest silhouette wins; argmax returns the first maximum, so a tie is
# resolved in favour of the smaller K.
best_position = int(np.argmax(silhouette_vals))
best_k = k_range[best_position]
print(f"Best K: {best_k}")
# FIT FINAL MODEL
# Refit with the selected cluster count and store each customer's cluster id.
km_final = KMeans(n_clusters=best_k, n_init=20, random_state=42).fit(rfm_scaled)
rfm["segment"] = km_final.labels_

# INTERPRET SEGMENTS
# One row per cluster: the mean recency, frequency and monetary value of its
# members, in the original (unscaled) units.
cluster_means = rfm.groupby("segment")[["recency", "frequency", "monetary"]].mean()
segment_profile = cluster_means.round(1)
print("\nSegment Profiles:")
print(segment_profile)
# LABEL SEGMENTS BASED ON PROFILE
def label_segment(row: pd.Series) -> str:
    """Translate an RFM profile into a business segment name.

    Args:
        row: Mapping-like object exposing ``"recency"``, ``"frequency"`` and
            ``"monetary"`` values.

    Returns:
        One of ``"Champions"``, ``"Loyal"``, ``"At Risk"``, ``"Lost"`` or
        ``"Potential Loyalists"``.

    Rules are checked in a fixed precedence. In particular the "At Risk" rule
    (recency above 120 with frequency below 3) is tested before the "Lost"
    rule (recency above 180), so a long-inactive, low-frequency profile is
    reported as "At Risk".
    """
    recency = row["recency"]

    if recency < 60:
        frequency = row["frequency"]
        if recency < 30 and frequency > 10 and row["monetary"] > 400:
            return "Champions"
        if frequency > 5:
            return "Loyal"
        # A recent profile can never satisfy the "At Risk" or "Lost" rules.
        return "Potential Loyalists"

    if recency > 120:
        if row["frequency"] < 3:
            return "At Risk"
        if recency > 180:
            return "Lost"

    return "Potential Loyalists"
# Name every cluster from its mean RFM profile. The names come from fixed
# thresholds, so two clusters can end up sharing the same name.
cluster_labels = segment_profile.apply(label_segment, axis=1)
segment_profile["label"] = cluster_labels
print("\nSegment Labels:")
print(segment_profile[["recency", "frequency", "monetary", "label"]])

# MERGE LABELS BACK
# label_map goes from cluster id to segment name; map it onto every customer.
label_map = cluster_labels.to_dict()
rfm["segment_label"] = rfm["segment"].map(label_map)

print("\nCustomer distribution by segment:")
segment_counts = rfm["segment_label"].value_counts()
print(segment_counts.to_string())
# 2D VISUALIZATION VIA PCA
# Project the three standardised RFM features onto two principal components
# so the clusters can be drawn on a flat scatter plot.
pca = PCA(n_components=2).fit_transform(rfm_scaled)  # projected coordinates, one row per customer
pc1, pc2 = pca[:, 0], pca[:, 1]

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
segment_ax, monetary_ax = axes

# Left panel: colour each customer by cluster id and write the cluster's
# segment name at the mean position of its members.
segment_ax.scatter(pc1, pc2, c=rfm["segment"], cmap="tab10", s=20, alpha=0.6)
for seg_id in rfm["segment"].unique():
    members = (rfm["segment"] == seg_id).to_numpy()
    segment_ax.annotate(
        label_map.get(seg_id, str(seg_id)),
        (pc1[members].mean(), pc2[members].mean()),
        fontsize=9,
        fontweight="bold",
        bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.7),
    )
segment_ax.set_title("K-Means Cluster")

# Right panel: same projection, coloured by total spend, with a colour bar.
monetary_points = monetary_ax.scatter(
    pc1, pc2, c=rfm["monetary"], cmap="YlOrRd", s=20, alpha=0.6
)
plt.colorbar(monetary_points, ax=monetary_ax, label="Monetary ($)")
monetary_ax.set_title("Monetary Value")

# Both panels share the same axis meaning.
for ax in axes:
    ax.set_xlabel("PCA Component 1")
    ax.set_ylabel("PCA Component 2")

plt.tight_layout()
# Save before show(): the figure is written to disk regardless of how the
# interactive window is handled afterwards.
plt.savefig("customer_segments.png", dpi=100, bbox_inches="tight")
plt.show()
# SEGMENT SCORING FUNCTION
def score_customer(recency_days: int, num_orders: int, total_spent: float) -> dict:
    """Assign one customer to a fitted segment and suggest a marketing action.

    The raw values are standardised with the module-level ``scaler``, assigned
    to a cluster by ``km_final``, and translated to a segment name through
    ``label_map``.

    Args:
        recency_days: Days since the customer's last purchase.
        num_orders: Number of orders the customer has placed.
        total_spent: Total amount the customer has spent.

    Returns:
        A dict with two keys: ``"segment"`` (the segment name, or
        ``"Segment <id>"`` when the cluster has no entry in ``label_map``) and
        ``"action"`` (the recommended action, ``"Monitor"`` when the segment
        has no specific playbook).
    """
    # The scaler was fitted on a DataFrame, so give it a frame with the same
    # column names; a bare nested list makes scikit-learn warn about missing
    # feature names on every call.
    sample = pd.DataFrame(
        [[recency_days, num_orders, total_spent]],
        columns=["recency", "frequency", "monetary"],
    )
    scaled_sample = scaler.transform(sample)

    # predict() returns a NumPy integer; convert it to a plain int for the
    # dictionary lookup and the fallback name.
    segment_id = int(km_final.predict(scaled_sample)[0])
    segment_name = label_map.get(segment_id, f"Segment {segment_id}")

    actions = {
        "Champions": "Send VIP exclusive offers and early access",
        "Loyal": "Reward with loyalty points, upsell premium products",
        "At Risk": "Send win-back campaign with 20% discount",
        "Lost": "Last-chance email campaign or remove from active",
        "Potential Loyalists": "Nurture with onboarding sequence",
    }
    return {"segment": segment_name, "action": actions.get(segment_name, "Monitor")}
print("\nSample predictions:")
# (recency_days, num_orders, total_spent) for two contrasting customers.
sample_customers = [
    (15, 25, 1200),  # recent, frequent, high value
    (180, 2, 45),  # old, rare, low value
]
for recency_days, num_orders, total_spent in sample_customers:
    print(score_customer(recency_days, num_orders, total_spent))
Tip
Practice the customer segmentation mini project in small, isolated examples before integrating it into larger projects. Breaking concepts into small experiments builds genuine understanding faster than reading alone.
Neural networks learn by adjusting connection weights via backpropagation
Practice Task
Note
Practice Task — (1) Write a working version of the customer segmentation mini project from scratch without looking at your notes. (2) Modify it to handle an edge case (empty input, null value, or error state). (3) Share your solution in the Priygop community for feedback.
Quick Quiz
Common Mistake
Warning
A common mistake in the customer segmentation mini project is skipping edge-case testing — empty inputs, null values, and unexpected data types. Always validate boundary conditions to write robust, production-ready ML code.