Mini Project: Customer Segmentation

Build a complete customer segmentation system for an e-commerce company using RFM (Recency, Frequency, Monetary) feature engineering and K-Means clustering. Derive business-interpretable segments (Champions, Loyal, At-Risk, Lost) and build a real-time segment scoring function.

60 min•By Priygop Team•Updated 2026

RFM Customer Segmentation with K-Means

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA

# Seed NumPy's global RNG so the simulated customers are identical on every run.
np.random.seed(42)
N = 2000

# SIMULATE E-COMMERCE TRANSACTION DATA
# NOTE: the three random draws share one global RNG stream, so they must stay
# in this order (purchase gaps, then order counts, then spend) to reproduce
# the same synthetic customers.
today = pd.Timestamp("2024-01-01")
purchase_gaps = [pd.Timedelta(days=np.random.exponential(90)) for _ in range(N)]
order_counts = np.random.negative_binomial(3, 0.3, N).clip(1, 50)
spend_totals = np.random.exponential(250, N).clip(10, 5000).round(2)

df = pd.DataFrame({
    "customer_id": range(1, N + 1),
    "last_purchase": [today - gap for gap in purchase_gaps],
    "num_orders": order_counts,
    "total_spent": spend_totals,
})

# RFM FEATURE ENGINEERING
# recency   = whole days since the last purchase (lower = more recent)
# frequency = number of orders
# monetary  = total amount spent
df = df.assign(
    recency=(today - df["last_purchase"]).dt.days,
    frequency=df["num_orders"],
    monetary=df["total_spent"],
)

rfm = df.loc[:, ["customer_id", "recency", "frequency", "monetary"]].copy()

print("RFM Summary:")
print(rfm.drop(columns="customer_id").describe().round(1))

# PREPARE FOR CLUSTERING
# Standardise the three RFM columns so no single feature dominates the
# Euclidean distances K-Means works with.
rfm_features = rfm[["recency", "frequency", "monetary"]]
scaler = StandardScaler()
rfm_scaled = scaler.fit_transform(rfm_features)

# SELECT OPTIMAL K
# Fit one model per candidate K and record its inertia and silhouette score.
k_range = range(2, 9)
inertias = []
silhouette_vals = []

for n_clusters in k_range:
    candidate = KMeans(n_clusters=n_clusters, n_init=20, random_state=42).fit(rfm_scaled)
    inertias.append(candidate.inertia_)
    silhouette_vals.append(silhouette_score(rfm_scaled, candidate.labels_))

print("\nSilhouette scores by K:")
for n_clusters, score in zip(k_range, silhouette_vals):
    print(f"  K={n_clusters}: {score:.4f}")

# argmax returns the first maximum, so ties resolve to the smallest K.
best_index = int(np.argmax(silhouette_vals))
best_k = k_range[best_index]
print(f"Best K: {best_k}")

# FIT FINAL MODEL
# Refit with the chosen K and store each customer's cluster id on the RFM frame.
km_final = KMeans(n_clusters=best_k, n_init=20, random_state=42).fit(rfm_scaled)
rfm["segment"] = km_final.labels_

# INTERPRET SEGMENTS
# Per-cluster mean of each RFM feature, rounded to one decimal place.
per_segment = rfm.groupby("segment")[["recency", "frequency", "monetary"]]
segment_profile = per_segment.mean().round(1)
print("\nSegment Profiles:")
print(segment_profile)

# LABEL SEGMENTS BASED ON PROFILE
def label_segment(row: pd.Series) -> str:
    """Map an RFM profile to a business-facing segment label.

    Rules are evaluated top to bottom and the first match wins.

    Args:
        row: Mapping with ``"recency"`` (days since last purchase),
            ``"frequency"`` (order count) and ``"monetary"`` (spend) entries.

    Returns:
        One of ``"Champions"``, ``"Loyal"``, ``"Lost"``, ``"At Risk"`` or
        ``"Potential Loyalists"`` (the fallback when no rule matches).
    """
    if row["recency"] < 30 and row["frequency"] > 10 and row["monetary"] > 400:
        return "Champions"
    if row["recency"] < 60 and row["frequency"] > 5:
        return "Loyal"
    # "Lost" has to be tested before "At Risk". The At Risk rule matches every
    # low-frequency profile with recency > 120, so checking it first would
    # label the most dormant, least active profiles "At Risk" while equally
    # dormant but more active ones were labelled "Lost".
    if row["recency"] > 180:
        return "Lost"
    if row["recency"] > 120 and row["frequency"] < 3:
        return "At Risk"
    return "Potential Loyalists"

# Label each cluster from its mean RFM profile (one row per cluster id).
segment_profile["label"] = [
    label_segment(profile_row) for _, profile_row in segment_profile.iterrows()
]
print("\nSegment Labels:")
print(segment_profile.loc[:, ["recency", "frequency", "monetary", "label"]])

# MERGE LABELS BACK
# cluster id -> label lookup, then attach the label to every customer row.
label_map = segment_profile["label"].to_dict()
rfm["segment_label"] = rfm["segment"].map(label_map)

print("\nCustomer distribution by segment:")
segment_counts = rfm["segment_label"].value_counts()
print(segment_counts.to_string())

# 2D VISUALIZATION VIA PCA
# Project the scaled RFM features onto two principal components and draw two
# views of the same projection side by side.
pca = PCA(n_components=2).fit_transform(rfm_scaled)
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
cluster_ax, spend_ax = axes
pc1, pc2 = pca[:, 0], pca[:, 1]

# Left panel: points coloured by cluster id, with each cluster's label placed
# at the mean position of its projected points.
cluster_ax.scatter(pc1, pc2, c=rfm["segment"], cmap="tab10", s=20, alpha=0.6)
for seg_id in rfm["segment"].unique():
    in_segment = rfm["segment"] == seg_id
    centre = (pca[in_segment, 0].mean(), pca[in_segment, 1].mean())
    cluster_ax.annotate(
        label_map.get(seg_id, str(seg_id)),
        centre,
        fontsize=9,
        fontweight="bold",
        bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.7),
    )

# Right panel: the same points coloured by total spend, with a colour bar.
spend_points = spend_ax.scatter(pc1, pc2, c=rfm["monetary"], cmap="YlOrRd", s=20, alpha=0.6)
plt.colorbar(spend_points, ax=spend_ax, label="Monetary ($)")

for panel, panel_title in zip(axes, ("K-Means Cluster", "Monetary Value")):
    panel.set_title(panel_title)
    panel.set_xlabel("PCA Component 1")
    panel.set_ylabel("PCA Component 2")

plt.tight_layout()
plt.savefig("customer_segments.png", dpi=100, bbox_inches="tight")
plt.show()

# SEGMENT SCORING FUNCTION
def score_customer(recency_days: int, num_orders: int, total_spent: float) -> dict:
    """Assign one customer to a fitted segment and suggest a marketing action.

    Uses the module-level ``scaler``, ``km_final`` and ``label_map`` built
    earlier in this script.

    Args:
        recency_days: Days since the customer's last purchase.
        num_orders: Number of orders the customer has placed.
        total_spent: Total amount the customer has spent.

    Returns:
        A dict with two keys: ``"segment"`` is the segment label (or
        ``"Segment <id>"`` when the cluster id has no label) and ``"action"``
        is the recommended action (or ``"Monitor"`` when none is defined for
        that segment).
    """
    features = [[recency_days, num_orders, total_spent]]
    # A scaler fitted on a DataFrame remembers its column names, and
    # scikit-learn warns on every transform() call that receives unnamed data.
    # Rebuild the row with the fitted column names whenever they are known.
    fitted_names = getattr(scaler, "feature_names_in_", None)
    if fitted_names is not None:
        features = pd.DataFrame(features, columns=list(fitted_names))
    sample = scaler.transform(features)

    # Cast the NumPy integer to a plain int for the lookup and fallback name.
    segment_id = int(km_final.predict(sample)[0])
    segment_name = label_map.get(segment_id, f"Segment {segment_id}")
    actions = {
        "Champions": "Send VIP exclusive offers and early access",
        "Loyal": "Reward with loyalty points, upsell premium products",
        "At Risk": "Send win-back campaign with 20% discount",
        "Lost": "Last-chance email campaign or remove from active",
        "Potential Loyalists": "Nurture with onboarding sequence",
    }
    return {"segment": segment_name, "action": actions.get(segment_name, "Monitor")}

print("\nSample predictions:")
# (recency_days, num_orders, total_spent) for two contrasting customers:
# a recent, frequent, high-value one and an old, rare, low-value one.
for sample_args in [(15, 25, 1200), (180, 2, 45)]:
    print(score_customer(*sample_args))

Tip

Practice the customer segmentation mini project in small, isolated experiments before integrating it into larger projects. Breaking concepts into small experiments builds genuine understanding faster than reading alone.

Diagram

Loading diagram…

Neural networks learn by adjusting connection weights via backpropagation

Practice Task

Note

Practice Task — (1) Write a working version of the customer segmentation mini project from scratch without looking at your notes. (2) Modify it to handle an edge case (empty input, null value, or error state). (3) Share your solution in the Priygop community for feedback.

Quick Quiz

Common Mistake

Warning

A common mistake in the customer segmentation mini project is skipping edge-case testing — empty inputs, null values, and unexpected data types. Always validate boundary conditions to write robust, production-ready ML code.

Topics in This Module