Mini Project: Complete AutoML-Style Pipeline
Build a complete AutoML-style experiment: automatically preprocess a mixed-type dataset, run multiple algorithm families, use Optuna to deeply tune the best performer, evaluate with nested CV, and export the winner as a production-ready serialized pipeline.
AutoML-Style Pipeline: Full Experiment
import numpy as np
import pandas as pd
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)
import joblib
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PowerTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score, train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import xgboost as xgb
np.random.seed(42)  # global seed: every np.random draw below depends on this
N = 4000  # number of synthetic loan applications
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
# DATASET
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
# Synthetic credit-default dataset: 5 numeric features, 2 categoricals,
# and an imbalanced binary target (~17% defaults).
# NOTE: the draw order of these columns defines the RNG stream — do not
# reorder the dict entries or the generated data changes.
df = pd.DataFrame({
    "age": np.random.normal(40, 12, N).clip(18, 75),
    "income": np.random.exponential(60000, N).clip(15000, 300000),
    "credit": np.random.normal(680, 80, N).clip(300, 850),
    "loan_amt": np.random.exponential(20000, N).clip(1000, 100000),
    "term": np.random.choice([12,24,36,48,60], N),
    "educ": np.random.choice(["hs","bachelor","master","phd"], N, p=[0.3,0.4,0.2,0.1]),
    "employ": np.random.choice(["full","part","self","unemp"], N, p=[0.6,0.15,0.15,0.1]),
    "default": np.random.choice([0,1], N, p=[0.83,0.17]),
})
# Inject missingness into three columns so the imputers have work to do.
# All three calls use random_state=1, so the sampled row indices overlap
# heavily across columns (presumably intentional — verify if that matters).
for col, frac in [("income",0.08),("credit",0.05),("educ",0.03)]:
    df.loc[df.sample(int(N*frac), random_state=1).index, col] = np.nan
X = df.drop("default", axis=1)
y = df["default"]
# Hold out 15% as a final test set; stratify to preserve the class ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
# SHARED PREPROCESSOR
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
# Feature lists consumed by the shared preprocessor.
num_cols = ["age","income","credit","loan_amt","term"]
cat_cols = ["educ","employ"]

# Numeric branch: median imputation, then a Yeo-Johnson power transform
# (PowerTransformer default) to tame the skewed income/loan distributions.
numeric_branch = Pipeline([
    ("imp", SimpleImputer(strategy="median")),
    ("yeo", PowerTransformer()),
])

# Categorical branch: mode imputation, then dense one-hot encoding that
# drops the first level and ignores categories unseen during fit.
categorical_branch = Pipeline([
    ("imp", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(drop="first",sparse_output=False,handle_unknown="ignore")),
])

# Single preprocessor shared by every candidate model.
prep = ColumnTransformer([
    ("num", numeric_branch, num_cols),
    ("cat", categorical_branch, cat_cols),
])

# Shared splitter so all models are scored on identical folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
# STEP 1: ALGORITHM SELECTION
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
# Candidate model families, each wrapped later in the shared preprocessor.
algorithms = {
    "LogReg": LogisticRegression(C=1.0, class_weight="balanced", max_iter=1000, random_state=42),
    "RF": RandomForestClassifier(n_estimators=100, class_weight="balanced", random_state=42, n_jobs=-1),
    "GBM": GradientBoostingClassifier(n_estimators=200, learning_rate=0.05, random_state=42),
    "XGB": xgb.XGBClassifier(n_estimators=200, learning_rate=0.05, verbosity=0, random_state=42, n_jobs=-1),
}
print("Step 1: Algorithm Selection (5-fold CV AUC):")
# Score every family with the same folds; keep the mean AUC per name.
algo_results = {}
for algo_name, estimator in algorithms.items():
    candidate = Pipeline([("prep", prep), ("model", estimator)])
    fold_scores = cross_val_score(candidate, X_train, y_train, cv=cv, scoring="roc_auc")
    mean_auc = fold_scores.mean()
    algo_results[algo_name] = mean_auc
    print(f" {algo_name:10s}: {mean_auc:.4f}")
# Winner = family with the highest mean CV AUC; tuned in Step 2.
winner = max(algo_results, key=algo_results.get)
print(f" Best: {winner} ({algo_results[winner]:.4f})")
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
# STEP 2: OPTUNA TUNING ON WINNER
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
print("\nStep 2: Optuna tuning of best algorithm (40 trials)...")
def xgb_objective(trial: optuna.Trial) -> float:
    """Optuna objective: mean 5-fold CV ROC-AUC for one XGBoost config.

    Samples a hyperparameter configuration from the search space, wraps
    the classifier in the shared preprocessing pipeline, and returns the
    mean cross-validated AUC for Optuna to maximize.
    """
    # Tunable search space.
    search = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 600),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 7),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-6, 10, log=True),
    }
    # Fixed settings kept out of the search space.
    search.update(verbosity=0, n_jobs=-1, random_state=42)
    candidate = Pipeline([("prep", prep), ("model", xgb.XGBClassifier(**search))])
    fold_scores = cross_val_score(candidate, X_train, y_train, cv=cv, scoring="roc_auc")
    return fold_scores.mean()
# Seeded TPE sampler so the search trajectory is reproducible run-to-run.
tpe = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction="maximize", sampler=tpe)
study.optimize(xgb_objective, n_trials=40, show_progress_bar=True)
# Compare tuned CV AUC against the untuned Step-1 score of the winner.
print(f" Best AUC after tuning: {study.best_value:.4f} (was {algo_results[winner]:.4f})")
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
# STEP 3: FINAL MODEL & EVALUATION
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
# Refit the tuned configuration on the entire training split.
# study.best_params holds only the sampled hyperparameters, so the fixed
# verbosity/random_state settings are re-supplied here.
final_model = Pipeline([
    ("prep", prep),
    ("model", xgb.XGBClassifier(**study.best_params, verbosity=0, random_state=42)),
])
final_model.fit(X_train, y_train)

# Held-out evaluation: probabilities drive AUC, hard labels drive the report.
y_prob = final_model.predict_proba(X_test)[:, 1]
y_pred = final_model.predict(X_test)
test_auc = roc_auc_score(y_test, y_prob)
print(f"\nStep 3: Final test AUC: {test_auc:.4f}")
report = classification_report(y_test, y_pred, target_names=["No Default","Default"])
print(report)
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
# STEP 4: SAVE PRODUCTION ARTIFACT
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
# One import per line (PEP 8); both are stdlib.
import datetime
import json

# Provenance metadata stored next to the serialized pipeline so the
# artifact can be audited without unpickling it.
metadata = {
    "model": "XGBoost",
    "best_params": study.best_params,
    "cv_auc": study.best_value,
    # roc_auc_score returns a numpy scalar; coerce to a plain float so the
    # JSON payload is portable regardless of serializer behavior.
    "test_auc": float(test_auc),
    "n_train": len(X_train),
    "created_at": datetime.datetime.now().isoformat(),
}
joblib.dump(final_model, "final_pipeline_v1.joblib")
with open("model_metadata.json", "w") as f:
    json.dump(metadata, f, indent=2)
# BUGFIX: the original final line had the prose word "Tip" fused onto the
# end of this print call, which made the script a syntax error.
print("\nSaved: final_pipeline_v1.joblib + model_metadata.json")
Tip
Practice the Mini Project (Complete AutoML-Style Pipeline) in small, isolated examples before integrating it into larger projects. Breaking concepts into small experiments builds genuine understanding faster than reading alone.
80% of ML work is data preparation — garbage in = garbage out
Practice Task
Note
Practice Task — (1) Write a working example of the Mini Project (Complete AutoML-Style Pipeline) from scratch without looking at notes. (2) Modify it to handle an edge case (empty input, null value, or error state). (3) Share your solution in the Priygop community for feedback.
Quick Quiz
Common Mistake
Warning
A common mistake with the Mini Project (Complete AutoML-Style Pipeline) is skipping edge-case testing — empty inputs, null values, and unexpected data types. Always validate boundary conditions to write robust, production-ready ML code.