Mini Project: Complete AutoML-Style Pipeline
Build a complete AutoML-style experiment: automatically preprocess a mixed-type dataset, run multiple algorithm families, use Optuna to deeply tune the best performer, evaluate with nested CV, and export the winner as a production-ready serialized pipeline.
AutoML-Style Pipeline: Full Experiment
import numpy as np
import pandas as pd
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)
import joblib
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PowerTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score, train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import xgboost as xgb
np.random.seed(42)  # global seed: every np.random draw below depends on this
N = 4000  # number of synthetic loan applications
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
# DATASET
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
# Synthetic credit-default dataset: 5 numeric features, 2 categoricals,
# and an imbalanced binary target (~17% defaults).
# NOTE: the draw order of these columns defines the RNG stream — do not
# reorder the dict entries or the generated data changes.
df = pd.DataFrame({
    "age": np.random.normal(40, 12, N).clip(18, 75),
    "income": np.random.exponential(60000, N).clip(15000, 300000),
    "credit": np.random.normal(680, 80, N).clip(300, 850),
    "loan_amt": np.random.exponential(20000, N).clip(1000, 100000),
    "term": np.random.choice([12,24,36,48,60], N),
    "educ": np.random.choice(["hs","bachelor","master","phd"], N, p=[0.3,0.4,0.2,0.1]),
    "employ": np.random.choice(["full","part","self","unemp"], N, p=[0.6,0.15,0.15,0.1]),
    "default": np.random.choice([0,1], N, p=[0.83,0.17]),
})
# Inject missingness into three columns so the imputers have work to do.
# All three calls use random_state=1, so the sampled row indices overlap
# heavily across columns (presumably intentional — verify if that matters).
for col, frac in [("income",0.08),("credit",0.05),("educ",0.03)]:
    df.loc[df.sample(int(N*frac), random_state=1).index, col] = np.nan
X = df.drop("default", axis=1)
y = df["default"]
# Hold out 15% as a final test set; stratify to preserve the class ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
# SHARED PREPROCESSOR
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
# Feature lists consumed by the shared preprocessor.
num_cols = ["age","income","credit","loan_amt","term"]
cat_cols = ["educ","employ"]

# Numeric branch: median imputation, then a Yeo-Johnson power transform
# (PowerTransformer default) to tame the skewed income/loan distributions.
numeric_branch = Pipeline([
    ("imp", SimpleImputer(strategy="median")),
    ("yeo", PowerTransformer()),
])

# Categorical branch: mode imputation, then dense one-hot encoding that
# drops the first level and ignores categories unseen during fit.
categorical_branch = Pipeline([
    ("imp", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(drop="first",sparse_output=False,handle_unknown="ignore")),
])

# Single preprocessor shared by every candidate model.
prep = ColumnTransformer([
    ("num", numeric_branch, num_cols),
    ("cat", categorical_branch, cat_cols),
])

# Shared splitter so all models are scored on identical folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
# STEP 1: ALGORITHM SELECTION
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
# Candidate model families, each wrapped later in the shared preprocessor.
algorithms = {
    "LogReg": LogisticRegression(C=1.0, class_weight="balanced", max_iter=1000, random_state=42),
    "RF": RandomForestClassifier(n_estimators=100, class_weight="balanced", random_state=42, n_jobs=-1),
    "GBM": GradientBoostingClassifier(n_estimators=200, learning_rate=0.05, random_state=42),
    "XGB": xgb.XGBClassifier(n_estimators=200, learning_rate=0.05, verbosity=0, random_state=42, n_jobs=-1),
}
print("Step 1: Algorithm Selection (5-fold CV AUC):")
# Score every family with the same folds; keep the mean AUC per name.
algo_results = {}
for algo_name, estimator in algorithms.items():
    candidate = Pipeline([("prep", prep), ("model", estimator)])
    fold_scores = cross_val_score(candidate, X_train, y_train, cv=cv, scoring="roc_auc")
    mean_auc = fold_scores.mean()
    algo_results[algo_name] = mean_auc
    print(f" {algo_name:10s}: {mean_auc:.4f}")
# Winner = family with the highest mean CV AUC; tuned in Step 2.
winner = max(algo_results, key=algo_results.get)
print(f" Best: {winner} ({algo_results[winner]:.4f})")
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
# STEP 2: OPTUNA TUNING ON WINNER
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
print("\nStep 2: Optuna tuning of best algorithm (40 trials)...")
def xgb_objective(trial: optuna.Trial) -> float:
    """Optuna objective: mean 5-fold CV ROC-AUC for one XGBoost config.

    Samples a hyperparameter configuration from the search space, wraps
    the classifier in the shared preprocessing pipeline, and returns the
    mean cross-validated AUC for Optuna to maximize.
    """
    # Tunable search space.
    search = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 600),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 7),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-6, 10, log=True),
    }
    # Fixed settings kept out of the search space.
    search.update(verbosity=0, n_jobs=-1, random_state=42)
    candidate = Pipeline([("prep", prep), ("model", xgb.XGBClassifier(**search))])
    fold_scores = cross_val_score(candidate, X_train, y_train, cv=cv, scoring="roc_auc")
    return fold_scores.mean()
# Seeded TPE sampler so the search trajectory is reproducible run-to-run.
tpe = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction="maximize", sampler=tpe)
study.optimize(xgb_objective, n_trials=40, show_progress_bar=True)
# Compare tuned CV AUC against the untuned Step-1 score of the winner.
print(f" Best AUC after tuning: {study.best_value:.4f} (was {algo_results[winner]:.4f})")
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
# STEP 3: FINAL MODEL & EVALUATION
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
# Refit the tuned configuration on the entire training split.
# study.best_params holds only the sampled hyperparameters, so the fixed
# verbosity/random_state settings are re-supplied here.
final_model = Pipeline([
    ("prep", prep),
    ("model", xgb.XGBClassifier(**study.best_params, verbosity=0, random_state=42)),
])
final_model.fit(X_train, y_train)

# Held-out evaluation: probabilities drive AUC, hard labels drive the report.
y_prob = final_model.predict_proba(X_test)[:, 1]
y_pred = final_model.predict(X_test)
test_auc = roc_auc_score(y_test, y_prob)
print(f"\nStep 3: Final test AUC: {test_auc:.4f}")
report = classification_report(y_test, y_pred, target_names=["No Default","Default"])
print(report)
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
# STEP 4: SAVE PRODUCTION ARTIFACT
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
# One import per line (PEP 8); both are stdlib.
import datetime
import json

# Provenance metadata stored next to the serialized pipeline so the
# artifact can be audited without unpickling it.
metadata = {
    "model": "XGBoost",
    "best_params": study.best_params,
    "cv_auc": study.best_value,
    # roc_auc_score returns a numpy scalar; coerce to a plain float so the
    # JSON payload is portable regardless of serializer behavior.
    "test_auc": float(test_auc),
    "n_train": len(X_train),
    "created_at": datetime.datetime.now().isoformat(),
}
joblib.dump(final_model, "final_pipeline_v1.joblib")
with open("model_metadata.json", "w") as f:
    json.dump(metadata, f, indent=2)
# BUGFIX: the original final line had the prose word "Tip" fused onto the
# end of this print call, which made the script a syntax error.
print("\nSaved: final_pipeline_v1.joblib + model_metadata.json")
Tip
Practice the Mini Project (Complete AutoML-Style Pipeline) in small, isolated examples before integrating it into larger projects. Breaking concepts into small experiments builds genuine understanding faster than reading alone.
80% of ML work is data preparation — garbage in = garbage out
Practice Task
Note
Practice Task — (1) Write a working example of the Mini Project (Complete AutoML-Style Pipeline) from scratch without looking at notes. (2) Modify it to handle an edge case (empty input, null value, or error state). (3) Share your solution in the Priygop community for feedback.
Quick Quiz
Common Mistake
Warning
A common mistake with the Mini Project (Complete AutoML-Style Pipeline) is skipping edge-case testing — empty inputs, null values, and unexpected data types. Always validate boundary conditions to write robust, production-ready ML code.