Mini Project: Titanic Survival Classifier
The Titanic dataset is one of the most famous classification benchmarks in ML: predict whether a passenger survived from features such as class, age, sex, fare, and family size. This project covers the complete pipeline: feature engineering, handling missing values, categorical encoding, model comparison, hyperparameter tuning, and feature importance analysis. Rather than loading the original CSV, the script below simulates a dataset with realistic proportions.
Titanic End-to-End Classification
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
# SIMULATE TITANIC DATA (realistic proportions)
np.random.seed(42)
N = 891
df = pd.DataFrame({
    "Pclass": np.random.choice([1, 2, 3], N, p=[0.25, 0.21, 0.54]),
    "Sex": np.random.choice(["male", "female"], N, p=[0.65, 0.35]),
    "Age": np.random.normal(29, 14, N).clip(0.5, 80),
    "SibSp": np.random.choice(range(9), N, p=[0.68, 0.23, 0.03, 0.02, 0.01, 0.01, 0.01, 0.005, 0.005]),
    "Parch": np.random.choice(range(7), N, p=[0.76, 0.13, 0.09, 0.01, 0.005, 0.003, 0.002]),
    "Fare": np.random.exponential(32, N).clip(0, 512),
    # Use np.nan in an object array (not None) so SimpleImputer's default
    # missing_values=np.nan detects it; probabilities must sum exactly to 1
    "Embarked": np.random.choice(np.array(["S", "C", "Q", np.nan], dtype=object), N, p=[0.718, 0.19, 0.09, 0.002]),
})
# Survival: women and higher class had better survival (historical)
df["Survived"] = (
(df["Sex"] == "female").astype(int) * 0.5 +
(df["Pclass"] == 1).astype(int) * 0.3 +
np.random.uniform(0, 1, N) * 0.2
) > 0.4
df["Survived"] = df["Survived"].astype(int)
# Inject missing values (realistic)
df.loc[df.sample(177, random_state=1).index, "Age"] = np.nan
df.loc[df.sample(2, random_state=2).index, "Embarked"] = np.nan  # np.nan, not None, so the imputer sees it
print(f"Dataset: {df.shape} | Survival rate: {df['Survived'].mean():.1%}")
# FEATURE ENGINEERING
df["FamilySize"] = df["SibSp"] + df["Parch"] + 1
df["IsAlone"] = (df["FamilySize"] == 1).astype(int)
df["ChildGuard"] = ((df["Age"] < 16) & (df["Parch"] > 0)).astype(int)
feature_cols = ["Pclass", "Sex", "Age", "Fare", "Embarked", "FamilySize", "IsAlone", "ChildGuard"]
X = df[feature_cols]
y = df["Survived"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# PREPROCESSING
num_cols = ["Age", "Fare", "FamilySize", "IsAlone", "ChildGuard"]
cat_cols = ["Sex", "Embarked"]
ord_cols = ["Pclass"]
preprocessor = ColumnTransformer([
    ("num", Pipeline([("imp", SimpleImputer(strategy="median")), ("sc", StandardScaler())]), num_cols),
    # OneHotEncoder's sparse_output parameter requires scikit-learn >= 1.2
    ("cat", Pipeline([("imp", SimpleImputer(strategy="most_frequent")), ("ohe", OneHotEncoder(drop="first", sparse_output=False, handle_unknown="ignore"))]), cat_cols),
    ("ord", OrdinalEncoder(categories=[[1, 2, 3]]), ord_cols),
])
# MODEL COMPARISON
models = {
    "Logistic Regression": LogisticRegression(C=0.5, class_weight="balanced", max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, max_depth=6, class_weight="balanced", random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=200, learning_rate=0.05, max_depth=3, random_state=42),
}
print("\nCV AUC-ROC (5-fold):")
for name, model in models.items():
    pipe = Pipeline([("prep", preprocessor), ("model", model)])
    auc = cross_val_score(pipe, X_train, y_train, cv=5, scoring="roc_auc").mean()
    print(f" {name:<22}: {auc:.4f}")
# BEST MODEL: GRADIENT BOOSTING
best_pipe = Pipeline([("prep", preprocessor), ("model", GradientBoostingClassifier(n_estimators=200, learning_rate=0.05, max_depth=3, random_state=42))])
best_pipe.fit(X_train, y_train)
y_pred = best_pipe.predict(X_test)
y_prob = best_pipe.predict_proba(X_test)[:, 1]
test_auc = roc_auc_score(y_test, y_prob)
print(f"\nTest AUC-ROC: {test_auc:.4f}")
print(classification_report(y_test, y_pred, target_names=["Did not survive", "Survived"]))
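# Confusion matrix on the test split (rows = actual, columns = predicted)
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))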
# PREDICTION FUNCTION
def predict_survival(pclass: int, sex: str, age: float, fare: float) -> str:
    sample = pd.DataFrame([{"Pclass": pclass, "Sex": sex, "Age": age, "Fare": fare,
                            "Embarked": "S", "FamilySize": 1, "IsAlone": 1, "ChildGuard": 0}])
    prob = best_pipe.predict_proba(sample)[0, 1]
    return f"{prob:.1%} survival probability"
print("\nPredictions:")
print(f" 1st class female, 28: {predict_survival(1, 'female', 28, 100)}")
print(f" 3rd class male, 35: {predict_survival(3, 'male', 35, 8)}")Tip
Tip
Practice the pieces of this project (pipelines, imputers, encoders) in small, isolated examples before integrating them into larger projects. Breaking concepts into small experiments builds genuine understanding faster than reading alone.
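For instance, here is a minimal, self-contained warm-up (toy data, all values invented) that isolates just the impute, encode, and fit steps:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

toy = pd.DataFrame({"Sex": ["male", "female", np.nan, "female"],
                    "Age": [22.0, np.nan, 30.0, 41.0],
                    "Survived": [0, 1, 0, 1]})
prep = ColumnTransformer([
    ("num", SimpleImputer(strategy="median"), ["Age"]),
    ("cat", Pipeline([("imp", SimpleImputer(strategy="most_frequent")),
                      ("ohe", OneHotEncoder(handle_unknown="ignore"))]), ["Sex"]),
])
clf = Pipeline([("prep", prep), ("model", LogisticRegression())])
clf.fit(toy[["Sex", "Age"]], toy["Survived"])
print(clf.predict(toy[["Sex", "Age"]]))  # predictions on the training rows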
Quick Quiz
Which metric is the harmonic mean of precision and recall? Answer: F1, a balanced metric that is only high when both precision and recall are high.
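To verify the formula numerically, reusing y_test and y_pred from the script above (both precision and recall are nonzero here, so the division is safe):
from sklearn.metrics import f1_score, precision_score, recall_score

p = precision_score(y_test, y_pred)
r = recall_score(y_test, y_pred)
print(f"manual F1: {2 * p * r / (p + r):.4f} | sklearn F1: {f1_score(y_test, y_pred):.4f}")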
Practice Task
(1) Rewrite this Titanic classifier from scratch without looking at notes. (2) Modify it to handle an edge case (an empty input, a missing value, or an unseen category). (3) Share your solution in the Priygop community for feedback.
Common Mistake
A common mistake with a project like this is skipping edge-case testing: empty inputs, missing values, and unexpected data types. Always validate boundary conditions to write robust, production-ready ML code.
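As a sketch of that kind of validation for this project's prediction helper (the function name, checks, and messages below are illustrative additions, reusing best_pipe, pd, and np from the script above):
def predict_survival_safe(pclass, sex, age, fare):
    # Reject values the pipeline was never trained on
    if pclass not in (1, 2, 3):
        raise ValueError(f"pclass must be 1, 2, or 3, got {pclass!r}")
    if sex not in ("male", "female"):
        raise ValueError(f"sex must be 'male' or 'female', got {sex!r}")
    if fare is not None and fare < 0:
        raise ValueError(f"fare must be non-negative, got {fare!r}")
    # Missing age/fare become NaN and are handled by the pipeline's median imputer
    sample = pd.DataFrame([{"Pclass": pclass, "Sex": sex,
                            "Age": np.nan if age is None else age,
                            "Fare": np.nan if fare is None else fare,
                            "Embarked": "S", "FamilySize": 1, "IsAlone": 1, "ChildGuard": 0}])
    return f"{best_pipe.predict_proba(sample)[0, 1]:.1%} survival probability"

print(predict_survival_safe(2, "female", None, 26.0))  # missing age is imputed, not an error
Without the guard, an out-of-range Pclass would surface as an OrdinalEncoder error deep inside the pipeline; failing fast at the boundary gives a much clearer message.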