Mini Project: House Price Prediction
Build a complete house price prediction system: load and explore the California Housing dataset, perform EDA, build a preprocessing pipeline, compare multiple regression models, tune the best one, analyze residuals, and create a prediction function. This is the classic end-to-end regression ML project every data scientist has in their portfolio.
Complete House Price Prediction Pipeline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, r2_score
import joblib
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
# STEP 1: LOAD & FEATURE ENGINEERING
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
# Load the California Housing dataset (~20k block groups, 8 numeric
# features; target = median house value in units of $100,000).
housing = fetch_california_housing()
df = pd.DataFrame(housing.data, columns=housing.feature_names)
df["MedHouseVal"] = housing.target

# EDA-driven feature engineering: ratio features are often more
# predictive than the raw per-household averages.
df["rooms_per_person"] = df["AveRooms"] / df["AveOccup"]
df["bedrooms_ratio"] = df["AveBedrms"] / df["AveRooms"]
df["income_per_room"] = df["MedInc"] / (df["AveRooms"] + 1)  # +1 guards against blow-up when AveRooms is tiny

# Clip engineered ratios to remove unrealistic values produced by block
# groups with extreme AveOccup / AveRooms.
df["rooms_per_person"] = df["rooms_per_person"].clip(0, 15)
df["bedrooms_ratio"] = df["bedrooms_ratio"].clip(0, 1)

# BUG FIX: housing.feature_names is a plain Python list (not a numpy
# array), so it has no .tolist() method — copy it with list() instead.
feature_cols = list(housing.feature_names) + ["rooms_per_person", "bedrooms_ratio", "income_per_room"]
X = df[feature_cols]
y = df["MedHouseVal"]
print(f"Features: {feature_cols}")
print(f"Shape: {X.shape}")
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
# STEP 2: SPLIT
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
# Carve off a held-out test set first (15% of all rows), then split the
# remainder again so roughly 12.75% of the original data becomes the
# validation set. Fixed seeds keep the splits reproducible.
X_pool, X_test, y_pool, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_pool, y_pool, test_size=0.15, random_state=42)
print(f"Train: {len(X_train)} | Val: {len(X_val)} | Test: {len(X_test)}")
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
# STEP 3: MODEL COMPARISON
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
# Candidate models, each wrapped in a Pipeline. Only Ridge gets a
# scaler: tree ensembles are invariant to monotone feature scaling.
models = {
    "Ridge": Pipeline([
        ("scaler", StandardScaler()),
        ("model", Ridge(alpha=10.0)),
    ]),
    "Random Forest": Pipeline([
        ("model", RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)),
    ]),
    "Gradient Boost": Pipeline([
        ("model", GradientBoostingRegressor(n_estimators=200, learning_rate=0.1, max_depth=5, random_state=42)),
    ]),
}

print("\nModel comparison (5-fold CV on training data):")
for model_name, candidate in models.items():
    fold_scores = cross_val_score(candidate, X_train, y_train, cv=5, scoring="r2")
    print(f" {model_name:<18}: CV R2 = {fold_scores.mean():.4f} +/- {fold_scores.std():.4f}")
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
# STEP 4: TUNE BEST MODEL
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
# Exhaustive grid search (2 x 2 x 3 = 12 candidates, 5 folds each) over
# the gradient-boosting hyperparameters that matter most here.
gb_pipeline = Pipeline([("model", GradientBoostingRegressor(random_state=42))])
param_grid = {
    "model__n_estimators": [200, 300],
    "model__learning_rate": [0.05, 0.1],
    "model__max_depth": [4, 5, 6],
}
search = GridSearchCV(gb_pipeline, param_grid, cv=5, scoring="r2", n_jobs=-1, verbose=0)
search.fit(X_train, y_train)
print(f"\nBest params: {search.best_params_}")
print(f"Best CV R2: {search.best_score_:.4f}")
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
# STEP 5: FINAL EVALUATION
# ━━━━━━━━━━━━━━━━━━━━━━━━━━
# GridSearchCV uses refit=True by default, so best_estimator_ has
# already been retrained on the full training set — calling .fit()
# again here would only repeat that expensive training. Removed.
best_model = search.best_estimator_
val_r2 = best_model.score(X_val, y_val)
test_r2 = best_model.score(X_test, y_test)
test_mae = mean_absolute_error(y_test, best_model.predict(X_test))
print(f"\nFinal Results:")
print(f" Validation R2: {val_r2:.4f}")
print(f" Test R2: {test_r2:.4f} (honest estimate)")
# Target is in units of $100,000 — scale the MAE back to dollars.
print(f" Test MAE: ${test_mae * 100_000:,.0f} (average prediction error)")

# SAVE MODEL
joblib.dump(best_model, "house_price_model.joblib")
print("\nModel saved to house_price_model.joblib")
# PREDICTION FUNCTION
def predict_house_price(
    rooms: float,
    income: float,
    lat: float,
    lon: float,
    *,
    house_age: float = 30,
    population: float = 1500,
    occupancy: float = 2.5,
    bedrooms_ratio: float = 0.3,
) -> str:
    """Predict a house price from a few headline features.

    The remaining dataset features default to typical California values
    but can now be overridden via keyword-only arguments (backward
    compatible with the original 4-argument call).

    Args:
        rooms: Average rooms per household (AveRooms).
        income: Median income in units of $10,000 (MedInc).
        lat, lon: Block-group latitude / longitude.
        house_age, population, occupancy, bedrooms_ratio: Optional
            overrides for the remaining model inputs.

    Returns:
        Formatted dollar string (model target is in $100,000 units).
    """
    sample = pd.DataFrame([{
        "MedInc": income, "HouseAge": house_age, "AveRooms": rooms,
        "AveBedrms": rooms * bedrooms_ratio, "Population": population,
        "AveOccup": occupancy,
        "Latitude": lat, "Longitude": lon,
        # Engineered features must be recomputed exactly as in training.
        "rooms_per_person": rooms / occupancy,
        "bedrooms_ratio": bedrooms_ratio,
        "income_per_room": income / (rooms + 1),
    }])[feature_cols]
    pred = best_model.predict(sample)[0]
    return f"${pred * 100_000:,.0f}"
# Sanity-check predictions for two well-known markets.
# BUG FIX: the original final line had stray text ("Tip") fused onto it
# by the page extraction, which made the script a SyntaxError — removed.
print("\nSample predictions:")
print(f" LA: {predict_house_price(5, 8, 34.05, -118.25)}")
print(f" SF: {predict_house_price(4, 10, 37.77, -122.43)}")
Tip
Practice the House Price Prediction mini project in small, isolated examples before integrating it into larger projects. Breaking concepts into small experiments builds genuine understanding faster than reading alone.
80% of ML work is data preparation — garbage in = garbage out
Practice Task
Note
Practice Task — (1) Write a working example of Mini Project House Price Prediction from scratch without looking at notes. (2) Modify it to handle an edge case (empty input, null value, or error state). (3) Share your solution in the Priygop community for feedback.
Quick Quiz
Common Mistake
Warning
A common mistake with the House Price Prediction mini project is skipping edge-case testing — empty inputs, null values, and unexpected data types. Always validate boundary conditions to write robust, production-ready ML code.