Comparing Regression Algorithms
Linear regression isn't always the best tool for a regression problem. Decision trees can model non-linear relationships. Random forests and gradient boosting capture feature interactions and are relatively robust to outliers. SVR can work well in high-dimensional spaces. Knowing when to reach for which algorithm, and how to compare candidates fairly with cross-validation, is a core ML skill.
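The script below benchmarks eight regressors on the California Housing dataset, wrapping each in a pipeline and scoring with 5-fold cross-validated R2: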
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

housing = fetch_california_housing()
X, y = housing.data, housing.target

# All models wrapped in pipelines; scale-sensitive ones get a StandardScaler,
# which is re-fit inside each CV fold (no test-fold leakage)
regression_models = {
    "Linear Regression": Pipeline([("scaler", StandardScaler()), ("model", LinearRegression())]),
    "Ridge (alpha=1.0)": Pipeline([("scaler", StandardScaler()), ("model", Ridge(alpha=1.0))]),
    "Lasso (alpha=0.01)": Pipeline([("scaler", StandardScaler()), ("model", Lasso(alpha=0.01, max_iter=5000))]),
    "Decision Tree": Pipeline([("model", DecisionTreeRegressor(max_depth=8, random_state=42))]),
    "Random Forest": Pipeline([("model", RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1))]),
    "Gradient Boosting": Pipeline([("model", GradientBoostingRegressor(n_estimators=200, learning_rate=0.1, max_depth=4, random_state=42))]),
    "SVR (RBF)": Pipeline([("scaler", StandardScaler()), ("model", SVR(kernel="rbf", C=1.0))]),
    "KNN (k=10)": Pipeline([("scaler", StandardScaler()), ("model", KNeighborsRegressor(n_neighbors=10))]),
}

# Contextual notes, defined once outside the loop
notes = {
    "Linear Regression": "baseline -- interpret coefficients",
    "Ridge (alpha=1.0)": "L2 regularized -- handles multicollinearity",
    "Lasso (alpha=0.01)": "L1 -- automatic feature selection",
    "Decision Tree": "interpretable -- prone to overfit",
    "Random Forest": "robust -- needs more memory",
    "Gradient Boosting": "often best -- slowest to train",
    "SVR (RBF)": "good for smaller datasets",
    "KNN (k=10)": "no explicit training -- slow at predict",
}

print(f"{'Algorithm':<25} {'CV R2 (mean)':>12} {'CV R2 (std)':>12} {'Notes'}")
print("-" * 80)

results = {}
for name, model in regression_models.items():
    scores = cross_val_score(model, X, y, cv=5, scoring="r2", n_jobs=-1)
    results[name] = scores.mean()
    print(f"{name:<25} {scores.mean():>12.4f} {scores.std():>12.4f} {notes.get(name, '')}")

best_model = max(results, key=results.get)
print(f"\nBest model: {best_model} (CV R2 = {results[best_model]:.4f})")
Tip
Practice comparing regression algorithms in small, isolated examples before integrating the technique into a larger project. Breaking a concept into small experiments builds genuine understanding faster than reading alone.
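For example, a self-contained toy experiment on synthetic data (using scikit-learn's make_regression; the sizes and noise level here are arbitrary choices for illustration) might look like:

from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Tiny synthetic problem: 300 samples, 5 features, moderate noise
X, y = make_regression(n_samples=300, n_features=5, noise=10.0, random_state=0)

for model in (LinearRegression(), RandomForestRegressor(n_estimators=50, random_state=0)):
    scores = cross_val_score(model, X, y, cv=5, scoring="r2")
    print(f"{type(model).__name__:<22} R2 = {scores.mean():.3f} +/- {scores.std():.3f}")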
Practice Task
(1) Write a working comparison of several regression algorithms from scratch, without looking at notes. (2) Modify it to handle an edge case (empty input, null values, or an error state). (3) Share your solution in the Priygop community for feedback.
Common Mistake
A common mistake when comparing regression algorithms is skipping edge-case testing: empty inputs, null or NaN values, and unexpected data types. Always validate boundary conditions if you want robust, production-ready ML code.
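As one illustration, here is a sketch of the kind of guard clauses this warning has in mind; validate_inputs is a hypothetical helper written for this example, not part of scikit-learn:

import numpy as np

def validate_inputs(X, y):
    # Hypothetical pre-flight checks before fitting or cross-validating
    X = np.asarray(X, dtype=float)  # raises on non-numeric types; None becomes NaN
    y = np.asarray(y, dtype=float)
    if X.size == 0 or y.size == 0:
        raise ValueError("Empty input: X and y need at least one sample")
    if X.shape[0] != y.shape[0]:
        raise ValueError(f"Length mismatch: {X.shape[0]} rows in X vs {y.shape[0]} targets")
    if np.isnan(X).any() or np.isnan(y).any():
        raise ValueError("NaN values present: impute or drop them before comparing models")
    return X, y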