Regularization — Preventing Overfitting
Overfitting means the model memorizes the training data but fails on new examples. Regularization techniques push the model toward patterns that generalize. The most widely used are Dropout, BatchNorm, weight decay, early stopping, and data augmentation; the first four are covered below.
Regularization Techniques
import torch
import torch.nn as nn
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# 1. DROPOUT — random neuron deactivation
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# During training: randomly zero a fraction p of activations; survivors scaled by 1/(1-p)
# During inference: all neurons active, no scaling (PyTorch uses inverted dropout)
# Effect: forces network to learn REDUNDANT representations
# Use: after every hidden layer in MLP/CNN, p=0.1-0.5
dropout = nn.Dropout(p=0.3)
x = torch.ones(5)
print("Train mode:", dropout(x)) # ~30% zeros randomly
dropout.eval()
print("Eval mode: ", dropout(x)) # all ones — Dropout disabled in eval
# CRITICAL: remember to call model.eval() during inference!
# model.train() → Dropout active, BatchNorm uses batch stats
# model.eval() → Dropout disabled, BatchNorm uses running stats
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# 2. BATCH NORMALIZATION — normalize layer inputs
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# Normalizes each feature across the batch to mean=0, std=1
# Then applies learnable scale (gamma) and shift (beta)
# Effect: smoother loss landscape → faster training, higher lr allowed
# Where to place: AFTER linear/conv, BEFORE activation
model_with_bn = nn.Sequential(
    nn.Linear(128, 64),
    nn.BatchNorm1d(64),  # normalize 64 features across the batch
    nn.ReLU(),
    nn.Dropout(0.2),
    nn.Linear(64, 32),
    nn.BatchNorm1d(32),
    nn.ReLU(),
    nn.Linear(32, 10),
)
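# Quick sanity check (random demo batch, not a real dataset): the same network
# behaves differently per mode, as described in the Dropout section above
batch = torch.randn(16, 128)   # 16 samples, 128 features
model_with_bn.train()          # BatchNorm uses batch statistics, Dropout active
out_train = model_with_bn(batch)
model_with_bn.eval()           # BatchNorm uses running stats, Dropout disabled
out_eval = model_with_bn(batch)  # deterministic output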
# LAYER NORM vs BATCH NORM:
# BatchNorm: normalizes ACROSS BATCH — great for CNNs, MLPs
# LayerNorm: normalizes ACROSS FEATURES — used in Transformers/LLMs
# Because LLMs process variable-length sequences, BatchNorm doesn't work well
layer_norm = nn.LayerNorm(512) # used inside every Transformer block
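# Quick demo (shapes are illustrative): LayerNorm normalizes each position's
# 512 features independently, so batch size and sequence length don't matter
tokens = torch.randn(2, 10, 512)   # (batch, seq_len, hidden_dim)
print(layer_norm(tokens).shape)    # torch.Size([2, 10, 512])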
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# 3. WEIGHT DECAY (L2 Regularization)
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# Adds penalty: loss = cross_entropy + lambda * sum(W^2)
# Effect: pushes weights towards zero → prevents any single weight from dominating
# Set via optimizer: weight_decay=0.01 (AdamW)
import torch.optim as optim
opt = optim.AdamW(model_with_bn.parameters(), lr=1e-3, weight_decay=0.01)
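# The same penalty written out by hand (random demo data, illustrative only):
inputs, targets = torch.randn(16, 128), torch.randint(0, 10, (16,))
lam = 0.01                                    # plays the role of weight_decay
l2_penalty = sum((p ** 2).sum() for p in model_with_bn.parameters())
loss = nn.functional.cross_entropy(model_with_bn(inputs), targets) + lam * l2_penalty
# Note: AdamW applies the decay directly in the parameter update (decoupled),
# which is why it is set on the optimizer rather than added to the loss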
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# 4. EARLY STOPPING — stop when validation loss stops improving
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
class EarlyStopping:
    def __init__(self, patience: int = 5, min_delta: float = 1e-4):
        self.patience = patience
        self.min_delta = min_delta
        self.best_loss = float("inf")
        self.counter = 0

    def __call__(self, val_loss: float) -> bool:
        """Returns True if training should stop."""
        if val_loss < self.best_loss - self.min_delta:
            self.best_loss = val_loss
            self.counter = 0
        else:
            self.counter += 1
        return self.counter >= self.patience
early_stop = EarlyStopping(patience=5)
# In training loop: if early_stop(val_loss): break
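# Sketch of usage (train_one_epoch / evaluate are hypothetical helpers):
# for epoch in range(100):
#     train_one_epoch(model_with_bn, opt)
#     val_loss = evaluate(model_with_bn)
#     if early_stop(val_loss):
#         print(f"Early stopping at epoch {epoch}")
#         break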
Tip
Practice these regularization techniques in small, isolated examples before integrating them into larger projects. Breaking concepts into small experiments builds genuine understanding faster than reading alone.
Practice Task
Note
(1) Write a working example of one of these regularization techniques from scratch without looking at notes. (2) Modify it to handle an edge case (empty input, null value, or error state). (3) Share your solution in the Priygop community for feedback.
Common Mistake
Warning
A common mistake with regularization code is skipping edge-case testing: empty inputs, null values, and unexpected data types. Always validate boundary conditions (and remember to call model.eval() at inference) to write robust, production-ready AI code.