Backpropagation — How Networks Learn
Backpropagation computes the gradient of the loss with respect to every weight in the network using the chain rule of calculus. PyTorch's autograd does this automatically — but understanding it deeply helps you debug exploding/vanishing gradients and design better architectures.
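To make the chain rule concrete before handing it to autograd, here is a minimal hand-worked sketch for a one-weight model (tensor names are illustrative):
import torch
# loss = (w*x - t)^2, so by the chain rule: dloss/dw = 2*(w*x - t) * x
w = torch.tensor(2.0, requires_grad=True)
x, t = torch.tensor(3.0), torch.tensor(5.0)
loss = (w * x - t) ** 2
loss.backward()                               # autograd applies the same rule
manual = 2 * (w.item() * x - t) * x           # chain rule by hand
print(w.grad.item(), manual.item())           # both 6.0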
Backpropagation with PyTorch Autograd
import torch
import torch.nn as nn
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# PyTorch AUTOGRAD — automatic differentiation engine
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# PyTorch builds a computation graph dynamically (define-by-run)
# Every operation on a requires_grad=True tensor is recorded
# .backward() computes all gradients via chain rule automatically
# Simple example: y = x^2, dy/dx = 2x
x = torch.tensor(3.0, requires_grad=True)
y = x ** 2 # y = 9.0, graph: y → x**2 → x
y.backward() # compute dy/dx analytically via chain rule
print(f"x = {x.item()}, y = x^2 = {y.item()}, dy/dx = {x.grad.item()}")
# dy/dx = 2x = 2*3 = 6 ✅
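# (Added sketch) composing two ops shows the chain rule chaining:
# z = sin(x^2), so dz/dx = cos(x^2) * 2x
x2 = torch.tensor(3.0, requires_grad=True)    # fresh tensor, illustrative name
z = torch.sin(x2 ** 2)
z.backward()
manual = (torch.cos(x2.detach() ** 2) * 2 * x2.detach()).item()
print(f"autograd: {x2.grad.item():.4f}, manual chain rule: {manual:.4f}")  # match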
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# Full training step — what happens every iteration
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
model = nn.Linear(10, 1) # y = Wx + b
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.MSELoss()
X = torch.randn(32, 10) # batch of 32 samples, 10 features each
y = torch.randn(32, 1) # target values
# ── STEP 1: FORWARD PASS ──────────────────────────────
predictions = model(X) # compute predictions
loss = loss_fn(predictions, y) # compute scalar loss
# ── STEP 2: BACKWARD PASS ────────────────────────────
optimizer.zero_grad() # !!! CRITICAL: clear old gradients (PyTorch accumulates by default)
loss.backward() # compute gradient of loss w.r.t. ALL parameters (chain rule)
# Inspect gradients
print(f"\nLoss: {loss.item():.4f}")
print(f"Weight gradient shape: {model.weight.grad.shape}") # [1, 10]
print(f"Bias gradient: {model.bias.grad.item():.4f}")
# ── STEP 3: OPTIMIZER STEP ───────────────────────────
optimizer.step() # update parameters: W = W - lr * grad_W
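# (Sketch) the three steps repeat every iteration; loss should trend down:
for step in range(5):
    optimizer.zero_grad()
    loss = loss_fn(model(X), y)
    loss.backward()
    optimizer.step()
    print(f"step {step}: loss = {loss.item():.4f}")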
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# COMMON BUGS with backward()
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# BUG 1: Forgetting zero_grad() → gradients accumulate → wrong updates
# optimizer.zero_grad() ← ALWAYS call BEFORE loss.backward()
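# Demonstration (sketch): a second backward() without zero_grad() adds on:
w_demo = torch.tensor(1.0, requires_grad=True)   # illustrative name
(w_demo * 3).backward()
(w_demo * 3).backward()                          # no zero_grad in between
print(w_demo.grad.item())                        # 6.0, not 3.0 (accumulated!)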
# BUG 2: Calling backward() on non-scalar
# loss = model(X) # shape [32, 1] — NOT scalar
# loss.backward() # ERROR: grad must be specified for non-scalar outputs
# FIX: loss = loss_fn(predictions, y) ← reduce to scalar first
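# Two other fixes (sketch): reduce yourself, or pass an explicit gradient:
#   out = model(X); out.sum().backward()                   # scalar reduction
#   out = model(X); out.backward(torch.ones_like(out))     # vector-Jacobian product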
# BUG 3: Detaching a tensor that gradients must flow through
# target = some_tensor.detach() # grad won't flow through detached tensors
# loss = (predictions - target).pow(2).mean() # OK here — only predictions needs grad
# (detaching predictions instead would silently stop the model from learning)
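# Where detach() IS a bug (sketch, hypothetical names): detaching the branch
# you train stops learning silently; no error is raised, grads just stay None:
#   hidden = encoder(X).detach()               # gradient never reaches encoder
#   decoder(hidden).mean().backward()          # encoder params get no grad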
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# DETECTING GRADIENT ISSUES
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
def check_gradients(model: nn.Module) -> None:
    """Log gradient norms per layer — find exploding/vanishing gradients."""
    for name, param in model.named_parameters():
        if param.grad is not None:
            grad_norm = param.grad.norm().item()
            if grad_norm > 10:
                print(f"⚠️ EXPLODING GRADIENT: {name}: {grad_norm:.2f}")
            elif grad_norm < 1e-6:
                print(f"⚠️ VANISHING GRADIENT: {name}: {grad_norm:.8f}")
            else:
                print(f"  OK: {name}: norm={grad_norm:.4f}")
Tip
Practice backpropagation in small, isolated examples before integrating it into larger projects. Breaking concepts into small experiments builds genuine understanding faster than reading alone.
Input → Hidden layers → Output. Train via backpropagation.
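A minimal sketch of that pipeline: a small MLP with two hidden layers, trained by the same three steps (layer sizes and names are illustrative):
mlp = nn.Sequential(                  # input → hidden layers → output
    nn.Linear(10, 32), nn.ReLU(),
    nn.Linear(32, 32), nn.ReLU(),
    nn.Linear(32, 1),
)
opt = torch.optim.Adam(mlp.parameters(), lr=1e-3)
for epoch in range(100):
    opt.zero_grad()
    loss = nn.functional.mse_loss(mlp(X), y)   # X, y from the batch above
    loss.backward()                            # backprop through every layer
    opt.step()
print(f"final loss: {loss.item():.4f}")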
Practice Task
(1) Write a working backpropagation example from scratch without looking at notes. (2) Modify it to handle an edge case (empty input, null value, or error state). (3) Share your solution in the Priygop community for feedback.
Common Mistake
A common mistake with backpropagation is skipping edge-case testing — empty inputs, null values, and unexpected data types. Always validate boundary conditions to write robust, production-ready AI code.
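For instance, a small validation sketch (the helper name is hypothetical) that rejects empty batches and NaNs before the forward pass:
def validate_batch(X: torch.Tensor) -> None:
    """Reject malformed batches before they reach the model."""
    assert isinstance(X, torch.Tensor), f"expected Tensor, got {type(X)}"
    assert X.ndim == 2, f"expected 2-D batch, got {X.ndim}-D"
    assert X.shape[0] > 0, "empty batch"
    assert not torch.isnan(X).any(), "NaN values in input"

validate_batch(X)   # raises early instead of producing silent NaN losses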