GPU Acceleration & Mixed Precision Training
Training on GPU can be 10-100x faster than CPU for neural networks, because GPUs have thousands of small cores optimized for parallel matrix operations. Automatic Mixed Precision (AMP) additionally gives 2x speedup and 50% memory savings by using float16 for forward pass and float32 for stability.
GPU Training & Mixed Precision
import torch
import torch.nn as nn
import torch.optim as optim
from torch.cuda.amp import autocast, GradScaler
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# DEVICE SELECTION — works on CUDA, MPS (Mac M1/M2), or CPU
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
    print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
    device = torch.device("mps")  # Apple Silicon
    print("Using Apple Silicon MPS")
else:
    device = torch.device("cpu")
    print("Using CPU — training will be slower")

# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# AUTOMATIC MIXED PRECISION (AMP)
# FP32 (32-bit float): used for weight updates — numerically stable
# FP16 (16-bit float): used for forward pass — 2x faster, 2x less VRAM
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
model = nn.Sequential(nn.Linear(512, 256), nn.ReLU(), nn.Linear(256, 10)).to(device)
optimizer = optim.AdamW(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()
# Loss scaling prevents fp16 gradient underflow. Enable the scaler only when
# CUDA is actually in use: on CPU/MPS, GradScaler() would just emit a warning
# and disable itself — being explicit keeps logs clean and intent obvious.
scaler = GradScaler(enabled=torch.cuda.is_available())
def train_step_amp(X: torch.Tensor, y: torch.Tensor) -> float:
    """Run one training step with Automatic Mixed Precision.

    Uses the module-level ``model``, ``optimizer``, ``loss_fn``, ``scaler``
    and ``device``.

    Args:
        X: input batch (moved to ``device`` internally).
        y: class-index targets for ``CrossEntropyLoss``.

    Returns:
        The (unscaled) loss value as a Python float.
    """
    X, y = X.to(device), y.to(device)
    optimizer.zero_grad()
    # Device-aware autocast: the CUDA-only torch.cuda.amp.autocast() warns and
    # disables itself on CPU/MPS. torch.autocast with an explicit enable flag
    # has the same effective behavior (fp16 on CUDA, off elsewhere) cleanly.
    amp_device = "cuda" if device.type == "cuda" else "cpu"
    with torch.autocast(device_type=amp_device, enabled=device.type == "cuda"):
        logits = model(X)          # matmuls/convs run in fp16 under autocast
        loss = loss_fn(logits, y)  # loss kept in fp32 for numerical stability
    # Backward on the scaled loss to prevent fp16 gradient underflow.
    scaler.scale(loss).backward()
    scaler.unscale_(optimizer)     # unscale gradients BEFORE clipping
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    scaler.step(optimizer)         # skips the update if grads are NaN/Inf
    scaler.update()                # adapt the scale factor for the next step
    return loss.item()
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# GPU MEMORY MANAGEMENT
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
def gpu_info() -> None:
    """Print currently allocated and reserved CUDA memory, in gigabytes.

    Does nothing on machines without a CUDA device.
    """
    if not torch.cuda.is_available():
        return
    allocated = torch.cuda.memory_allocated() / 1e9
    reserved = torch.cuda.memory_reserved() / 1e9
    print(f"GPU Memory — Allocated: {allocated:.2f} GB | Reserved: {reserved:.2f} GB")
# Free GPU cache between experiments — returns cached allocator blocks to the
# driver (a no-op when CUDA was never initialized).
torch.cuda.empty_cache()
# Gradient checkpointing — trade compute for memory:
# recomputes activations during backward instead of storing them.
# Reduces activation memory by ~sqrt(n_layers), costs ~33% more compute.
# from torch.utils.checkpoint import checkpoint
# x = checkpoint(layer, x) # inside forward()
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# OUT-OF-MEMORY ERROR DEBUGGING
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
def find_max_batch_size(model, input_shape, start=256, device=None):
    """Halve the batch size downward from *start* until a forward pass fits.

    Note: this is a geometric halving search, not a true binary search — it
    returns the first power-of-two fraction of *start* that does not OOM.

    Args:
        model: module to probe with a dummy forward pass.
        input_shape: per-sample shape, e.g. ``(3, 224, 224)``.
        start: initial (largest) batch size to try.
        device: device for the dummy batch; defaults to the device of the
            model's parameters.

    Returns:
        The largest batch size that fit, or 1 if even a single sample OOMs.
    """
    if device is None:
        device = next(model.parameters()).device
    batch_size = start
    while batch_size > 0:
        try:
            x = torch.randn(batch_size, *input_shape, device=device)
            _ = model(x)
            print(f"✅ Batch size {batch_size} fits in memory")
            return batch_size
        except RuntimeError as e:
            if "out of memory" not in str(e):
                raise
            torch.cuda.empty_cache()
            # Report BEFORE halving — the original printed after, which showed
            # "OOM at 0, trying 0" once batch_size reached 1.
            next_size = batch_size // 2
            print(f"↓ OOM at {batch_size}, trying {next_size}")
            batch_size = next_size
    return 1
Tip
Practice GPU acceleration and mixed-precision training in small, isolated examples before integrating them into larger projects. Breaking concepts into small experiments builds genuine understanding faster than reading alone.
Technical diagram.
Practice Task
Note
Practice Task — (1) Write a working example of GPU acceleration with mixed-precision training from scratch without looking at notes. (2) Modify it to handle an edge case (empty input, null value, or error state). (3) Share your solution in the Priygop community for feedback.
Quick Quiz
Common Mistake
Warning
A common mistake with GPU acceleration and mixed-precision training is skipping edge case testing — empty inputs, null values, and unexpected data types. Always validate boundary conditions to write robust, production-ready AI code.