GPU Acceleration & Mixed Precision Training
Training on GPU can be 10-100x faster than CPU for neural networks, because GPUs have thousands of small cores optimized for parallel matrix operations. Automatic Mixed Precision (AMP) additionally gives 2x speedup and 50% memory savings by using float16 for forward pass and float32 for stability.
GPU Training & Mixed Precision
import torch
import torch.nn as nn
import torch.optim as optim
from torch.cuda.amp import autocast, GradScaler
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# DEVICE SELECTION — works on CUDA, MPS (Mac M1/M2), or CPU
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
    print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
    device = torch.device("mps")  # Apple Silicon
    print("Using Apple Silicon MPS")
else:
    device = torch.device("cpu")
    print("Using CPU — training will be slower")

# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# AUTOMATIC MIXED PRECISION (AMP)
# FP32 (32-bit float): used for weight updates — numerically stable
# FP16 (16-bit float): used for forward pass — 2x faster, 2x less VRAM
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
model = nn.Sequential(nn.Linear(512, 256), nn.ReLU(), nn.Linear(256, 10)).to(device)
optimizer = optim.AdamW(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()
# Loss scaling prevents fp16 gradient underflow. Enable the scaler only when
# CUDA is actually in use: on CPU/MPS, GradScaler() would just emit a warning
# and disable itself — being explicit keeps logs clean and intent obvious.
scaler = GradScaler(enabled=torch.cuda.is_available())
def train_step_amp(X: torch.Tensor, y: torch.Tensor) -> float:
    """Run one training step with Automatic Mixed Precision.

    Uses the module-level ``model``, ``optimizer``, ``loss_fn``, ``scaler``
    and ``device``.

    Args:
        X: input batch (moved to ``device`` internally).
        y: class-index targets for ``CrossEntropyLoss``.

    Returns:
        The (unscaled) loss value as a Python float.
    """
    X, y = X.to(device), y.to(device)
    optimizer.zero_grad()
    # Device-aware autocast: the CUDA-only torch.cuda.amp.autocast() warns and
    # disables itself on CPU/MPS. torch.autocast with an explicit enable flag
    # has the same effective behavior (fp16 on CUDA, off elsewhere) cleanly.
    amp_device = "cuda" if device.type == "cuda" else "cpu"
    with torch.autocast(device_type=amp_device, enabled=device.type == "cuda"):
        logits = model(X)          # matmuls/convs run in fp16 under autocast
        loss = loss_fn(logits, y)  # loss kept in fp32 for numerical stability
    # Backward on the scaled loss to prevent fp16 gradient underflow.
    scaler.scale(loss).backward()
    scaler.unscale_(optimizer)     # unscale gradients BEFORE clipping
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    scaler.step(optimizer)         # skips the update if grads are NaN/Inf
    scaler.update()                # adapt the scale factor for the next step
    return loss.item()
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# GPU MEMORY MANAGEMENT
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
def gpu_info() -> None:
    """Print currently allocated and reserved CUDA memory, in gigabytes.

    Does nothing on machines without a CUDA device.
    """
    if not torch.cuda.is_available():
        return
    allocated = torch.cuda.memory_allocated() / 1e9
    reserved = torch.cuda.memory_reserved() / 1e9
    print(f"GPU Memory — Allocated: {allocated:.2f} GB | Reserved: {reserved:.2f} GB")
# Free GPU cache between experiments — returns cached allocator blocks to the
# driver (a no-op when CUDA was never initialized).
torch.cuda.empty_cache()
# Gradient checkpointing — trade compute for memory:
# recomputes activations during backward instead of storing them.
# Reduces activation memory by ~sqrt(n_layers), costs ~33% more compute.
# from torch.utils.checkpoint import checkpoint
# x = checkpoint(layer, x) # inside forward()
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# OUT-OF-MEMORY ERROR DEBUGGING
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
def find_max_batch_size(model, input_shape, start=256, device=None):
    """Halve the batch size downward from *start* until a forward pass fits.

    Note: this is a geometric halving search, not a true binary search — it
    returns the first power-of-two fraction of *start* that does not OOM.

    Args:
        model: module to probe with a dummy forward pass.
        input_shape: per-sample shape, e.g. ``(3, 224, 224)``.
        start: initial (largest) batch size to try.
        device: device for the dummy batch; defaults to the device of the
            model's parameters.

    Returns:
        The largest batch size that fit, or 1 if even a single sample OOMs.
    """
    if device is None:
        device = next(model.parameters()).device
    batch_size = start
    while batch_size > 0:
        try:
            x = torch.randn(batch_size, *input_shape, device=device)
            _ = model(x)
            print(f"✅ Batch size {batch_size} fits in memory")
            return batch_size
        except RuntimeError as e:
            if "out of memory" not in str(e):
                raise
            torch.cuda.empty_cache()
            # Report BEFORE halving — the original printed after, which showed
            # "OOM at 0, trying 0" once batch_size reached 1.
            next_size = batch_size // 2
            print(f"↓ OOM at {batch_size}, trying {next_size}")
            batch_size = next_size
    return 1
Tip
Practice GPU acceleration and mixed-precision training in small, isolated examples before integrating them into larger projects. Breaking concepts into small experiments builds genuine understanding faster than reading alone.
Technical diagram.
Practice Task
Note
Practice Task — (1) Write a working example of GPU acceleration with mixed-precision training from scratch without looking at notes. (2) Modify it to handle an edge case (empty input, null value, or error state). (3) Share your solution in the Priygop community for feedback.
Quick Quiz
Common Mistake
Warning
A common mistake with GPU acceleration and mixed-precision training is skipping edge case testing — empty inputs, null values, and unexpected data types. Always validate boundary conditions to write robust, production-ready AI code.