Dataset & DataLoader — Efficient Data Pipelines
A slow data pipeline is often the bottleneck in deep learning training. DataLoader provides batching, shuffling, parallel loading (num_workers), and memory pinning, while a custom Dataset lets you handle ANY data format, including CSV files, images, text, or API responses.
Custom Dataset and Optimized DataLoader
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms
from PIL import Image
import pandas as pd
from pathlib import Path
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# CUSTOM DATASET — two required methods: __len__ and __getitem__
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
class ImageClassificationDataset(Dataset):
    """
    CSV format: image_path, label
        data/img001.jpg,0
        data/img002.jpg,1
    """
    def __init__(self, csv_path: str, img_dir: str, transform=None):
        self.df = pd.read_csv(csv_path, header=None, names=["path", "label"])
        self.img_dir = Path(img_dir)
        self.transform = transform

    def __len__(self) -> int:
        """Required: number of samples in the dataset."""
        return len(self.df)

    def __getitem__(self, idx: int) -> tuple[torch.Tensor, int]:
        """Required: return one sample by index."""
        row = self.df.iloc[idx]
        img = Image.open(self.img_dir / row["path"]).convert("RGB")
        if self.transform:
            img = self.transform(img)
        label = int(row["label"])
        return img, label
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# IMAGE TRANSFORMS — preprocessing + augmentation
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
train_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.RandomCrop(224),          # data augmentation
    transforms.RandomHorizontalFlip(),   # augmentation: mirror image
    transforms.ColorJitter(brightness=0.2, contrast=0.2),  # augmentation
    transforms.ToTensor(),               # PIL [0,255] → tensor [0,1]
    transforms.Normalize(                # ImageNet statistics
        mean=[0.485, 0.456, 0.406],      # pre-computed over 1.2M images
        std=[0.229, 0.224, 0.225],
    ),
])

val_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),          # NO augmentation at validation time
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
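To connect the pieces, here is a minimal usage sketch; the CSV filename and image directory are hypothetical placeholders matching the docstring above:

train_images = ImageClassificationDataset(
    csv_path="train_labels.csv",   # hypothetical CSV of (path, label) rows
    img_dir="data/",               # hypothetical image directory
    transform=train_transform,
)
img, label = train_images[0]       # __getitem__ applies the transform
print(img.shape)                   # torch.Size([3, 224, 224])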
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# OPTIMIZED DATALOADER CONFIGURATION
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# Simulated dataset for demonstration
from torch.utils.data import TensorDataset
dummy_X = torch.randn(1000, 3, 224, 224)
dummy_y = torch.randint(0, 10, (1000,))
full_dataset = TensorDataset(dummy_X, dummy_y)
# Split: 80% train, 10% val, 10% test
train_size = int(0.8 * len(full_dataset))
val_size = int(0.1 * len(full_dataset))
test_size = len(full_dataset) - train_size - val_size
train_ds, val_ds, test_ds = random_split(full_dataset, [train_size, val_size, test_size])
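Note that random_split draws from the global RNG by default, so the split changes from run to run. For a reproducible split, the same call accepts a seeded generator (the seed value 42 is arbitrary):

g = torch.Generator().manual_seed(42)  # any fixed seed works
train_ds, val_ds, test_ds = random_split(
    full_dataset, [train_size, val_size, test_size], generator=g
)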
train_loader = DataLoader(
    train_ds,
    batch_size=64,
    shuffle=True,              # reshuffle every epoch — critical for training
    num_workers=4,             # parallel CPU loading processes (tune to your CPU count)
    pin_memory=True,           # page-locked (pinned) host memory → faster GPU transfer
    persistent_workers=True,   # keep workers alive between epochs
    prefetch_factor=2,         # each worker prefetches 2 batches ahead
)
val_loader = DataLoader(
    val_ds,
    batch_size=128,
    shuffle=False,   # NEVER shuffle validation data
    num_workers=4,
    pin_memory=True,
)
# Verify shapes
for X_batch, y_batch in train_loader:
    print(f"Batch X shape: {X_batch.shape}")  # [64, 3, 224, 224]
    print(f"Batch y shape: {y_batch.shape}")  # [64]
    break
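pin_memory only pays off when the host-to-GPU copy is issued with non_blocking=True, which lets the transfer overlap with computation. A minimal sketch of the matching transfer inside a training loop (the forward/backward pass is omitted):

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
for X_batch, y_batch in train_loader:
    # async copy from pinned host memory; overlaps with GPU compute
    X_batch = X_batch.to(device, non_blocking=True)
    y_batch = y_batch.to(device, non_blocking=True)
    # ... forward pass, loss, backward pass here ...
    break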
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# COLLATE FUNCTION — custom batch assembly
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
from torch.nn.utils.rnn import pad_sequence
def text_collate_fn(batch: list[tuple]) -> tuple:
    """Pad variable-length text sequences to the same length within a batch."""
    texts, labels = zip(*batch)
    padded = pad_sequence(texts, batch_first=True, padding_value=0)
    labels = torch.stack(labels)
    return padded, labels
# Used for NLP datasets where each sequence has different length
# text_loader = DataLoader(text_dataset, batch_size=32, collate_fn=text_collate_fn)
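As a self-contained illustration, here is a toy dataset that exercises the collate function above; the token IDs and labels are random placeholders:

import random

class ToyTextDataset(Dataset):
    """Each sample: a variable-length 1-D tensor of token IDs plus a label."""
    def __init__(self, n: int = 100):
        self.seqs = [torch.randint(1, 1000, (random.randint(5, 50),)) for _ in range(n)]
        self.labels = [torch.tensor(i % 2) for i in range(n)]
    def __len__(self) -> int:
        return len(self.seqs)
    def __getitem__(self, idx: int):
        return self.seqs[idx], self.labels[idx]

text_loader = DataLoader(ToyTextDataset(), batch_size=32, collate_fn=text_collate_fn)
texts, labels = next(iter(text_loader))
print(texts.shape)   # [32, longest_sequence_in_this_batch]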
Tip
Practice Dataset and DataLoader pipelines in small, isolated examples before integrating them into larger projects. Breaking concepts into small experiments builds genuine understanding faster than reading alone.
Practice Task
(1) Write a working Dataset and DataLoader pipeline from scratch without looking at your notes. (2) Modify it to handle an edge case (empty input, a null value, or an error state). (3) Share your solution in the Priygop community for feedback.
Common Mistake
A common mistake with Dataset and DataLoader pipelines is skipping edge-case testing — empty inputs, null values, and unexpected data types. Always validate boundary conditions to write robust, production-ready AI code.
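A minimal sketch of such a sanity check, run once before training starts (the function name and messages are illustrative, not a library API):

def validate_dataset(ds) -> None:
    """Illustrative boundary checks for any map-style dataset."""
    if len(ds) == 0:
        raise ValueError("dataset is empty: check CSV contents and split sizes")
    x, y = ds[0]   # fail fast on a corrupt first sample
    if torch.isnan(x).any():
        raise ValueError("sample 0 contains NaN values")
    print(f"OK: {len(ds)} samples, sample shape {tuple(x.shape)}, label {y}")

validate_dataset(train_ds)   # works on the datasets defined above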