Image Segmentation — Pixel-Level Understanding
Segmentation classifies every pixel of an image rather than the image as a whole. Semantic segmentation assigns a class label to each pixel but does not distinguish between instances of the same class; instance segmentation additionally separates individual objects, so two overlapping cars get two distinct masks. Pixel-level understanding is critical for autonomous driving, medical imaging, and robotics.
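As a quick illustration of the two label formats (a minimal NumPy sketch; the shapes and class ids are illustrative, not from any dataset):
import numpy as np
# Semantic segmentation: one class id per pixel -> a single [H, W] map
semantic = np.zeros((4, 6), dtype=np.int64)
semantic[1:3, 1:3] = 7  # all "car" pixels share class 7; instances are merged
semantic[1:3, 4:6] = 7
# Instance segmentation: one boolean mask per object -> an [N, H, W] stack
car_a = np.zeros((4, 6), dtype=bool); car_a[1:3, 1:3] = True
car_b = np.zeros((4, 6), dtype=bool); car_b[1:3, 4:6] = True
instances = np.stack([car_a, car_b])  # two cars, two separate masks
print(semantic.shape, instances.shape)  # (4, 6) (2, 4, 6)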
Segmentation with Segment Anything Model (SAM)
import torch
from segment_anything import SamPredictor, sam_model_registry
import numpy as np
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# SAM — Meta's Segment Anything Model (2023)
# Zero-shot segmentation with point/box/mask prompts
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# Load SAM model (download sam_vit_h_4b8939.pth ~2.5GB)
sam = sam_model_registry["vit_h"](checkpoint="sam_vit_h_4b8939.pth")
device = "cuda" if torch.cuda.is_available() else "cpu"
sam = sam.to(device)
predictor = SamPredictor(sam)
# Segment with a point prompt
image = np.random.randint(0, 255, (720, 1280, 3), dtype=np.uint8) # example image
predictor.set_image(image)
# Click a point: [x, y] in pixel coordinates
input_point = np.array([[640, 360]]) # center of image
input_label = np.array([1]) # 1 = foreground, 0 = background
masks, scores, logits = predictor.predict(
point_coords=input_point,
point_labels=input_label,
    multimask_output=True,  # return 3 masks at different granularities (subpart, part, whole)
)
print(f"Generated {len(masks)} masks")
for i, (mask, score) in enumerate(zip(masks, scores)):
print(f" Mask {i}: score={score:.3f}, pixels={mask.sum():,}")
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# Semantic Segmentation — U-Net Architecture
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
class UNet(torch.nn.Module):
"""U-Net: encoder (downsampling) + decoder (upsampling) with skip connections."""
def __init__(self, in_channels: int = 3, n_classes: int = 21):
super().__init__()
# Encoder (backbone)
self.enc1 = self._block(in_channels, 64)
self.enc2 = self._block(64, 128)
self.enc3 = self._block(128, 256)
self.pool = torch.nn.MaxPool2d(2)
# Bottleneck
self.bottleneck = self._block(256, 512)
# Decoder (upsampling with skip connections)
self.up3 = torch.nn.ConvTranspose2d(512, 256, 2, stride=2) # upsample 2x
self.dec3 = self._block(512, 256) # 256 + 256 skip = 512 input
self.up2 = torch.nn.ConvTranspose2d(256, 128, 2, stride=2)
self.dec2 = self._block(256, 128)
self.up1 = torch.nn.ConvTranspose2d(128, 64, 2, stride=2)
self.dec1 = self._block(128, 64)
self.final = torch.nn.Conv2d(64, n_classes, 1)
def _block(self, in_c: int, out_c: int) -> torch.nn.Sequential:
return torch.nn.Sequential(
torch.nn.Conv2d(in_c, out_c, 3, padding=1, bias=False),
torch.nn.BatchNorm2d(out_c),
torch.nn.ReLU(inplace=True),
torch.nn.Conv2d(out_c, out_c, 3, padding=1, bias=False),
torch.nn.BatchNorm2d(out_c),
torch.nn.ReLU(inplace=True),
)
def forward(self, x: torch.Tensor) -> torch.Tensor:
e1 = self.enc1(x) # [B, 64, H, W]
e2 = self.enc2(self.pool(e1)) # [B, 128, H/2, W/2]
e3 = self.enc3(self.pool(e2)) # [B, 256, H/4, W/4]
b = self.bottleneck(self.pool(e3)) # [B, 512, H/8, W/8]
d3 = self.dec3(torch.cat([self.up3(b), e3], dim=1)) # skip!
d2 = self.dec2(torch.cat([self.up2(d3), e2], dim=1))
d1 = self.dec1(torch.cat([self.up1(d2), e1], dim=1))
return self.final(d1) # [B, n_classes, H, W]
unet = UNet(in_channels=3, n_classes=21) # 21 Pascal VOC classes
x = torch.randn(2, 3, 256, 256)
print(f"Segmentation output: {unet(x).shape}") # [2, 21, 256, 256]Tip
Tip
Practice image segmentation in small, isolated examples before integrating it into larger projects. Breaking concepts into small experiments builds genuine understanding faster than reading alone.
Practice Task
(1) Write a working example of image segmentation from scratch without looking at your notes. (2) Modify it to handle an edge case (an empty input, a null value, or an error state). (3) Share your solution in the Priygop community for feedback.
Common Mistake
A common mistake with image segmentation is skipping edge-case testing: empty inputs, null values, and unexpected data types. Always validate boundary conditions to write robust, production-ready AI code.
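A small input guard along these lines (a hypothetical helper, not part of the SAM API) catches bad inputs before they reach the model:
import numpy as np
def validate_image(image: np.ndarray) -> np.ndarray:
    """Reject inputs the segmentation model cannot handle (hypothetical helper)."""
    if image is None or image.size == 0:
        raise ValueError("empty image")
    if image.ndim != 3 or image.shape[2] != 3:
        raise ValueError(f"expected an HxWx3 RGB array, got shape {image.shape}")
    if image.dtype != np.uint8:
        raise TypeError(f"expected uint8 pixels, got {image.dtype}")
    return image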