Image Segmentation — Pixel-Level Understanding
Segmentation classifies every pixel of an image rather than the image as a whole. Semantic segmentation assigns a class label to each pixel but does not distinguish between instances of the same class; instance segmentation additionally separates individual objects, so two overlapping cars get two distinct masks. Pixel-level understanding is critical for autonomous driving, medical imaging, and robotics.
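As a quick illustration of the two label formats (a minimal NumPy sketch; the shapes and class ids are illustrative, not from any dataset):
import numpy as np
# Semantic segmentation: one class id per pixel -> a single [H, W] map
semantic = np.zeros((4, 6), dtype=np.int64)
semantic[1:3, 1:3] = 7  # all "car" pixels share class 7; instances are merged
semantic[1:3, 4:6] = 7
# Instance segmentation: one boolean mask per object -> an [N, H, W] stack
car_a = np.zeros((4, 6), dtype=bool); car_a[1:3, 1:3] = True
car_b = np.zeros((4, 6), dtype=bool); car_b[1:3, 4:6] = True
instances = np.stack([car_a, car_b])  # two cars, two separate masks
print(semantic.shape, instances.shape)  # (4, 6) (2, 4, 6)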
Segmentation with Segment Anything Model (SAM)
import torch
from segment_anything import SamPredictor, sam_model_registry
import numpy as np
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# SAM — Meta's Segment Anything Model (2023)
# Zero-shot segmentation with point/box/mask prompts
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# Load SAM model (download sam_vit_h_4b8939.pth ~2.5GB)
sam = sam_model_registry["vit_h"](checkpoint="sam_vit_h_4b8939.pth")
device = "cuda" if torch.cuda.is_available() else "cpu"
sam = sam.to(device)
predictor = SamPredictor(sam)
# Segment with a point prompt
image = np.random.randint(0, 255, (720, 1280, 3), dtype=np.uint8) # example image
predictor.set_image(image)
# Click a point: [x, y] in pixel coordinates
input_point = np.array([[640, 360]]) # center of image
input_label = np.array([1]) # 1 = foreground, 0 = background
masks, scores, logits = predictor.predict(
point_coords=input_point,
point_labels=input_label,
    multimask_output=True,  # return 3 masks at different granularities (subpart, part, whole)
)
print(f"Generated {len(masks)} masks")
for i, (mask, score) in enumerate(zip(masks, scores)):
print(f" Mask {i}: score={score:.3f}, pixels={mask.sum():,}")
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# Semantic Segmentation — U-Net Architecture
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
class UNet(torch.nn.Module):
"""U-Net: encoder (downsampling) + decoder (upsampling) with skip connections."""
def __init__(self, in_channels: int = 3, n_classes: int = 21):
super().__init__()
# Encoder (backbone)
self.enc1 = self._block(in_channels, 64)
self.enc2 = self._block(64, 128)
self.enc3 = self._block(128, 256)
self.pool = torch.nn.MaxPool2d(2)
# Bottleneck
self.bottleneck = self._block(256, 512)
# Decoder (upsampling with skip connections)
self.up3 = torch.nn.ConvTranspose2d(512, 256, 2, stride=2) # upsample 2x
self.dec3 = self._block(512, 256) # 256 + 256 skip = 512 input
self.up2 = torch.nn.ConvTranspose2d(256, 128, 2, stride=2)
self.dec2 = self._block(256, 128)
self.up1 = torch.nn.ConvTranspose2d(128, 64, 2, stride=2)
self.dec1 = self._block(128, 64)
self.final = torch.nn.Conv2d(64, n_classes, 1)
def _block(self, in_c: int, out_c: int) -> torch.nn.Sequential:
return torch.nn.Sequential(
torch.nn.Conv2d(in_c, out_c, 3, padding=1, bias=False),
torch.nn.BatchNorm2d(out_c),
torch.nn.ReLU(inplace=True),
torch.nn.Conv2d(out_c, out_c, 3, padding=1, bias=False),
torch.nn.BatchNorm2d(out_c),
torch.nn.ReLU(inplace=True),
)
def forward(self, x: torch.Tensor) -> torch.Tensor:
e1 = self.enc1(x) # [B, 64, H, W]
e2 = self.enc2(self.pool(e1)) # [B, 128, H/2, W/2]
e3 = self.enc3(self.pool(e2)) # [B, 256, H/4, W/4]
b = self.bottleneck(self.pool(e3)) # [B, 512, H/8, W/8]
d3 = self.dec3(torch.cat([self.up3(b), e3], dim=1)) # skip!
d2 = self.dec2(torch.cat([self.up2(d3), e2], dim=1))
d1 = self.dec1(torch.cat([self.up1(d2), e1], dim=1))
return self.final(d1) # [B, n_classes, H, W]
unet = UNet(in_channels=3, n_classes=21) # 21 Pascal VOC classes
x = torch.randn(2, 3, 256, 256)
print(f"Segmentation output: {unet(x).shape}") # [2, 21, 256, 256]Tip
Tip
Practice image segmentation in small, isolated examples before integrating it into larger projects. Breaking concepts into small experiments builds genuine understanding faster than reading alone.
Practice Task
(1) Write a working example of image segmentation from scratch without looking at your notes. (2) Modify it to handle an edge case (an empty input, a null value, or an error state). (3) Share your solution in the Priygop community for feedback.
Common Mistake
A common mistake with image segmentation is skipping edge-case testing: empty inputs, null values, and unexpected data types. Always validate boundary conditions to write robust, production-ready AI code.
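A small input guard along these lines (a hypothetical helper, not part of the SAM API) catches bad inputs before they reach the model:
import numpy as np
def validate_image(image: np.ndarray) -> np.ndarray:
    """Reject inputs the segmentation model cannot handle (hypothetical helper)."""
    if image is None or image.size == 0:
        raise ValueError("empty image")
    if image.ndim != 3 or image.shape[2] != 3:
        raise ValueError(f"expected an HxWx3 RGB array, got shape {image.shape}")
    if image.dtype != np.uint8:
        raise TypeError(f"expected uint8 pixels, got {image.dtype}")
    return image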