PyTorch Tensors — The Fundamental Data Structure
Tensors are n-dimensional arrays, like NumPy arrays but with GPU support and automatic differentiation. Every input, weight, gradient, and output in a neural network is a tensor. Mastering tensor operations is the most fundamental PyTorch skill.
Tensor Fundamentals & Operations
import torch
import numpy as np
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# TENSOR CREATION
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
a = torch.tensor([1.0, 2.0, 3.0]) # 1D tensor (vector), float32
b = torch.zeros(3, 4) # 3×4 zeros
c = torch.ones(2, 3, 4) # 2×3×4 ones
d = torch.randn(64, 3, 224, 224) # batch of 64 RGB images (ImageNet size)
e = torch.arange(0, 10, 2) # [0, 2, 4, 6, 8]
print(f"Shape: {d.shape}") # torch.Size([64, 3, 224, 224])
print(f"Dtype: {d.dtype}") # torch.float32
print(f"Device: {d.device}") # cpu
# NumPy bridge (zero-copy when on CPU)
arr = np.array([1.0, 2.0, 3.0])
t = torch.from_numpy(arr) # shares memory!
arr[0] = 99.0
print(t) # tensor([99., 2., 3.]) — changed too!
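# The bridge works both ways — .numpy() on a CPU tensor is also zero-copy,
# while torch.tensor() always makes an independent copy:
safe = torch.tensor(arr)              # copies — no shared memory
safe[0] = -1.0
print(arr[0])                         # 99.0 — original unchanged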
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# CRITICAL SHAPE OPERATIONS — used constantly in AI
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
x = torch.randn(4, 3, 32, 32) # batch=4, channels=3, h=32, w=32
print(x.shape) # [4, 3, 32, 32]
print(x.reshape(4, -1).shape) # [4, 3072] — flatten spatial dims
print(x.permute(0, 2, 3, 1).shape) # [4, 32, 32, 3] — channels last (for PIL/matplotlib)
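# Gotcha: permute returns a non-contiguous view, so .view() on it fails.
# Call .contiguous() first (or use .reshape(), which handles both cases):
x_nhwc = x.permute(0, 2, 3, 1)
print(x_nhwc.is_contiguous())         # False
flat = x_nhwc.contiguous().view(4, -1) # [4, 3072]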
# squeeze/unsqueeze — add or remove dimensions
logit = torch.randn(4) # [4]
row = logit.unsqueeze(0)              # [1, 4] — add a leading batch dim
col = logit.unsqueeze(-1)             # [4, 1] — add a trailing dim
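# squeeze is the inverse — it removes size-1 dims:
print(col.squeeze(-1).shape)          # [4] — back to the original vector
print(torch.randn(1, 3, 1).squeeze().shape) # [3] — bare squeeze() drops every size-1 dim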
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# BROADCASTING — automatic shape alignment
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
a = torch.randn(64, 512) # batch of embeddings
mean = torch.randn(512) # per-feature mean
result = a - mean # broadcasts: mean expands to [64, 512]
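# Same pattern with keepdim — normalize features using batch statistics:
mu = a.mean(dim=0, keepdim=True)      # [1, 512]
std = a.std(dim=0, keepdim=True)      # [1, 512]
a_norm = (a - mu) / std               # both broadcast against [64, 512]
# Rule: shapes align from the right; size-1 dims stretch, missing dims count as size 1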
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# DEVICE MANAGEMENT — CPU ↔ GPU
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move tensor to GPU
x_gpu = x.to(device)
x_gpu = x.cuda() # equivalent shortcut — raises an error if CUDA is unavailable
# Move back to CPU for numpy/matplotlib
x_cpu = x_gpu.detach().cpu().numpy() # detach from the autograd graph, then move
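# Tip: create tensors directly on the target device to skip a CPU→GPU copy
w = torch.randn(512, 512, device=device)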
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# DTYPES — precision matters for memory and speed
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# float32 (default) — standard training precision
# float16 — half precision — 2x memory saving, GPU speedup (use with autocast)
# bfloat16 — better numerical range than float16 — preferred for LLM training
# int8 / int4 — quantized inference — 4-8x memory saving
x_half = x.half() # float32 → float16
x_bf16 = x.to(torch.bfloat16) # float32 → bfloat16
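# Quick footprint check — element_size() is bytes per element:
print(x.element_size() * x.numel())           # 49152 bytes (12288 floats × 4)
print(x_half.element_size() * x_half.numel()) # 24576 bytes — exactly half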
# Automatic Mixed Precision (AMP) — run ops in fp16 where safe, keep fp32 master weights
from torch.cuda.amp import autocast, GradScaler # newer PyTorch: torch.amp.autocast("cuda") / torch.amp.GradScaler("cuda")
scaler = GradScaler()
# In training loop with AMP:
# with autocast():
# output = model(input)
# loss = loss_fn(output, target)
# scaler.scale(loss).backward()
# scaler.step(optimizer)
# scaler.update()
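# A minimal end-to-end sketch (toy Linear model and random data for illustration;
# assumes a CUDA device — autocast/GradScaler fall back to no-ops without one):
model = torch.nn.Linear(512, 10).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
loss_fn = torch.nn.CrossEntropyLoss()
inputs = torch.randn(64, 512, device=device)
targets = torch.randint(0, 10, (64,), device=device)
optimizer.zero_grad()
with autocast():                      # forward pass runs eligible ops in fp16
    output = model(inputs)
    loss = loss_fn(output, targets)
scaler.scale(loss).backward()         # scale loss so fp16 grads don't underflow
scaler.step(optimizer)                # unscales grads; skips the step on inf/NaN
scaler.update()                       # adapts the scale factor for next iteration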
# Result: ~2x faster training, ~50% less VRAM
Tip
Practice tensor operations in small, isolated examples before integrating them into larger projects. Breaking concepts into small experiments builds genuine understanding faster than reading alone.
Practice Task
Note
(1) Write a working tensor example from scratch without looking at notes. (2) Modify it to handle an edge case (an empty tensor, a mismatched shape, or an unexpected dtype). (3) Share your solution in the Priygop community for feedback.
Common Mistake
Warning
A common mistake with tensors is skipping edge-case testing — empty tensors, mismatched shapes, and unexpected dtypes. Always validate boundary conditions to write robust, production-ready AI code.