Positional Encoding — Giving Transformers Order
Attention is permutation-invariant: if you shuffle the input tokens, the attention output is the same, just shuffled. Transformers therefore need positional encoding to tell the model where each token sits in the sequence. The original Transformer used fixed sine/cosine functions; modern models use learned absolute positional embeddings or relative schemes (RoPE, ALiBi) that scale to much longer contexts.
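To see this concretely, here is a minimal sketch (an illustration with a single head and identity Q/K/V projections, not code from the original Transformer): shuffling the input tokens shuffles the output rows in exactly the same way, and nothing else changes.

import torch
import torch.nn.functional as F

torch.manual_seed(0)
x = torch.randn(1, 6, 16)                        # [B, T, D] token embeddings, no positional info
perm = torch.randperm(6)

def plain_attention(x):
    scores = x @ x.transpose(-2, -1) / x.size(-1) ** 0.5
    return F.softmax(scores, dim=-1) @ x         # single head, identity projections for simplicity

out = plain_attention(x)
out_shuffled = plain_attention(x[:, perm, :])
print(torch.allclose(out[:, perm, :], out_shuffled, atol=1e-6))  # True: same rows, just reordered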
Positional Encoding Strategies
import torch
import torch.nn as nn
import math

# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# 1. SINUSOIDAL POSITIONAL ENCODING (original Transformer, 2017)
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
class SinusoidalPositionalEncoding(nn.Module):
    """
    Fixed (not learned). Uses sin/cos at different frequencies.
    PE[pos, 2i]   = sin(pos / 10000^(2i/d_model))
    PE[pos, 2i+1] = cos(pos / 10000^(2i/d_model))
    """
    def __init__(self, d_model: int, max_len: int = 5000, dropout: float = 0.1):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        # Create [max_len, d_model] encoding matrix
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)  # even indices: sin
        pe[:, 1::2] = torch.cos(position * div_term)  # odd indices: cos
        # Register as buffer (not a parameter, but saved with model)
        self.register_buffer("pe", pe.unsqueeze(0))  # [1, max_len, d_model]

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: [B, T, D]
        x = x + self.pe[:, :x.size(1), :]  # add positional encoding
        return self.dropout(x)
# 2. LEARNED POSITIONAL ENCODING (BERT, GPT-2)
class LearnedPositionalEncoding(nn.Module):
    """
    Trainable embedding matrix. Model learns best position representations.
    Limitation: max sequence length fixed at training time.
    """
    def __init__(self, d_model: int, max_len: int = 512):
        super().__init__()
        self.pos_embed = nn.Embedding(max_len, d_model)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        B, T, D = x.shape
        positions = torch.arange(T, device=x.device).unsqueeze(0)  # [1, T]
        return x + self.pos_embed(positions)  # broadcast over batch
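A quick usage sketch (my own illustration, not part of the original listing): both additive encodings take token embeddings of shape [B, T, D] and return the same shape with position information added.

embeddings = torch.randn(2, 10, 64)                        # [B, T, D]
sin_pe = SinusoidalPositionalEncoding(d_model=64)
learned_pe = LearnedPositionalEncoding(d_model=64, max_len=128)
print(sin_pe(embeddings).shape)                            # torch.Size([2, 10, 64])
print(learned_pe(embeddings).shape)                        # torch.Size([2, 10, 64])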
# 3. ROTARY POSITIONAL EMBEDDING (RoPE) — used in LLaMA, GPT-NeoX, Mistral
class RotaryPositionalEmbedding(nn.Module):
    """
    Encodes relative positions by ROTATING query and key vectors.
    Advantage: naturally extends to arbitrary sequence lengths.
    Used by: LLaMA 1/2/3, Mistral, Falcon, Qwen.
    """
    def __init__(self, dim: int, max_position: int = 8192):
        super().__init__()
        inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer("inv_freq", inv_freq)

    def _rotate_half(self, x: torch.Tensor) -> torch.Tensor:
        x1, x2 = x[..., :x.shape[-1] // 2], x[..., x.shape[-1] // 2:]
        return torch.cat([-x2, x1], dim=-1)

    def forward(self, q: torch.Tensor, k: torch.Tensor, seq_len: int) -> tuple:
        # q, k: [B, num_heads, T, head_dim]
        t = torch.arange(seq_len, device=q.device).float()
        freqs = torch.einsum("i,j->ij", t, self.inv_freq)  # [T, dim//2]
        emb = torch.cat([freqs, freqs], dim=-1)  # [T, dim]
        cos, sin = emb.cos()[None, None, :, :], emb.sin()[None, None, :, :]
        q_rot = q * cos + self._rotate_half(q) * sin
        k_rot = k * cos + self._rotate_half(k) * sin
        return q_rot, k_rot
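One way to check the relative-position claim, a hedged sketch rather than anything from the original article: if the same query and key content appears at two position pairs with the same offset, the rotated dot products match.

rope = RotaryPositionalEmbedding(dim=64)
T = 16
q = torch.randn(64).expand(1, 1, T, 64).contiguous()   # same query content at every position
k = torch.randn(64).expand(1, 1, T, 64).contiguous()   # same key content at every position
q_rot, k_rot = rope(q, k, seq_len=T)
score_a = (q_rot[0, 0, 2] * k_rot[0, 0, 5]).sum()      # positions (2, 5): offset 3
score_b = (q_rot[0, 0, 7] * k_rot[0, 0, 10]).sum()     # positions (7, 10): offset 3
print(torch.allclose(score_a, score_b, atol=1e-4))     # True: the score depends only on the offset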
# Comparison:
pe_comparison = {
    "Sinusoidal (original)": "Fixed, no parameters, gracefully handles unseen lengths",
    "Learned (BERT/GPT-2)": "Trainable, better performance but limited to training max_len",
    "RoPE (LLaMA)": "Relative position via rotation, best for long contexts (32K+)",
    "ALiBi (MPT)": "Applies bias to attention scores based on distance, very long context",
}
for name, desc in pe_comparison.items():
    print(f" {name:30s}: {desc}")
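ALiBi appears in the comparison above but has no implementation in this listing. Here is a minimal sketch of the idea as I understand it from the ALiBi paper (the slope formula below assumes a power-of-two head count): each head gets a fixed slope, and attention scores are penalized in proportion to how far back the key sits.

def alibi_bias(seq_len: int, num_heads: int) -> torch.Tensor:
    # Per-head slopes: geometric sequence 2^(-8/n), 2^(-16/n), ..., 2^(-8)
    slopes = torch.tensor([2.0 ** (-8.0 * (h + 1) / num_heads) for h in range(num_heads)])
    # Relative offset j - i: 0 on the diagonal, increasingly negative for keys further in the past
    rel = torch.arange(seq_len)[None, :] - torch.arange(seq_len)[:, None]
    return slopes[:, None, None] * rel.float()  # [num_heads, T, T]; added to scores before softmax
                                                # (future positions are removed by the causal mask)

print(alibi_bias(seq_len=4, num_heads=2)[0])    # head 0: zeros on the diagonal, -slope, -2*slope below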
Tip
Practice positional encoding in small, isolated examples before integrating it into a larger project. Breaking the concept into small experiments builds genuine understanding faster than reading alone.
Practice Task
Note
(1) Implement a positional encoding module from scratch without looking at notes. (2) Modify it to handle an edge case (an empty sequence, a sequence longer than max_len, or a mismatched embedding dimension). (3) Share your solution in the Priygop community for feedback.
Common Mistake
Warning
A common mistake with positional encoding is skipping edge-case testing: sequences longer than max_len, empty inputs, and mismatched embedding dimensions. Always validate boundary conditions to write robust, production-ready code.