Multi-Layer Networks & Forward Propagation
Stack multiple perceptrons in layers: input layer → hidden layers → output layer. The result is a universal function approximator: by the Universal Approximation Theorem, a network with a single hidden layer and enough neurons can approximate any continuous function on a compact domain to arbitrary accuracy.
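As a quick, hedged illustration of this idea (the width, activation, learning rate, and step count below are arbitrary choices, not prescriptions), a single-hidden-layer network can be trained to fit sin(x) on a bounded interval:

import math
import torch
import torch.nn as nn

torch.manual_seed(0)
x = torch.linspace(-math.pi, math.pi, 256).unsqueeze(1)  # inputs on a compact interval
y = torch.sin(x)                                         # target continuous function

# One hidden layer suffices in principle; 64 units is an illustrative width.
net = nn.Sequential(nn.Linear(1, 64), nn.Tanh(), nn.Linear(64, 1))
opt = torch.optim.Adam(net.parameters(), lr=1e-2)

for _ in range(2000):
    opt.zero_grad()
    loss = nn.functional.mse_loss(net(x), y)
    loss.backward()
    opt.step()

print(f"final MSE: {loss.item():.6f}")  # shrinks toward 0 with more width/steps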
MLP Forward Pass — Full Implementation
import torch
import torch.nn as nn

# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# Multi-Layer Perceptron in PyTorch nn.Module style
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
class MLP(nn.Module):
    """
    Input → [Linear → ReLU] × n_hidden → Linear → Output
    With enough hidden units, this architecture can fit a wide range of
    non-linear classification / regression mappings.
    """
    def __init__(self, input_dim: int, hidden_dims: list[int], output_dim: int):
        super().__init__()
        # Build layers dynamically
        layers = []
        in_dim = input_dim
        for hidden_dim in hidden_dims:
            layers.append(nn.Linear(in_dim, hidden_dim))  # x @ W.T + b
            layers.append(nn.ReLU())                      # activation
            layers.append(nn.Dropout(0.2))                # regularization
            in_dim = hidden_dim
        layers.append(nn.Linear(in_dim, output_dim))      # output layer (no activation)
        self.network = nn.Sequential(*layers)

        # Weight initialization (Kaiming / He, suited to ReLU networks)
        self._init_weights()

    def _init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight, nonlinearity='relu')
                nn.init.zeros_(m.bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.network(x)  # nn.Sequential chains the layers in order
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# What happens INSIDE the forward pass — layer by layer
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# Example: MLP for digit classification (MNIST)
model = MLP(input_dim=784, hidden_dims=[256, 128, 64], output_dim=10)
print(model)
print(f"Total parameters: {sum(p.numel() for p in model.parameters()):,}")
# Forward pass trace:
x = torch.randn(32, 784) # batch of 32 images, each flattened 28×28 = 784
# Layer 1: Linear(784 → 256)
# z1 = x @ W1.T + b1 shape: [32, 784] @ [256, 784].T = [32, 256]
# a1 = ReLU(z1) shape: [32, 256]
# Layer 2: Linear(256 → 128)
# z2 = a1 @ W2.T + b2 shape: [32, 128]
# a2 = ReLU(z2)
# Layer 3: Linear(128 → 64) → ReLU
# Layer 4: Linear(64 → 10) → raw logits (no activation)
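# Illustrative hand-check of Layer 1 (assumes model.network[0] is the first Linear):
with torch.no_grad():
    W1, b1 = model.network[0].weight, model.network[0].bias  # [256, 784], [256]
    a1 = torch.relu(x @ W1.T + b1)
    assert a1.shape == (32, 256)  # matches the traced shape above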
output = model(x)
print(f"Input shape: {x.shape}") # [32, 784]
print(f"Output shape: {output.shape}") # [32, 10] — 10 class logits
# Convert logits to probabilities
probs = torch.softmax(output, dim=-1)
pred_classes = probs.argmax(dim=-1)
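# Training note: nn.CrossEntropyLoss expects the raw logits, not probabilities,
# because it applies log-softmax internally; the softmax above is only for
# inspecting predictions.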
print(f"Predicted classes: {pred_classes[:5]}")Tip
Tip
Practice multi-layer networks and forward propagation in small, isolated examples before integrating them into larger projects. Breaking concepts into small experiments builds genuine understanding faster than reading alone.
Practice Task
(1) Write a working example of a multi-layer network forward pass from scratch without looking at notes. (2) Modify it to handle an edge case (empty input, null value, or error state). (3) Share your solution in the Priygop community for feedback.
Common Mistake
A common mistake with multi-layer networks is skipping edge-case testing: empty inputs, null values, and unexpected data types. Always validate boundary conditions to write robust, production-ready AI code.
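One hedged way to enforce those checks is a small validation wrapper around the forward pass; the function name, checks, and error messages below are illustrative assumptions, not part of the model above:

def safe_forward(model: nn.Module, x: torch.Tensor, input_dim: int = 784) -> torch.Tensor:
    """Illustrative input validation before the forward pass."""
    if x.ndim != 2 or x.shape[1] != input_dim:
        raise ValueError(f"expected shape [batch, {input_dim}], got {tuple(x.shape)}")
    if x.shape[0] == 0:
        raise ValueError("empty batch: nothing to process")
    if torch.isnan(x).any() or torch.isinf(x).any():
        raise ValueError("input contains NaN or Inf values")
    return model(x)

safe_forward(model, torch.randn(4, 784))     # OK
# safe_forward(model, torch.randn(0, 784))   # raises: empty batch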