Text Classification — Sentiment Analysis with BERT
Text classification assigns a label to a piece of text: sentiment (positive/negative), topic category, spam detection, intent recognition. Fine-tuning a pre-trained BERT-family model on a classification task typically reaches around 91-93% accuracy on SST-2 (DistilBERT and BERT-base, respectively) with only 3-5 epochs of training.
Binary Sentiment Analysis with DistilBERT
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# BERT-based Text Classifier
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
class TextDataset(Dataset):
    """Dataset for text classification tasks."""
    def __init__(self, texts: list[str], labels: list[int], tokenizer, max_len: int = 128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx: int) -> dict:
        enc = self.tokenizer(
            self.texts[idx],
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "label": torch.tensor(self.labels[idx]),
        }

class BERTClassifier(nn.Module):
    """DistilBERT + classification head."""
    def __init__(self, model_name: str = "distilbert-base-uncased", n_classes: int = 2, dropout: float = 0.3):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        hidden_size = self.bert.config.hidden_size  # 768
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, hidden_size // 2),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size // 2, n_classes),
        )
        nn.init.xavier_uniform_(self.classifier[0].weight)

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # outputs.last_hidden_state: [B, seq_len, 768]
        # Use the [CLS] token representation (position 0) as the sentence embedding
        cls_output = outputs.last_hidden_state[:, 0, :]  # [B, 768]
        return self.classifier(cls_output)
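# Optional sanity check before training (a sketch using random dummy tensors,
# not real data; _model, _ids, and _mask are throwaway names):
# _model = BERTClassifier(n_classes=2)
# _ids = torch.randint(0, _model.bert.config.vocab_size, (4, 16))  # [B=4, seq_len=16]
# _mask = torch.ones_like(_ids)
# print(_model(_ids, _mask).shape)  # torch.Size([4, 2]) -> one logit per class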
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# Training — SST-2 (Stanford Sentiment Treebank)
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = BERTClassifier(n_classes=2).to(device)
# Sample data (in practice, load SST-2 from HuggingFace Datasets)
train_texts = ["I loved this movie!", "Terrible film, waste of time.", "Absolutely wonderful.", "Boring and predictable."] * 100
train_labels = [1, 0, 1, 0] * 100 # 1=positive, 0=negative
dataset = TextDataset(train_texts, train_labels, tokenizer)
loader = DataLoader(dataset, batch_size=16, shuffle=True)
# Optimizer with different LRs: BERT needs tiny lr to avoid catastrophic forgetting
optimizer = AdamW([
    {"params": model.bert.parameters(), "lr": 2e-5},        # pre-trained body: very small LR
    {"params": model.classifier.parameters(), "lr": 1e-4},  # new head: normal LR
], weight_decay=0.01)
total_steps = len(loader) * 3 # 3 epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=total_steps // 10,  # 10% warmup
    num_training_steps=total_steps,
)
loss_fn = nn.CrossEntropyLoss()
for epoch in range(3):
    model.train()
    total_loss = 0.0
    for batch in loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)
        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = loss_fn(logits, labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} | avg loss: {total_loss/len(loader):.4f}")
# Expected: ~90-91% accuracy on the SST-2 dev set with DistilBERT (BERT-base reaches ~92-93%)
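The loop above only tracks training loss. To verify accuracy you need an evaluation pass; here is a minimal sketch, assuming held-out val_texts and val_labels lists (hypothetical names, loaded the same way as the training data):

model.eval()
val_loader = DataLoader(TextDataset(val_texts, val_labels, tokenizer), batch_size=32)
correct = total = 0
with torch.no_grad():
    for batch in val_loader:
        logits = model(batch["input_ids"].to(device), batch["attention_mask"].to(device))
        preds = logits.argmax(dim=-1).cpu()          # predicted class per example
        correct += (preds == batch["label"]).sum().item()
        total += batch["label"].size(0)
print(f"Validation accuracy: {correct / total:.4f}")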
Tip
Practice BERT-based text classification in small, isolated examples before integrating it into larger projects. Breaking concepts into small experiments builds genuine understanding faster than reading alone.
Modern NLP = Transformer-based. Pre-train, then fine-tune.
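A single-sentence prediction makes a good first experiment. A minimal sketch reusing the model and tokenizer trained above (the predict_sentiment helper is an illustration, not part of the listing):

def predict_sentiment(text: str) -> str:
    """Classify one sentence as positive or negative."""
    model.eval()
    enc = tokenizer(text, max_length=128, truncation=True, return_tensors="pt")
    with torch.no_grad():
        logits = model(enc["input_ids"].to(device), enc["attention_mask"].to(device))
    return "positive" if logits.argmax(dim=-1).item() == 1 else "negative"

print(predict_sentiment("A sharp, funny, and moving film."))  # likely: positive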
Practice Task
(1) Write a working example of text classification with BERT from scratch without looking at notes. (2) Modify it to handle an edge case (empty input, null value, or error state). (3) Share your solution in the Priygop community for feedback.
Common Mistake
A common mistake with BERT-based text classification is skipping edge-case testing: empty inputs, null values, and unexpected data types. Always validate boundary conditions to write robust, production-ready AI code.
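A minimal sketch of such validation, wrapping the hypothetical predict_sentiment helper shown earlier (the exact checks depend on your application):

def predict_sentiment_safe(text) -> str:
    # Reject non-strings and empty/whitespace-only inputs before tokenization.
    if not isinstance(text, str):
        raise TypeError(f"expected str, got {type(text).__name__}")
    if not text.strip():
        raise ValueError("input text is empty or whitespace-only")
    return predict_sentiment(text)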