ONNX Export — Accelerated Inference
ONNX (Open Neural Network Exchange) is a framework-agnostic model format that unlocks hardware-optimized inference runtimes. ONNX Runtime with the CUDA execution provider typically delivers a 2-5x inference speedup over eager PyTorch, and combined with model quantization you can often run BERT-class inference close to 10x faster.
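Before exporting anything, it is worth confirming which execution providers your onnxruntime build actually exposes; the CUDA numbers above only apply if the GPU package (onnxruntime-gpu) is installed. A minimal check:
import onnxruntime as ort
# Prints e.g. ['CUDAExecutionProvider', 'CPUExecutionProvider'] for the GPU build,
# or just ['CPUExecutionProvider'] for the default CPU package.
print(ort.get_available_providers())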
ONNX Export and Optimization
import torch
import onnxruntime as ort
from transformers import AutoTokenizer
import numpy as np
import time
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# STEP 1: EXPORT PYTORCH MODEL TO ONNX
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english"
)
model.eval()
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
dummy_input = tokenizer("sample text", return_tensors="pt", padding="max_length",
                        max_length=128, truncation=True)
torch.onnx.export(
    model,
    (dummy_input["input_ids"], dummy_input["attention_mask"]),
    "sentiment_model.onnx",
    export_params=True,
    opset_version=17,            # use a recent opset for best operator support
    do_constant_folding=True,    # fold constant expressions at export time
    input_names=["input_ids", "attention_mask"],
    output_names=["logits"],
    dynamic_axes={               # enable variable batch and sequence lengths
        "input_ids": {0: "batch_size", 1: "sequence_length"},
        "attention_mask": {0: "batch_size", 1: "sequence_length"},
        "logits": {0: "batch_size"},
    },
)
print("ONNX model exported!")
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# STEP 2: OPTIMIZE WITH ONNX RUNTIME
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
from onnxruntime.transformers import optimizer
from onnxruntime.transformers.fusion_options import FusionOptions
optimized = optimizer.optimize_model(
    "sentiment_model.onnx",
    model_type="bert",
    num_heads=12,
    hidden_size=768,
    optimization_options=FusionOptions("bert"),  # fuse attention patterns
)
optimized.save_model_to_file("sentiment_model_optimized.onnx")
print("Optimized ONNX model saved!")
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# STEP 3: INT8 DYNAMIC QUANTIZATION -- ~4x smaller, faster CPU inference
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
from onnxruntime.quantization import quantize_dynamic, QuantType
quantize_dynamic(
    "sentiment_model_optimized.onnx",
    "sentiment_model_int8.onnx",
    weight_type=QuantType.QInt8,
)
print("INT8 quantized model saved!")
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# STEP 4: INFERENCE BENCHMARK
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
texts = ["I absolutely loved this movie!"] * 16 # batch of 16
def benchmark_onnx(model_path: str, texts: list[str], n_runs: int = 100) -> float:
    providers = ["CUDAExecutionProvider", "CPUExecutionProvider"] if ort.get_device() == "GPU" else ["CPUExecutionProvider"]
    session = ort.InferenceSession(model_path, providers=providers)
    enc = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors="np")
    inputs = {"input_ids": enc["input_ids"], "attention_mask": enc["attention_mask"]}
    # Warmup runs so session/provider initialization does not skew the timing
    for _ in range(5):
        session.run(None, inputs)
    start = time.time()
    for _ in range(n_runs):
        session.run(None, inputs)
    return (time.time() - start) / n_runs * 1000  # average ms per batch
latency_fp32 = benchmark_onnx("sentiment_model_optimized.onnx", texts)
latency_int8 = benchmark_onnx("sentiment_model_int8.onnx", texts)
print(f"\nBenchmark (batch=16, 100 runs):")
print(f" FP32 optimized: {latency_fp32:.1f} ms/batch ({len(texts)/latency_fp32*1000:.0f} samples/sec)")
print(f" INT8 quantized: {latency_int8:.1f} ms/batch ({len(texts)/latency_int8*1000:.0f} samples/sec)")
print(f" Quantization speedup: {latency_fp32/latency_int8:.1f}x")Tip
Tip
Practice ONNX export and accelerated inference in small, isolated scripts before integrating them into larger projects. Breaking the workflow into small experiments builds genuine understanding faster than reading alone.
Practice Task
(1) Write a working ONNX export and inference pipeline from scratch without looking at notes. (2) Modify it to handle an edge case (an empty input batch, a null value, or an error state). (3) Share your solution in the Priygop community for feedback.
Common Mistake
A common mistake with ONNX export and accelerated inference is skipping edge case testing: empty inputs, null values, and unexpected data types. Always validate boundary conditions to write robust, production-ready AI code.
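As a concrete starting point for that kind of validation (a sketch reusing the predict() helper from the benchmark section above), a thin wrapper can reject empty or malformed batches before they ever reach the ONNX session:
def safe_predict(texts: list[str]) -> list[tuple[str, float]]:
    # Guard against the boundary conditions mentioned above: empty batches,
    # non-string entries, and whitespace-only strings.
    if not texts:
        raise ValueError("Expected at least one input text.")
    cleaned = [t for t in texts if isinstance(t, str) and t.strip()]
    if not cleaned:
        raise ValueError("All inputs were empty or not strings.")
    return predict(cleaned)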