Multimodal AI — Vision-Language Models
Multimodal models combine vision and language in a unified architecture. Proprietary models such as GPT-4V, Gemini 1.5, and Claude 3 can see images and discuss them; open-source alternatives such as LLaVA and InternVL run locally and can be fine-tuned for specific vision-language tasks.
LLaVA — Open-Source Vision-Language Model
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
from PIL import Image
import torch
import requests
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# LLaVA-NeXT -- open-source GPT-4V alternative
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
model = LlavaNextForConditionalGeneration.from_pretrained(
    "llava-hf/llava-v1.6-mistral-7b-hf",
    torch_dtype=torch.float16,
    device_map="auto",
)

def analyze_image(image_path_or_url: str, question: str) -> str:
    '''Ask a question about an image using LLaVA.'''
    if image_path_or_url.startswith("http"):
        image = Image.open(requests.get(image_path_or_url, stream=True).raw)
    else:
        image = Image.open(image_path_or_url)
    # LLaVA conversation template requires the <image> token
    conversation = [{"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": question},
    ]}]
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
    # device_map="auto" may shard the model, so send inputs to the model's device
    inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device)
    with torch.inference_mode():
        output = model.generate(**inputs, max_new_tokens=500, temperature=0.2, do_sample=True)
    return processor.decode(output[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
# Use cases
tasks = [
    ("what_is_this.jpg", "What is in this image? Describe in detail."),
    ("chart.png", "Analyze this chart. What are the key trends?"),
    ("code_screenshot.jpg", "What programming language is this? Identify any bugs."),
    ("invoice.pdf_page.jpg", "Extract all line items and prices from this invoice as JSON."),  # parsed with the helper below
]
for img_path, question in tasks:
    print(f"Q: {question}")
    # answer = analyze_image(img_path, question)  # run with actual images
    # print(f"A: {answer}\n")
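# The invoice task above asks the model for JSON, but generate() returns free
# text, so parse it defensively. A minimal sketch -- extract_json is a
# hypothetical helper, and the regex fallback is just one simple heuristic:
import json
import re

def extract_json(answer: str):
    '''Best-effort parse of a JSON object from free-form model output.'''
    try:
        return json.loads(answer)
    except json.JSONDecodeError:
        # Models often wrap JSON in prose or markdown fences; grab the
        # outermost {...} span and try again.
        match = re.search(r"\{.*\}", answer, re.DOTALL)
        if match:
            try:
                return json.loads(match.group(0))
            except json.JSONDecodeError:
                return None
        return None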
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# ARCHITECTURE OF VISION-LANGUAGE MODELS
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# 1. Image Encoder: ViT-L/14 (CLIP-based) processes image into patches
# 2. MLP Projector: maps visual features to LLM token embedding space
# 3. LLM: Mistral-7B / LLaMA-3 processes image tokens + text tokens together
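# A stripped-down sketch of that data flow (illustrative only, not the real
# LLaVA code). Assumed sizes: CLIP ViT-L/14 emits 1024-dim patch features
# (576 patches at 336px) and Mistral-7B uses 4096-dim token embeddings;
# LLaVA-1.5+ uses a two-layer MLP as the projector.
import torch.nn as nn

class VLMProjectionSketch(nn.Module):
    def __init__(self, vision_dim: int = 1024, llm_dim: int = 4096):
        super().__init__()
        # Step 2: MLP projector maps visual features into the LLM embedding space
        self.projector = nn.Sequential(
            nn.Linear(vision_dim, llm_dim),
            nn.GELU(),
            nn.Linear(llm_dim, llm_dim),
        )

    def forward(self, patch_features, text_embeddings):
        # patch_features:  (batch, num_patches, vision_dim) from the image encoder
        # text_embeddings: (batch, num_text_tokens, llm_dim) from the LLM embed table
        image_tokens = self.projector(patch_features)
        # Step 3: the LLM attends over image and text tokens together
        return torch.cat([image_tokens, text_embeddings], dim=1)

fused = VLMProjectionSketch()(torch.randn(1, 576, 1024), torch.randn(1, 32, 4096))
print(fused.shape)  # torch.Size([1, 608, 4096]) -- 576 image + 32 text tokens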
vlm_models = {
    "GPT-4o": {"open_source": False, "params": "~200B est.", "best_task": "Most capable, all vision tasks"},
    "Claude 3 Opus": {"open_source": False, "params": ">100B est.", "best_task": "Document analysis, charts, OCR"},
    "Gemini 1.5 Pro": {"open_source": False, "params": "unknown", "best_task": "Very long video (1M tokens = ~1 hr of video)"},
    "LLaVA-1.6 34B": {"open_source": True, "params": "34B", "best_task": "Best open-source; near GPT-4V quality"},
    "InternVL2 8B": {"open_source": True, "params": "8B", "best_task": "Top open-source at 8B, multilingual"},
    "Phi-3-Vision": {"open_source": True, "params": "4.2B", "best_task": "Smallest capable VLM, runs on a laptop"},
}
print("Vision-Language Model Comparison:")
for name, info in vlm_models.items():
    os_str = "Open Source" if info["open_source"] else "Closed"
    print(f" {name:20s} ({os_str:12s}) {info['params']:12s}: {info['best_task']}")
Tip
Practice vision-language models in small, isolated examples before integrating them into larger projects. Breaking concepts into small experiments builds genuine understanding faster than reading alone.
Practice Task
(1) Write a working vision-language example from scratch without looking at notes. (2) Modify it to handle an edge case (empty input, null value, or error state). (3) Share your solution in the Priygop community for feedback.
Common Mistake
A common mistake with vision-language models is skipping edge-case testing: empty inputs, null values, and unexpected data types. Always validate boundary conditions to write robust, production-ready AI code.
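For example, a guarded wrapper around analyze_image might look like the sketch below; the specific checks and exception types are assumptions to adapt to your own pipeline.
def analyze_image_safe(image_path_or_url: str, question: str) -> str:
    # Validate boundary conditions before touching the model.
    if not image_path_or_url or not image_path_or_url.strip():
        raise ValueError("image_path_or_url must be a non-empty string")
    if not question or not question.strip():
        raise ValueError("question must be a non-empty string")
    try:
        return analyze_image(image_path_or_url, question)
    except (FileNotFoundError, requests.RequestException) as exc:
        # Missing files and network failures are the most common runtime errors here.
        return f"ERROR: could not load image ({exc})"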