Multimodal AI — Vision-Language Models
Multimodal models combine vision and language in a unified architecture. Proprietary models such as GPT-4V, Gemini 1.5, and Claude 3 can see images and discuss them; open-source alternatives such as LLaVA and InternVL run locally and can be fine-tuned for specific vision-language tasks.
LLaVA — Open-Source Vision-Language Model
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
from PIL import Image
import torch
import requests
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# LLaVA-NeXT -- open-source GPT-4V alternative
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
model = LlavaNextForConditionalGeneration.from_pretrained(
    "llava-hf/llava-v1.6-mistral-7b-hf",
    torch_dtype=torch.float16,
    device_map="auto",
)

def analyze_image(image_path_or_url: str, question: str) -> str:
    '''Ask a question about an image using LLaVA.'''
    if image_path_or_url.startswith("http"):
        image = Image.open(requests.get(image_path_or_url, stream=True).raw)
    else:
        image = Image.open(image_path_or_url)
    # LLaVA conversation template requires the <image> token
    conversation = [{"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": question},
    ]}]
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
    # device_map="auto" may shard the model, so send inputs to the model's device
    inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device)
    with torch.inference_mode():
        output = model.generate(**inputs, max_new_tokens=500, temperature=0.2, do_sample=True)
    return processor.decode(output[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
# Use cases
tasks = [
    ("what_is_this.jpg", "What is in this image? Describe in detail."),
    ("chart.png", "Analyze this chart. What are the key trends?"),
    ("code_screenshot.jpg", "What programming language is this? Identify any bugs."),
    ("invoice.pdf_page.jpg", "Extract all line items and prices from this invoice as JSON."),  # parsed with the helper below
]
for img_path, question in tasks:
    print(f"Q: {question}")
    # answer = analyze_image(img_path, question)  # run with actual images
    # print(f"A: {answer}\n")
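# The invoice task above asks the model for JSON, but generate() returns free
# text, so parse it defensively. A minimal sketch -- extract_json is a
# hypothetical helper, and the regex fallback is just one simple heuristic:
import json
import re

def extract_json(answer: str):
    '''Best-effort parse of a JSON object from free-form model output.'''
    try:
        return json.loads(answer)
    except json.JSONDecodeError:
        # Models often wrap JSON in prose or markdown fences; grab the
        # outermost {...} span and try again.
        match = re.search(r"\{.*\}", answer, re.DOTALL)
        if match:
            try:
                return json.loads(match.group(0))
            except json.JSONDecodeError:
                return None
        return None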
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# ARCHITECTURE OF VISION-LANGUAGE MODELS
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# 1. Image Encoder: ViT-L/14 (CLIP-based) processes image into patches
# 2. MLP Projector: maps visual features to LLM token embedding space
# 3. LLM: Mistral-7B / LLaMA-3 processes image tokens + text tokens together
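# A stripped-down sketch of that data flow (illustrative only, not the real
# LLaVA code). Assumed sizes: CLIP ViT-L/14 emits 1024-dim patch features
# (576 patches at 336px) and Mistral-7B uses 4096-dim token embeddings;
# LLaVA-1.5+ uses a two-layer MLP as the projector.
import torch.nn as nn

class VLMProjectionSketch(nn.Module):
    def __init__(self, vision_dim: int = 1024, llm_dim: int = 4096):
        super().__init__()
        # Step 2: MLP projector maps visual features into the LLM embedding space
        self.projector = nn.Sequential(
            nn.Linear(vision_dim, llm_dim),
            nn.GELU(),
            nn.Linear(llm_dim, llm_dim),
        )

    def forward(self, patch_features, text_embeddings):
        # patch_features:  (batch, num_patches, vision_dim) from the image encoder
        # text_embeddings: (batch, num_text_tokens, llm_dim) from the LLM embed table
        image_tokens = self.projector(patch_features)
        # Step 3: the LLM attends over image and text tokens together
        return torch.cat([image_tokens, text_embeddings], dim=1)

fused = VLMProjectionSketch()(torch.randn(1, 576, 1024), torch.randn(1, 32, 4096))
print(fused.shape)  # torch.Size([1, 608, 4096]) -- 576 image + 32 text tokens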
vlm_models = {
    "GPT-4o": {"open_source": False, "params": "~200B est.", "best_task": "Most capable, all vision tasks"},
    "Claude 3 Opus": {"open_source": False, "params": ">100B est.", "best_task": "Document analysis, charts, OCR"},
    "Gemini 1.5 Pro": {"open_source": False, "params": "unknown", "best_task": "Very long video (1M tokens = ~1 hr of video)"},
    "LLaVA-1.6 34B": {"open_source": True, "params": "34B", "best_task": "Best open-source; near GPT-4V quality"},
    "InternVL2 8B": {"open_source": True, "params": "8B", "best_task": "Top open-source at 8B, multilingual"},
    "Phi-3-Vision": {"open_source": True, "params": "4.2B", "best_task": "Smallest capable VLM, runs on a laptop"},
}
print("Vision-Language Model Comparison:")
for name, info in vlm_models.items():
    os_str = "Open Source" if info["open_source"] else "Closed"
    print(f" {name:20s} ({os_str:12s}) {info['params']:12s}: {info['best_task']}")
Tip
Practice vision-language models in small, isolated examples before integrating them into larger projects. Breaking concepts into small experiments builds genuine understanding faster than reading alone.
Practice Task
(1) Write a working vision-language example from scratch without looking at notes. (2) Modify it to handle an edge case (empty input, null value, or error state). (3) Share your solution in the Priygop community for feedback.
Common Mistake
A common mistake with vision-language models is skipping edge-case testing: empty inputs, null values, and unexpected data types. Always validate boundary conditions to write robust, production-ready AI code.
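For example, a guarded wrapper around analyze_image might look like the sketch below; the specific checks and exception types are assumptions to adapt to your own pipeline.
def analyze_image_safe(image_path_or_url: str, question: str) -> str:
    # Validate boundary conditions before touching the model.
    if not image_path_or_url or not image_path_or_url.strip():
        raise ValueError("image_path_or_url must be a non-empty string")
    if not question or not question.strip():
        raise ValueError("question must be a non-empty string")
    try:
        return analyze_image(image_path_or_url, question)
    except (FileNotFoundError, requests.RequestException) as exc:
        # Missing files and network failures are the most common runtime errors here.
        return f"ERROR: could not load image ({exc})"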