Building on CLIP's foundation, modern Vision-Language Models (VLMs) like GPT-4V, Gemini, and Claude have revolutionized how AI systems understand and reason about images. These models don't just classify or retrieve - they converse, analyze, and reason about visual content using natural language, enabling applications from document analysis to creative assistance.
While CLIP created a shared embedding space, modern VLMs integrate vision directly into the language model's token stream. Instead of separate encoders, images become part of the conversational flow alongside text tokens.
While closed-source models like GPT-4V grab headlines, a thriving open-source ecosystem is rapidly closing the performance gap while offering complete transparency, customizability, and cost-effectiveness. These models provide full access to architectures, training code, and datasets - enabling research, customization, and deployment without API dependencies.
# Complete LLaVA training pipeline
git clone https://github.com/haotian-liu/LLaVA.git
cd LLaVA
# 1. Environment setup
conda create -n llava python=3.10 -y
conda activate llava
pip install --upgrade pip
pip install -e .
pip install -e ".[train]"
# flash-attn speeds up attention; requires a CUDA toolchain at build time
pip install flash-attn --no-build-isolation
# 2. Data preparation
# Download LLaVA training data
wget https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/resolve/main/llava_v1_5_mix665k.json
# Prepare your custom dataset (JSON format)
# NOTE: the human turn MUST contain the literal "<image>" placeholder token;
# LLaVA's preprocessing splices the visual tokens in at that marker.
# Format: {"id": "unique_id", "image": "path/to/image.jpg",
#   "conversations": [{"from": "human", "value": "<image>\nQuestion"},
#                     {"from": "gpt", "value": "Answer"}]}
# 3. Model training (Stage 1: Pretraining)
# Stage 1 trains ONLY the vision->language projector (--tune_mm_mlp_adapter True)
# on image-caption data; the LLM and vision tower stay frozen. Note the high
# LR (1e-3) is appropriate because only the small adapter is being trained.
# Assumes a single node with 8 GPUs (--nproc_per_node=8) and DeepSpeed ZeRO-3.
torchrun --nnodes=1 --nproc_per_node=8 --master_port=25001 \
llava/train/train_mem.py \
--deepspeed ./scripts/zero3.json \
--model_name_or_path lmsys/vicuna-7b-v1.5 \
--version v1 \
--data_path path/to/pretrain_data.json \
--image_folder path/to/images \
--vision_tower openai/clip-vit-large-patch14-336 \
--mm_projector_type mlp2x_gelu \
--tune_mm_mlp_adapter True \
--mm_vision_select_layer -2 \
--mm_use_im_start_end False \
--mm_use_im_patch_token False \
--bf16 True \
--output_dir ./checkpoints/llava-v1.5-7b-pretrain \
--num_train_epochs 1 \
--per_device_train_batch_size 32 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 1 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 24000 \
--save_total_limit 1 \
--learning_rate 1e-3 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 2048 \
--gradient_checkpointing True \
--dataloader_num_workers 4 \
--lazy_preprocess True \
--report_to wandb
# 4. Stage 2: Fine-tuning
# Stage 2 loads the Stage-1 projector (--pretrain_mm_mlp_adapter) and trains
# the full model on instruction data at a much lower LR (2e-5). The extra
# flags vs Stage 1: --image_aspect_ratio pad (avoid distorting images) and
# --group_by_modality_length True (batch text-only and image samples together
# for throughput).
torchrun --nnodes=1 --nproc_per_node=8 --master_port=25001 \
llava/train/train_mem.py \
--deepspeed ./scripts/zero3.json \
--model_name_or_path lmsys/vicuna-7b-v1.5 \
--version v1 \
--data_path path/to/finetune_data.json \
--image_folder path/to/images \
--vision_tower openai/clip-vit-large-patch14-336 \
--pretrain_mm_mlp_adapter ./checkpoints/llava-v1.5-7b-pretrain/mm_projector.bin \
--mm_projector_type mlp2x_gelu \
--mm_vision_select_layer -2 \
--mm_use_im_start_end False \
--mm_use_im_patch_token False \
--image_aspect_ratio pad \
--group_by_modality_length True \
--bf16 True \
--output_dir ./checkpoints/llava-v1.5-7b \
--num_train_epochs 1 \
--per_device_train_batch_size 16 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 1 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 50000 \
--save_total_limit 1 \
--learning_rate 2e-5 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 2048 \
--gradient_checkpointing True \
--dataloader_num_workers 4 \
--lazy_preprocess True \
--report_to wandb
# 5. Evaluation on TextVQA (greedy decoding: --temperature 0)
python -m llava.eval.model_vqa_loader \
--model-path ./checkpoints/llava-v1.5-7b \
--question-file ./playground/data/eval/textvqa/llava_textvqa_val_v051_ocr.jsonl \
--image-folder ./playground/data/eval/textvqa/train_images \
--answers-file ./playground/data/eval/textvqa/answers/llava-v1.5-7b.jsonl \
--temperature 0 \
--conv-mode vicuna_v1
# Fixed: echo strings previously contained mojibake (corrupted emoji bytes),
# and the cost figure (~$500K) was off by roughly two orders of magnitude for
# 8x A100 over ~1 week at typical cloud rates.
echo "LLaVA training complete!"
echo "Approx. compute cost: ~\$5-10K (8x A100 GPUs for ~1 week; verify with your provider)"
echo "Expected performance: ~69.5 MMBench, ~81.6% VQAv2"
# BLIP-2 / InstructBLIP training pipeline (Salesforce LAVIS framework)
git clone https://github.com/salesforce/LAVIS.git
cd LAVIS
# Environment setup
pip install -e .
# Training script for InstructBLIP (8-GPU distributed run; the YAML config
# below controls model, dataset, and schedule)
python -m torch.distributed.run --nproc_per_node=8 train.py \
--cfg-path lavis/projects/instructblip/train/vicuna7b_instruct.yaml
# Custom configuration for your data
# Create custom config YAML:
# Fixed: the original block had all indentation stripped, which makes it
# invalid YAML -- nesting reconstructed to match the LAVIS config schema.
model:
  arch: blip2_vicuna_instruct
  model_type: vicuna7b
  load_finetuned: False
  load_pretrained: True
  pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna7b_trimmed.pth"

datasets:
  custom_vqa:
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 224
    text_processor:
      train:
        name: "blip_question"
    data_type: images
    build_info:
      annotations:
        train:
          url: path/to/your_train_data.json
          # NOTE(review): original had the image dir here; `storage` under
          # `annotations` should point at the annotation JSON -- verify.
          storage: path/to/your_images/
      images:
        storage: path/to/your_images/

run:
  task: vqa
  lr_sched: "linear_warmup_cosine_lr"
  init_lr: 1e-5
  min_lr: 0
  warmup_lr: 1e-8
  warmup_steps: 1000
  weight_decay: 0.05
  max_epoch: 5
  batch_size_train: 16
  batch_size_eval: 8
  num_workers: 4
  accum_grad_iters: 1
  seed: 42
  output_dir: "output/instructblip_custom"
  amp: True
  resume_ckpt_path: null
  evaluate: False
  train_splits: ["train"]
  device: "cuda"
  world_size: 1
  dist_url: "env://"
  distributed: True
# Fixed: removed mojibake (corrupted emoji bytes) from the echo strings.
echo "InstructBLIP training configured!"
echo "Key advantage: Q-Former allows flexible vision-text interaction"
import torch
import torch.nn as nn
from transformers import (
CLIPVisionModel, CLIPImageProcessor,
LlamaForCausalLM, LlamaTokenizer
)
class CustomVLM(nn.Module):
    """Minimal custom Vision-Language Model.

    Components: a frozen CLIP vision encoder, a trainable projector into the
    LLM embedding space (MLP or Q-Former style), and a Llama language model.
    The projected image features are prepended to the text token embeddings
    before the language-model forward pass (LLaVA-style prefix conditioning).
    """

    # Placeholder token users put in prompts to mark the image position.
    IMAGE_TOKEN = "<image>"

    def __init__(self,
                 vision_model="openai/clip-vit-base-patch32",
                 language_model="meta-llama/Llama-2-7b-hf",
                 projection_type="mlp"):
        super().__init__()
        # Vision components (frozen below; only the projector trains at first)
        self.vision_model = CLIPVisionModel.from_pretrained(vision_model)
        self.vision_processor = CLIPImageProcessor.from_pretrained(vision_model)
        # Language components
        self.language_model = LlamaForCausalLM.from_pretrained(language_model)
        self.tokenizer = LlamaTokenizer.from_pretrained(language_model)
        # Vision -> language projection
        vision_dim = self.vision_model.config.hidden_size
        language_dim = self.language_model.config.hidden_size
        if projection_type == "mlp":
            self.projector = nn.Sequential(
                nn.Linear(vision_dim, language_dim),
                nn.GELU(),
                nn.Linear(language_dim, language_dim)
            )
        elif projection_type == "q_former":
            # Q-Former style projection.
            # NOTE(review): the Q-Former outputs hidden_size=768, which only
            # matches language_dim for small LMs -- add a final linear layer
            # before feeding a 4096-dim Llama. TODO confirm intended use.
            self.projector = self._build_q_former(vision_dim, language_dim)
        else:
            raise ValueError(f"Unknown projection_type: {projection_type}")
        # Special tokens.
        # BUG FIX: the original called add_tokens(['']) -- an empty string,
        # which registers no usable placeholder. The intended '<image>' token
        # was evidently stripped during document extraction.
        self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        self.tokenizer.add_tokens([self.IMAGE_TOKEN])
        self.language_model.resize_token_embeddings(len(self.tokenizer))
        # Freeze vision model initially
        for param in self.vision_model.parameters():
            param.requires_grad = False

    def _build_q_former(self, vision_dim, language_dim, num_queries=32):
        """Build a BLIP-2-style Q-Former: a BERT with cross-attention plus
        a bank of learnable query tokens (registered on this module)."""
        from transformers import BertConfig, BertLMHeadModel
        # Q-Former configuration (BERT-base sized, cross-attention enabled)
        encoder_config = BertConfig(
            vocab_size=30522,
            hidden_size=768,
            num_hidden_layers=12,
            num_attention_heads=12,
            intermediate_size=3072,
            add_cross_attention=True,
        )
        q_former = BertLMHeadModel(config=encoder_config)
        # Learnable query tokens, initialized with a small normal (std=0.02)
        self.query_tokens = nn.Parameter(
            torch.zeros(1, num_queries, 768)
        )
        self.query_tokens.data.normal_(mean=0.0, std=0.02)
        return q_former

    def forward(self, images, text_input):
        """Forward pass: encode images, project, prepend to text embeddings.

        Args:
            images: preprocessed pixel tensor for the CLIP tower
                (assumed (batch, 3, H, W) -- confirm with caller).
            text_input: list[str] of prompts, one per image.

        Returns:
            The language model's CausalLMOutput for the combined sequence.
        """
        # Encode images with the frozen vision tower (no grads needed).
        with torch.no_grad():
            vision_outputs = self.vision_model(images)
        image_features = vision_outputs.last_hidden_state
        # Project patch features into the LM embedding space.
        projected_features = self.projector(image_features)
        # Tokenize text.
        text_tokens = self.tokenizer(
            text_input,
            return_tensors="pt",
            padding=True,
            truncation=True
        )
        device = projected_features.device
        input_ids = text_tokens.input_ids.to(device)
        text_mask = text_tokens.attention_mask.to(device)
        # BUG FIX: the original referenced undefined names
        # (`combined_embeddings`, `attention_mask`). Simple working strategy:
        # prepend ALL projected image embeddings to the text embeddings and
        # extend the attention mask with ones for the image positions.
        text_embeds = self.language_model.get_input_embeddings()(input_ids)
        combined_embeddings = torch.cat([projected_features, text_embeds], dim=1)
        image_mask = torch.ones(
            projected_features.shape[:2], dtype=text_mask.dtype, device=device
        )
        attention_mask = torch.cat([image_mask, text_mask], dim=1)
        # Forward through language model
        outputs = self.language_model(
            inputs_embeds=combined_embeddings,
            attention_mask=attention_mask
        )
        return outputs
# Training setup
def train_custom_vlm(dataloader=None, num_epochs=1, loss_fn=None, lr=1e-4):
    """Train a CustomVLM with a plain AdamW loop.

    BUG FIX: the original body referenced undefined globals (`num_epochs`,
    `dataloader`, `compute_loss`) and would raise NameError; they are now
    explicit parameters. Calling with no arguments (as the original usage
    line does) returns a freshly initialized, untrained model.

    Args:
        dataloader: iterable yielding (images, texts, labels) batches, or
            None to skip training.
        num_epochs: number of passes over the dataloader.
        loss_fn: callable(outputs, labels) -> scalar loss tensor, or None
            to skip training.
        lr: AdamW learning rate (default 1e-4, as in the original).

    Returns:
        The (possibly untrained) CustomVLM instance.
    """
    model = CustomVLM()
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=lr,
        weight_decay=0.05
    )
    if dataloader is None or loss_fn is None:
        # Nothing to train on -- return the initialized model as-is.
        return model
    for _ in range(num_epochs):
        for images, texts, labels in dataloader:
            outputs = model(images, texts)
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
    return model

# Usage (no data supplied here, so this just builds the model)
model = train_custom_vlm()
print("Custom VLM trained successfully!")  # fixed: removed mojibake emoji
# Production deployment setup
import gradio as gr
from transformers import LlavaForConditionalGeneration, AutoProcessor
import torch
class VLMInferenceServer:
    """Thin inference wrapper around a Hugging Face LLaVA checkpoint."""

    def __init__(self, model_path="llava-hf/llava-1.5-7b-hf"):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        # Load model (fp16, auto-sharded across available devices) + processor.
        self.model = LlavaForConditionalGeneration.from_pretrained(
            model_path,
            torch_dtype=torch.float16,
            device_map="auto"
        )
        self.processor = AutoProcessor.from_pretrained(model_path)

    def inference(self, image, text_prompt):
        """Run one image+text generation and return the decoded answer.

        BUG FIXES vs original:
        - the prompt was missing the mandatory '<image>' placeholder (the
          llava-hf processor splices visual tokens in at that marker; it was
          evidently stripped during document extraction);
        - decoding skipped a hard-coded 2 tokens instead of the full prompt;
        - processor args are now keywords (positional order of text/images
          differs across transformers versions).
        """
        prompt = f"USER: <image>\n{text_prompt} ASSISTANT:"
        inputs = self.processor(
            text=prompt,
            images=image,
            return_tensors='pt'
        ).to(self.device, torch.float16)
        with torch.no_grad():
            output = self.model.generate(
                **inputs,
                max_new_tokens=200,
                do_sample=False
            )
        # Decode only the newly generated tokens (skip the whole prompt).
        prompt_len = inputs["input_ids"].shape[1]
        response = self.processor.decode(
            output[0][prompt_len:],
            skip_special_tokens=True
        )
        return response
# Gradio interface
# NOTE: instantiating here loads the full model at import time (slow; needs
# substantial GPU/CPU memory before the UI even starts).
vlm_server = VLMInferenceServer()

def process_image_text(image, text):
    # Guard against an empty image input from the UI.
    if image is None:
        return "Please upload an image."
    return vlm_server.inference(image, text)

# Create Gradio app
# NOTE(review): the title string contains mojibake (corrupted emoji bytes)
# carried over from extraction.
demo = gr.Interface(
    fn=process_image_text,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Textbox(label="Ask a question about the image",
                   placeholder="Describe this image in detail...")
    ],
    outputs=gr.Textbox(label="AI Response"),
    title="๐ฆ LLaVA Vision-Language Model",
    description="Upload an image and ask questions about it!",
    examples=[
        ["example1.jpg", "What's happening in this image?"],
        ["example2.jpg", "What colors do you see?"],
        ["example3.jpg", "Describe the scene in detail."]
    ]
)

if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",  # listen on all interfaces
        server_port=7860,
        share=True  # Creates public link -- remove for private deployments
    )
# Docker deployment
# The triple-quoted strings below are reference templates for separate files,
# not executed Python code.
"""
# Dockerfile
FROM pytorch/pytorch:2.1.0-cuda11.8-devel
WORKDIR /app
COPY requirements.txt .
RUN pip install -r requirements.txt
COPY . .
EXPOSE 7860
CMD ["python", "app.py"]
"""
# Kubernetes deployment
# NOTE(review): the YAML indentation inside this template was flattened by
# extraction; re-indent before applying with kubectl.
"""
# k8s-deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: vlm-deployment
spec:
replicas: 3
selector:
matchLabels:
app: vlm
template:
metadata:
labels:
app: vlm
spec:
containers:
- name: vlm
image: your-registry/vlm:latest
ports:
- containerPort: 7860
resources:
limits:
nvidia.com/gpu: 1
requests:
memory: "8Gi"
cpu: "2"
"""
# NOTE(review): the print strings below contain mojibake (corrupted emoji),
# and the per-hour figures are illustrative, not measured.
print("๐ VLM deployment ready!")
print("๐ฐ Deployment costs: ~$0.10/hour (vs $10+/hour for GPT-4V API)")
print("๐ง Full control over model, data, and infrastructure")
First competitive open-source VLM achieves 85%+ of GPT-4V performance with simple architecture. Demonstrates that effective VLMs don't require massive proprietary infrastructure.
Salesforce's Q-Former architecture shows how to effectively bridge vision and language modalities, inspiring numerous follow-up models and research directions.
Demonstrates that high-quality VLM capabilities can be achieved with minimal training - just 5M parameters for vision-language alignment, training in hours instead of days.
Closes gap with GPT-4V on many benchmarks while maintaining complete transparency. Proves open-source can compete with the best closed-source systems.
The key innovation in modern VLMs is treating image patches as tokens that can be processed alongside text tokens in a unified transformer architecture. This enables seamless multimodal reasoning.
Explore how images become token sequences: See how a 224×224 image gets converted into hundreds of tokens that flow through the language model.
Different VLMs use different strategies to combine visual and textual information. The fusion method dramatically impacts model capabilities and computational efficiency.
The magic of VLMs lies in their attention mechanisms - how they learn to focus on relevant parts of images while processing text, and how visual information influences text generation.
Constitutional AI (CAI) is a training methodology that teaches AI systems to follow a set of principles (a "constitution") and self-correct their behavior. Instead of relying entirely on human feedback, models learn to critique and improve their own responses based on explicit moral and behavioral guidelines.
Model generates initial response to prompt using standard training
Model evaluates its own response against constitutional principles
Model generates improved response based on its self-critique
Model learns from its own improved responses via RL
| Aspect | Traditional RLHF | Constitutional AI |
|---|---|---|
| Feedback Source | Human preferences | AI self-evaluation + principles |
| Scalability | Limited by human bandwidth | Scales with model capability |
| Consistency | Variable human judgment | Consistent principle application |
| Transparency | Opaque human preferences | Explicit written principles |
| Cost | High (human labor) | Lower (automated) |
Applying Constitutional AI to VLMs introduces multimodal challenges that don't exist in text-only systems. The model must evaluate both visual content and its textual responses about that content.
class VLMConstitutionalTrainer:
    """Constitutional-AI training harness for a VLM.

    Implements the generate -> self-critique -> revise -> reward loop: the
    model answers, critiques its own answer against written principles along
    four dimensions, revises it, and the revision is scored for RL training.

    NOTE(review): the `_build_*` factory methods called in __init__, the
    `_critique_privacy` method, and the `self.safety_judge` attribute used in
    calculate_constitutional_reward are not defined anywhere in this file --
    treat this class as illustrative pseudo-code, not runnable code.
    """

    def __init__(self, vlm_model, constitution):
        self.vlm = vlm_model
        self.constitution = constitution
        # Visual safety classifiers
        self.visual_safety_classifier = self._build_safety_classifier()
        self.privacy_detector = self._build_privacy_detector()
        # Constitutional reward models
        self.visual_accuracy_judge = self._build_accuracy_judge()
        self.helpfulness_judge = self._build_helpfulness_judge()

    def constitutional_training_step(self, image, prompt):
        """Single constitutional training step; returns every stage so the
        caller can train on the revised response and its reward."""
        # 1. Generate initial response
        initial_response = self.vlm.generate(image, prompt)
        # 2. Multi-dimensional constitutional critique
        critique = self.comprehensive_critique(image, prompt, initial_response)
        # 3. Self-revision based on critique
        revision_prompt = f"""
Based on this critique of my response:
{critique}
Please provide an improved response to: {prompt}
[Image shown]
Focus on being more accurate, helpful, and safe.
"""
        revised_response = self.vlm.generate(image, revision_prompt)
        # 4. Constitutional reward calculation (scores the *revised* answer)
        reward = self.calculate_constitutional_reward(
            image, prompt, revised_response
        )
        return {
            'initial': initial_response,
            'critique': critique,
            'revised': revised_response,
            'reward': reward
        }

    def comprehensive_critique(self, image, prompt, response):
        """Multi-faceted constitutional critique for VLMs (four dimensions)."""
        # Visual accuracy critique
        visual_critique = self._critique_visual_accuracy(image, response)
        # Safety critique
        safety_critique = self._critique_safety(image, prompt, response)
        # Helpfulness critique (text-only; no image needed)
        help_critique = self._critique_helpfulness(prompt, response)
        # Privacy critique
        # NOTE(review): `_critique_privacy` is not defined in this file.
        privacy_critique = self._critique_privacy(image, response)
        return {
            'visual_accuracy': visual_critique,
            'safety': safety_critique,
            'helpfulness': help_critique,
            'privacy': privacy_critique
        }

    def _critique_visual_accuracy(self, image, response):
        """Ask the VLM itself to audit the factual accuracy of its answer."""
        accuracy_prompt = f"""
Constitutional Principle: "Describe only what you can actually see"
My response: {response}
Critique questions:
1. Did I describe anything not visible in the image?
2. Did I miss important visual details?
3. Did I make unsupported inferences?
4. How can I improve factual accuracy?
"""
        return self.vlm.generate(image, accuracy_prompt)

    def _critique_safety(self, image, prompt, response):
        """Ask the VLM to evaluate safety implications of its own answer."""
        safety_prompt = f"""
Constitutional Principles:
- "Avoid content that could cause harm"
- "Don't identify private individuals"
- "Don't provide medical diagnoses"
Original question: {prompt}
My response: {response}
Safety evaluation:
1. Could my response cause harm if followed?
2. Did I identify someone who should remain private?
3. Did I provide medical advice inappropriately?
4. How can I be more responsible?
"""
        return self.vlm.generate(image, safety_prompt)

    def _critique_helpfulness(self, prompt, response):
        """Evaluate educational value and helpfulness (text-only critique)."""
        help_prompt = f"""
Constitutional Principle: "Be maximally helpful and educational"
User question: {prompt}
My response: {response}
Helpfulness evaluation:
1. Did I directly address the user's question?
2. Could I provide more educational context?
3. Is my response at the appropriate detail level?
4. How can I be more useful?
"""
        return self.vlm.generate(None, help_prompt)  # Text-only critique

    def calculate_constitutional_reward(self, image, prompt, response):
        """Weighted combination of per-dimension judge scores."""
        # Component rewards
        accuracy_reward = self.visual_accuracy_judge(image, response)
        # NOTE(review): `self.safety_judge` is never assigned (__init__ only
        # builds `visual_safety_classifier`) -- this line would raise
        # AttributeError if executed.
        safety_reward = self.safety_judge(image, prompt, response)
        helpfulness_reward = self.helpfulness_judge(prompt, response)
        # Weighted combination (adjust weights based on application)
        total_reward = (
            0.4 * accuracy_reward +      # Visual accuracy is critical
            0.35 * safety_reward +       # Safety is paramount
            0.25 * helpfulness_reward    # Helpfulness completes the picture
        )
        return total_reward
# Example usage in training loop
# NOTE(review): `CLAUDE_CONSTITUTION`, `vlm_model`, `num_epochs`,
# `constitutional_training_data`, `compute_loss`, `target_response`, and
# `lambda_constitutional` are assumed to come from the surrounding training
# script -- none are defined in this file.
trainer = VLMConstitutionalTrainer(vlm_model, CLAUDE_CONSTITUTION)
for epoch in range(num_epochs):
    for batch in constitutional_training_data:
        for image, prompt in batch:
            result = trainer.constitutional_training_step(image, prompt)
            # Train model on constitutionally-improved responses
            loss = compute_loss(result['revised'], target_response)
            # Subtracting the scaled reward makes constitutionally better
            # responses lower the effective loss.
            total_loss = loss - lambda_constitutional * result['reward']
            total_loss.backward()
Vision-Language Models face unique constitutional challenges that don't exist in text-only systems. These require specialized principles and training approaches.
See how Constitutional AI helps VLMs self-correct problematic responses in real-time.
class ConstitutionalVLM(nn.Module):
    """Vision-Language Model wrapper that applies Constitutional AI at
    inference time: safety-gate the image, generate, self-critique, and
    revise only when the critique demands it and the revision scores higher.

    NOTE(review): `ConstitutionalJudge` is not defined in this file, and
    `constitution_config` is accessed attribute-style (.safety_threshold,
    .principles) although the example CLAUDE_VISUAL_CONSTITUTION below is a
    plain dict -- an adapter/config object is required for this to run.
    """

    def __init__(self, base_vlm, constitution_config):
        super().__init__()
        self.base_vlm = base_vlm
        self.constitution = constitution_config
        # Constitutional components
        self.visual_safety_classifier = VisualSafetyClassifier()
        self.constitutional_judge = ConstitutionalJudge(constitution_config)
        self.self_critique_module = SelfCritiqueModule()

    def forward_with_constitution(self, image, prompt):
        """Generate a response with constitutional self-correction."""
        # Step 1: Initial safety check (classifier returns P(safe); refuse
        # below the configured threshold).
        safety_score = self.visual_safety_classifier(image)
        if safety_score < self.constitution.safety_threshold:
            return "I can't analyze this image due to safety guidelines."
        # Step 2: Generate initial response
        initial_response = self.base_vlm.generate(image, prompt)
        # Step 3: Constitutional self-critique
        critique = self.self_critique_module(
            image, prompt, initial_response, self.constitution
        )
        # Step 4: Revise only if asked to, and keep the revision only when
        # the judge scores it above the critique's initial score.
        if critique.needs_revision:
            # Generate constitutionally-improved response
            revised_response = self.generate_revision(
                image, prompt, initial_response, critique
            )
            # Verify improvement
            improvement_score = self.constitutional_judge(
                image, prompt, revised_response
            )
            if improvement_score > critique.initial_score:
                return revised_response
        return initial_response

    def generate_revision(self, image, prompt, initial_response, critique):
        """Ask the base VLM to improve its answer given the critique."""
        revision_prompt = f"""
Constitutional Principles: {self.constitution.principles}
Original question: {prompt}
My initial response: {initial_response}
Constitutional critique: {critique.feedback}
Please provide an improved response that:
1. Addresses the critique points
2. Better follows constitutional principles
3. Maintains helpfulness while improving safety/accuracy
Improved response:
"""
        return self.base_vlm.generate(image, revision_prompt)
class VisualSafetyClassifier(nn.Module):
    """Classify visual content for safety using frozen CLIP image features.

    forward() returns, per image, the probability of the "safe" class from
    a 4-way head (safe, concerning, unsafe, unknown).
    """

    def __init__(self):
        super().__init__()
        # BUG FIX: `CLIPModel` was never imported in this file (only
        # CLIPVisionModel / CLIPImageProcessor are); import it locally so
        # this class is self-contained.
        from transformers import CLIPModel
        self.clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        # Safety classification head.
        # 512 = image-feature projection dim of clip-vit-base-patch32.
        self.safety_classifier = nn.Sequential(
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 4)  # safe, concerning, unsafe, unknown
        )

    def forward(self, image):
        """Return P(safe) for each image in the batch.

        Args:
            image: preprocessed pixel tensor accepted by CLIP (assumed
                (batch, 3, 224, 224) -- confirm with caller).
        """
        # Extract visual features; CLIP itself is treated as frozen.
        with torch.no_grad():
            image_features = self.clip_model.get_image_features(image)
        # Classify safety level
        safety_logits = self.safety_classifier(image_features)
        safety_probs = torch.softmax(safety_logits, dim=-1)
        return safety_probs[:, 0]  # probability of class index 0 = "safe"
class SelfCritiqueModule(nn.Module):
    """Generate constitutional critiques of VLM responses along three axes
    (visual accuracy, safety, helpfulness) and synthesize them.

    NOTE(review): `generate_critique` and `synthesize_critiques` are called
    below but never defined in this file -- illustrative pseudo-code.
    """

    def __init__(self):
        super().__init__()

    def forward(self, image, prompt, response, constitution):
        """Build per-aspect critique prompts, run them, synthesize the result."""
        critique_prompts = {
            'visual_accuracy': f"""
Constitutional Principle: "Describe only what you can see"
Image: [shown]
My response: {response}
Visual accuracy critique:
- What did I describe that isn't actually visible?
- What important details did I miss?
- Where did I make unsupported inferences?
- Rate accuracy: 1-10
""",
            'safety': f"""
Constitutional Principles: {constitution.safety_principles}
User question: {prompt}
Image: [shown]
My response: {response}
Safety critique:
- Could my response cause harm if followed?
- Did I identify someone inappropriately?
- Did I provide dangerous advice?
- Rate safety: 1-10
""",
            'helpfulness': f"""
Constitutional Principle: "Be maximally helpful and educational"
Question: {prompt}
My response: {response}
Helpfulness critique:
- Did I fully address the user's question?
- Could I provide more educational value?
- Is my response appropriately detailed?
- Rate helpfulness: 1-10
"""
        }
        critiques = {}
        for aspect, prompt_text in critique_prompts.items():
            if aspect == 'helpfulness':
                # Text-only critique for helpfulness (no image needed)
                critiques[aspect] = self.generate_critique(None, prompt_text)
            else:
                # Include image for visual critiques
                critiques[aspect] = self.generate_critique(image, prompt_text)
        return self.synthesize_critiques(critiques)
# Constitutional training configuration
# NOTE(review): ConstitutionalVLM reads `constitution.safety_threshold` /
# `.principles` attribute-style, but this is a plain dict -- wrap it in a
# config object (or switch to dict access) before passing it in.
CLAUDE_VISUAL_CONSTITUTION = {
    # Principles used by the safety critique
    'safety_principles': [
        "Protect individual privacy - don't identify people",
        "Avoid medical diagnoses or health advice from images",
        "Be culturally sensitive and avoid stereotyping",
        "Don't assist with potentially illegal activities"
    ],
    # Principles used by the visual-accuracy critique
    'accuracy_principles': [
        "Describe only what is clearly visible",
        "Express uncertainty about unclear visual elements",
        "Distinguish observation from inference",
        "Acknowledge limitations of visual analysis"
    ],
    # Principles used by the helpfulness critique
    'helpfulness_principles': [
        "Provide educational context when appropriate",
        "Explain visual concepts clearly",
        "Offer practical insights about what you observe",
        "Help users understand complex visual information"
    ]
}

# Usage example
# NOTE(review): `your_vlm_model`, `image`, and `user_prompt` must be supplied
# by the caller; they are not defined in this file.
constitutional_vlm = ConstitutionalVLM(
    base_vlm=your_vlm_model,
    constitution_config=CLAUDE_VISUAL_CONSTITUTION
)
# Generate constitutionally-aligned response
response = constitutional_vlm.forward_with_constitution(image, user_prompt)
import openai
import base64
import requests
from PIL import Image
import io
# OpenAI client. NOTE: prefer the OPENAI_API_KEY environment variable over a
# hard-coded literal in real deployments.
client = openai.OpenAI(api_key="your-api-key")
def encode_image(image_path):
    """Read an image file and return its contents base64-encoded as an
    ASCII string, ready for the GPT-4V data-URL payload."""
    with open(image_path, "rb") as fh:
        raw_bytes = fh.read()
    return base64.b64encode(raw_bytes).decode('utf-8')
def analyze_image_gpt4v(image_path, prompt="Describe this image in detail"):
    """
    Analyze image using GPT-4V

    Sends one user message containing the text prompt plus the image as a
    base64 data URL; returns the model's reply text.
    """
    encoded = encode_image(image_path)
    text_part = {"type": "text", "text": prompt}
    image_part = {
        "type": "image_url",
        "image_url": {
            "url": f"data:image/jpeg;base64,{encoded}",
            "detail": "high"  # "low" | "high" | "auto"
        }
    }
    completion = client.chat.completions.create(
        model="gpt-4-vision-preview",
        messages=[{"role": "user", "content": [text_part, image_part]}],
        max_tokens=1000,
        temperature=0.0  # Deterministic for analysis tasks
    )
    return completion.choices[0].message.content
# Advanced usage: Multiple images with conversation
def multi_image_conversation(image_paths, conversation_history):
    """
    Handle multi-image conversation with GPT-4V

    Continues an existing chat and appends one user turn that carries all
    the given images for comparison.
    """
    # Start from the prior turns, then add the new multi-image user turn.
    messages = list(conversation_history)
    content = [{"type": "text", "text": "Compare these images:"}]
    content.extend(
        {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{encode_image(path)}",
                "detail": "high"
            }
        }
        for path in image_paths
    )
    messages.append({"role": "user", "content": content})
    reply = client.chat.completions.create(
        model="gpt-4-vision-preview",
        messages=messages,
        max_tokens=1500
    )
    return reply.choices[0].message.content
# Specialized analysis functions

def extract_text_from_image(image_path):
    """OCR and document understanding via GPT-4V."""
    ocr_prompt = (
        "Extract all text from this image and format it properly. "
        "Maintain the original structure and layout."
    )
    return analyze_image_gpt4v(image_path, ocr_prompt)

def analyze_chart_or_graph(image_path):
    """Data visualization analysis via GPT-4V."""
    chart_prompt = (
        "Analyze this chart or graph. Describe the data trends, "
        "key insights, and any notable patterns. Provide specific "
        "numbers where visible."
    )
    return analyze_image_gpt4v(image_path, chart_prompt)

def generate_code_from_ui(image_path, framework="React"):
    """UI mockup to code generation via GPT-4V."""
    ui_prompt = (
        f"Generate {framework} code for this UI mockup. Include "
        f"proper styling and component structure. Make it responsive "
        f"and production-ready."
    )
    return analyze_image_gpt4v(image_path, ui_prompt)
# Example usage (requires a valid API key and the referenced local files)
if __name__ == "__main__":
    # Basic image analysis
    description = analyze_image_gpt4v("photo.jpg")
    print("Image description:", description)
    # OCR example
    extracted_text = extract_text_from_image("document.jpg")
    print("Extracted text:", extracted_text)
    # Multi-image comparison (system message seeds the assistant persona)
    comparison = multi_image_conversation(
        ["before.jpg", "after.jpg"],
        [{"role": "system", "content": "You are an expert analyst."}]
    )
    print("Comparison:", comparison)
import google.generativeai as genai
from PIL import Image
import io
# Configure API
# NOTE: prefer the GOOGLE_API_KEY environment variable over a hard-coded key.
genai.configure(api_key="your-google-api-key")
def analyze_image_gemini(image_path, prompt="Describe this image"):
    """
    Analyze image using Gemini Pro Vision

    Opens the image, sends it with the prompt, and returns the reply text.
    """
    vision_model = genai.GenerativeModel('gemini-pro-vision')
    img = Image.open(image_path)
    result = vision_model.generate_content([prompt, img])
    return result.text
def analyze_video_gemini(video_path, prompt="Describe this video"):
    """
    Analyze video using Gemini (unique capability)

    Uploads the file, polls until server-side processing finishes, then
    queries Gemini 1.5 Pro about it.

    BUG FIX: the original called time.sleep() without importing `time`
    anywhere in this file (NameError at runtime).
    """
    import time  # local import: `time` is not imported at file level

    # Upload video file
    video_file = genai.upload_file(path=video_path)
    # Wait for processing (poll every 10 seconds)
    while video_file.state.name == "PROCESSING":
        time.sleep(10)
        video_file = genai.get_file(video_file.name)
    # Robustness: surface server-side processing failures instead of
    # silently querying a broken file handle.
    if video_file.state.name == "FAILED":
        raise RuntimeError(f"Video processing failed for {video_path}")
    model = genai.GenerativeModel('gemini-1.5-pro')
    response = model.generate_content([prompt, video_file])
    return response.text
def mathematical_reasoning_from_image(image_path):
    """
    Solve math problems from images (Gemini's strength)
    """
    math_prompt = """
Look at this mathematical problem in the image.
Solve it step by step, showing your work clearly.
If there are graphs or diagrams, interpret them as part of the solution.
"""
    return analyze_image_gemini(image_path, math_prompt)

def long_document_analysis(image_paths):
    """
    Analyze multi-page documents (up to 2M tokens context)
    """
    model = genai.GenerativeModel('gemini-1.5-pro')
    # Load every page image up front
    pages = [Image.open(path) for path in image_paths]
    doc_prompt = """
Analyze this multi-page document. Provide:
1. Summary of main topics
2. Key data points and statistics
3. Important conclusions or recommendations
4. Any action items or next steps mentioned
"""
    response = model.generate_content([doc_prompt] + pages)
    return response.text

def code_generation_from_mockup(image_path, framework="HTML/CSS"):
    """
    Generate code from UI mockups
    """
    mockup_prompt = f"""
Generate clean, production-ready {framework} code for this UI mockup.
Requirements:
- Responsive design
- Modern styling
- Accessible markup
- Clean, commented code
- Include hover states and interactions where appropriate
"""
    return analyze_image_gemini(image_path, mockup_prompt)
# Advanced: Streaming responses for long analysis
def stream_analysis(image_path, prompt):
    """
    Stream response for real-time feedback

    Prints each chunk as it arrives and returns the concatenated text.
    """
    model = genai.GenerativeModel('gemini-pro-vision')
    img = Image.open(image_path)
    stream = model.generate_content([prompt, img], stream=True)
    collected = []
    for part in stream:
        if part.text:
            print(part.text, end='')
            collected.append(part.text)
    return "".join(collected)
# Safety settings for content filtering
def safe_image_analysis(image_path, prompt):
    """
    Analysis with safety controls

    Applies BLOCK_MEDIUM_AND_ABOVE thresholds to the four standard Gemini
    harm categories before generating.
    """
    blocked_categories = (
        "HARM_CATEGORY_HARASSMENT",
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
    )
    safety_settings = [
        {"category": category, "threshold": "BLOCK_MEDIUM_AND_ABOVE"}
        for category in blocked_categories
    ]
    model = genai.GenerativeModel('gemini-pro-vision')
    response = model.generate_content(
        [prompt, Image.open(image_path)],
        safety_settings=safety_settings
    )
    return response.text
# Example usage (requires a valid API key and the referenced local files)
if __name__ == "__main__":
    # Basic analysis
    result = analyze_image_gemini("chart.jpg", "Explain this business chart")
    print("Chart analysis:", result)
    # Math problem solving
    solution = mathematical_reasoning_from_image("math_problem.jpg")
    print("Solution:", solution)
    # Video analysis (unique to Gemini)
    video_summary = analyze_video_gemini("presentation.mp4",
        "Summarize the key points from this presentation")
    print("Video summary:", video_summary)
import anthropic
import base64
from typing import List, Dict

# Anthropic client. NOTE: prefer the ANTHROPIC_API_KEY environment variable
# over a hard-coded literal in production code.
client = anthropic.Anthropic(api_key="your-anthropic-api-key")
def encode_image_claude(image_path):
    """Return the file's bytes base64-encoded as an ASCII string, for use
    in a Claude API base64 image source block."""
    with open(image_path, "rb") as fh:
        return base64.b64encode(fh.read()).decode('utf-8')
def analyze_image_claude(image_path, prompt="Please describe this image"):
    """
    Analyze image using Claude 3 (Opus/Sonnet/Haiku)

    Encodes the file as base64, infers the media type from the extension
    (anything unrecognized is sent as JPEG, matching the original if/elif
    chain), and returns the model's text reply.
    """
    encoded = encode_image_claude(image_path)

    # Determine image type from the (lowercased) file extension.
    suffix_to_media = {
        '.png': "image/png",
        '.gif': "image/gif",
        '.webp': "image/webp",
    }
    lowered = image_path.lower()
    image_type = "image/jpeg"
    for suffix, media in suffix_to_media.items():
        if lowered.endswith(suffix):
            image_type = media
            break

    image_block = {
        "type": "image",
        "source": {
            "type": "base64",
            "media_type": image_type,
            "data": encoded
        }
    }
    text_block = {"type": "text", "text": prompt}
    response = client.messages.create(
        model="claude-3-opus-20240229",  # or "claude-3-sonnet-20240229"
        max_tokens=1500,
        temperature=0,
        messages=[{"role": "user", "content": [image_block, text_block]}]
    )
    return response.content[0].text
def educational_image_explanation(image_path, subject="general"):
"""
Generate educational explanations (Claude's strength)
"""
prompt = f"""
Please provide a detailed educational explanation of this image related to {subject}.
Structure your response as:
1. What you observe
2. Key concepts or principles demonstrated
3. Educational significance or applications
4. Questions this might help students explore
Make it suitable for learning and teaching.
"""
return analyze_image_claude(image_path, prompt)
def structured_data_extraction(image_path, format_type="JSON"):
    """Extract structured data (forms, tables, documents) from an image.

    The requested output format (*format_type*) is interpolated into the
    extraction prompt; the raw model text is returned.
    """
    return analyze_image_claude(image_path, f"""
Extract all structured information from this image and format it as {format_type}.
For tables: preserve row/column structure
For forms: capture field names and values
For documents: maintain hierarchical organization
Be precise and comprehensive.
""")
def creative_visual_analysis(image_path):
    """Creative and artistic analysis of an image (Claude's strength)."""
    return analyze_image_claude(image_path, """
Provide a thoughtful creative analysis of this image. Consider:
- Artistic elements (composition, color, lighting, mood)
- Emotional impact and atmosphere
- Symbolism or deeper meaning
- Cultural or historical context if relevant
- Technical aspects of the photography/artwork
Write in an engaging, insightful style that would be valuable
for art students or enthusiasts.
""")
def multi_step_reasoning(image_path, reasoning_type="logical"):
    """Run a complex step-by-step reasoning task over a single image.

    *reasoning_type* (e.g. "logical", "mathematical") is interpolated
    into the instruction prompt.
    """
    return analyze_image_claude(image_path, f"""
Analyze this image using step-by-step {reasoning_type} reasoning.
Please:
1. Observe and describe what you see
2. Identify relevant information for analysis
3. Apply logical reasoning step by step
4. Draw conclusions based on your analysis
5. Explain your reasoning process
Be thorough and show your work clearly.
""")
def safety_focused_analysis(image_path):
    """Analyze an image with safety considerations (Claude's constitutional training)."""
    return analyze_image_claude(image_path, """
Please analyze this image with attention to:
1. Content appropriateness and safety considerations
2. Factual accuracy of any information shown
3. Potential misinterpretations or biases
4. Ethical implications if relevant
Provide a balanced, thoughtful analysis that considers multiple perspectives.
""")
def conversation_with_images(image_paths: List[str], conversation_prompt: str):
    """
    Multi-image conversation with Claude.

    Args:
        image_paths: Paths of the images to include, in order.
        conversation_prompt: Text prompt sent after all of the images.

    Returns:
        Claude's text reply about the full image set.
    """
    content = []
    # Add all images
    for image_path in image_paths:
        base64_image = encode_image_claude(image_path)
        # Bug fix: previously only '.png' was detected here, so GIF and
        # WEBP files were sent with media_type "image/jpeg". Detection now
        # matches analyze_image_claude.
        image_type = "image/jpeg"
        lowered = image_path.lower()
        if lowered.endswith('.png'):
            image_type = "image/png"
        elif lowered.endswith('.gif'):
            image_type = "image/gif"
        elif lowered.endswith('.webp'):
            image_type = "image/webp"
        content.append({
            "type": "image",
            "source": {
                "type": "base64",
                "media_type": image_type,
                "data": base64_image
            }
        })
    # Add conversation prompt
    content.append({
        "type": "text",
        "text": conversation_prompt
    })
    response = client.messages.create(
        model="claude-3-opus-20240229",
        max_tokens=2000,
        messages=[{"role": "user", "content": content}]
    )
    return response.content[0].text
def document_qa_system(image_path, questions: List[str]):
    """
    Question-answering system for document images.

    Args:
        image_path: Path to the document image (PNG/JPEG/GIF/WEBP).
        questions: Questions to answer; they are numbered in the prompt.

    Returns:
        Claude's combined answer text for all questions.
    """
    base64_image = encode_image_claude(image_path)
    # Bug fix: media_type was hard-coded to "image/jpeg", so non-JPEG
    # documents were uploaded with a wrong content type. Detect it from
    # the extension like analyze_image_claude does.
    image_type = "image/jpeg"
    lowered = image_path.lower()
    if lowered.endswith('.png'):
        image_type = "image/png"
    elif lowered.endswith('.gif'):
        image_type = "image/gif"
    elif lowered.endswith('.webp'):
        image_type = "image/webp"
    questions_text = "\n".join([f"{i+1}. {q}" for i, q in enumerate(questions)])
    prompt = f"""
Please analyze this document image and answer the following questions:
{questions_text}
For each question, provide:
- A direct answer based on the document
- The specific location/section where you found the information
- Your confidence level in the answer
If information isn't available in the document, please state that clearly.
"""
    response = client.messages.create(
        model="claude-3-opus-20240229",
        max_tokens=2000,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": image_type,
                            "data": base64_image
                        }
                    },
                    {
                        "type": "text",
                        "text": prompt
                    }
                ]
            }
        ]
    )
    return response.content[0].text
# Example usage
if __name__ == "__main__":
    # Educational explanation
    edu_text = educational_image_explanation("diagram.jpg", "biology")
    print("Educational explanation:", edu_text)

    # Creative analysis
    art_text = creative_visual_analysis("artwork.jpg")
    print("Creative analysis:", art_text)

    # Multi-image conversation
    chart_diff = conversation_with_images(
        ["chart1.jpg", "chart2.jpg"],
        "Compare these two charts and highlight the key differences in trends."
    )
    print("Chart comparison:", chart_diff)

    # Document Q&A
    qa_text = document_qa_system("invoice.jpg", [
        "What is the total amount?",
        "When is the due date?",
        "What company issued this invoice?"
    ])
    print("Document Q&A:", qa_text)
| Feature | GPT-4V | Gemini Pro | Claude 3 Opus |
|---|---|---|---|
| Max Image Size | 20MB, 4096×4096 | 4MB, multiple formats | 5MB, up to 8000×8000 |
| Supported Formats | PNG, JPEG, WEBP, GIF | PNG, JPEG, WEBP, HEIC | PNG, JPEG, GIF, WEBP |
| Batch Processing | Multiple images per request | Multiple images + video | Multiple images per request |
| Context Length | 128K tokens | 2M tokens (Gemini 1.5) | 200K tokens |
| Pricing (per image) | $0.01 (detail=high) | $0.0025 | $0.0048 |
| Special Features | DALL-E integration | Video understanding | Constitutional AI safety |
GPT-4V deployed for radiology report generation, reducing reporting time by 60% while maintaining accuracy. Integration with PACS systems enables real-time image analysis.
Claude 3 powers automated textbook digitization, converting scanned pages to interactive digital content with 98% accuracy in mathematical notation recognition.
Gemini Pro Vision automates loan application processing, extracting and validating information from complex financial documents, reducing processing time from days to minutes.
GPT-4V enables "shop the look" functionality, allowing customers to upload photos and find similar products with 95% relevance accuracy across 10M+ product catalogs.
Training modern VLMs requires sophisticated instruction tuning that teaches models to follow visual instructions and provide helpful, accurate responses about images.
import torch
import torch.nn as nn
from transformers import (
LlamaForCausalLM,
LlamaTokenizer,
CLIPVisionModel,
CLIPImageProcessor,
Trainer,
TrainingArguments
)
from torch.utils.data import Dataset
from PIL import Image
import json
class VisionLanguageModel(nn.Module):
    """
    Custom Vision-Language Model combining CLIP vision encoder with LLaMA.

    A frozen CLIP ViT encodes images into patch embeddings, a two-layer MLP
    projects them into the LLaMA embedding space, and the projected patches
    are spliced into the token-embedding stream before the LLaMA forward pass.
    """
    def __init__(self, llm_model_name="meta-llama/Llama-2-7b-hf"):
        super().__init__()
        # Vision encoder (CLIP ViT) and its matching image preprocessor.
        self.vision_encoder = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
        self.vision_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
        # Language model
        self.language_model = LlamaForCausalLM.from_pretrained(llm_model_name)
        self.tokenizer = LlamaTokenizer.from_pretrained(llm_model_name)
        # Vision-language projection: map CLIP hidden size to LLaMA hidden size.
        vision_dim = self.vision_encoder.config.hidden_size  # 768 for CLIP ViT-Base
        llm_dim = self.language_model.config.hidden_size  # 4096 for LLaMA-7B
        self.vision_projection = nn.Sequential(
            nn.Linear(vision_dim, llm_dim),
            nn.GELU(),
            nn.Linear(llm_dim, llm_dim)
        )
        # Special tokens
        # NOTE(review): the additional special-token strings below are an
        # empty string and a single space — they look garbled (likely
        # '<image>'-style markers lost during extraction). Confirm the
        # intended token text before training; these must match the markers
        # used by the dataset formatting.
        self.tokenizer.add_special_tokens({
            'pad_token': '[PAD]',
            'additional_special_tokens': ['', ' ']
        })
        # Embedding table must grow to cover the newly added special tokens.
        self.language_model.resize_token_embeddings(len(self.tokenizer))
        # Freeze vision encoder initially (only projection + LLM train).
        for param in self.vision_encoder.parameters():
            param.requires_grad = False

    def encode_image(self, images):
        """Encode a batch of images to visual token embeddings.

        Returns projected patch embeddings of shape
        [batch, num_patches, llm_dim].
        """
        # no_grad: the CLIP encoder is frozen; gradients flow only through
        # the projection applied below.
        with torch.no_grad():
            vision_outputs = self.vision_encoder(images)
            image_embeddings = vision_outputs.last_hidden_state
        # Project to LLM dimension
        projected_embeddings = self.vision_projection(image_embeddings)
        return projected_embeddings

    def forward(self, input_ids, attention_mask, images=None, labels=None):
        """Forward pass with optional images spliced into the text embeddings."""
        batch_size, seq_len = input_ids.shape
        # Get text embeddings
        inputs_embeds = self.language_model.get_input_embeddings()(input_ids)
        if images is not None:
            # Encode images
            image_embeds = self.encode_image(images)  # [batch, num_patches, llm_dim]
            # Find image-marker token positions (see NOTE in __init__ about
            # the garbled marker string).
            image_token_id = self.tokenizer.convert_tokens_to_ids('')
            image_positions = (input_ids == image_token_id)
            # Replace marker tokens with actual image embeddings.
            # NOTE(review): only the FIRST patch embedding replaces a single
            # token position; the remaining patches are discarded. Confirm
            # this simplification is intentional (LLaVA-style models insert
            # all patch embeddings).
            for batch_idx in range(batch_size):
                img_positions = torch.where(image_positions[batch_idx])[0]
                if len(img_positions) > 0:
                    # Replace first image token position with image patches
                    pos = img_positions[0]
                    # Insert image patches at the position
                    inputs_embeds[batch_idx, pos:pos+1] = image_embeds[batch_idx, :1]
        # Forward through language model
        outputs = self.language_model(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            labels=labels
        )
        return outputs
class VisionLanguageDataset(Dataset):
    """Dataset for vision-language instruction tuning.

    Expects a JSON list of records with 'image_path', 'instruction' and
    'response' keys. Each item yields tokenized input ids, an attention
    mask, loss labels (instruction and padding positions masked with -100)
    and the preprocessed image tensor.
    """
    def __init__(self, data_path, tokenizer, vision_processor, max_length=2048):
        with open(data_path, 'r') as f:
            self.data = json.load(f)
        self.tokenizer = tokenizer
        self.vision_processor = vision_processor
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        # Load and process image
        image = Image.open(item['image_path']).convert('RGB')
        image_tensor = self.vision_processor(image, return_tensors='pt')['pixel_values'][0]
        # Create instruction-response format.
        # NOTE(review): the image-marker text surrounding the instruction
        # appears garbled (empty); it must match the special tokens added in
        # VisionLanguageModel.__init__ — confirm the intended markers.
        instruction = item['instruction']
        response = item['response']
        text = f" {instruction} {response}"
        # Tokenize
        encoding = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt'
        )
        # Create labels: train only on the response tokens.
        # NOTE(review): the length is computed with add_special_tokens=False
        # while the full encoding may prepend BOS — verify the mask boundary
        # is not off by one for your tokenizer.
        instruction_text = f" {instruction} "
        instruction_length = len(self.tokenizer.encode(instruction_text, add_special_tokens=False))
        labels = encoding['input_ids'].clone()
        labels[0, :instruction_length] = -100  # Ignore instruction tokens in loss
        # Bug fix: also ignore padding positions in the loss. Without this,
        # every padded position (attention_mask == 0) contributed to the
        # cross-entropy and the model was trained to emit pad tokens.
        labels[encoding['attention_mask'] == 0] = -100
        return {
            'input_ids': encoding['input_ids'][0],
            'attention_mask': encoding['attention_mask'][0],
            'labels': labels[0],
            'images': image_tensor
        }
def train_vlm(model, train_dataset, eval_dataset=None):
    """Fine-tune the Vision-Language Model with the HuggingFace Trainer.

    Trains for three epochs with gradient accumulation and mixed precision,
    saves the final checkpoint and returns the Trainer instance.
    """
    # Evaluation-dependent settings are derived once from the dataset's
    # truthiness (same check the inline conditionals performed before).
    has_eval = bool(eval_dataset)
    training_args = TrainingArguments(
        output_dir="./vlm-finetuned",
        per_device_train_batch_size=4,  # Adjust based on GPU memory
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=8,  # Effective batch size: 4*8=32
        num_train_epochs=3,
        learning_rate=2e-5,
        warmup_steps=500,
        weight_decay=0.01,
        logging_steps=50,
        save_steps=1000,
        eval_steps=1000,
        evaluation_strategy="steps" if has_eval else "no",
        save_total_limit=3,
        load_best_model_at_end=has_eval,
        metric_for_best_model="eval_loss" if has_eval else None,
        fp16=True,  # Mixed precision training
        dataloader_num_workers=4,
        remove_unused_columns=False,
        report_to="wandb",  # Optional: for experiment tracking
    )

    # Collate by stacking each per-example tensor field into a batch tensor.
    def data_collator(features):
        fields = ('input_ids', 'attention_mask', 'labels', 'images')
        return {name: torch.stack([ex[name] for ex in features]) for name in fields}

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
    )
    # Train, then persist the final model.
    trainer.train()
    trainer.save_model()
    return trainer
def inference_example(model, tokenizer, vision_processor, image_path, instruction):
    """Run inference with the trained model.

    Loads and preprocesses the image, builds the prompt, generates with the
    wrapped language model, and strips the echoed prompt from the decoded
    output before returning it.
    """
    model.eval()
    # Load and process image
    image = Image.open(image_path).convert('RGB')
    image_tensor = vision_processor(image, return_tensors='pt')['pixel_values']
    # Prepare input
    # NOTE(review): the image-marker text in this prompt appears garbled
    # (empty); it must match the dataset's formatting — confirm.
    text = f" {instruction} "
    input_ids = tokenizer.encode(text, return_tensors='pt')
    with torch.no_grad():
        # Generate response
        # NOTE(review): `images` is not a standard kwarg of
        # LlamaForCausalLM.generate — unless the model overrides
        # prepare_inputs_for_generation to consume it, the image tensor is
        # ignored (or raises). Verify generation actually conditions on the
        # image, e.g. by routing through the VLM wrapper instead.
        outputs = model.language_model.generate(
            input_ids=input_ids,
            images=image_tensor,
            max_length=512,
            do_sample=True,
            temperature=0.7,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )
    # Decode response, then drop the echoed prompt text.
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    response = response.replace(text.strip(), "").strip()
    return response
# Example usage
if __name__ == "__main__":
    # Build the model once; both datasets reuse its tokenizer/processor.
    vlm = VisionLanguageModel("meta-llama/Llama-2-7b-hf")

    train_ds = VisionLanguageDataset(
        "train_data.json",
        vlm.tokenizer,
        vlm.vision_processor
    )
    val_ds = VisionLanguageDataset(
        "eval_data.json",
        vlm.tokenizer,
        vlm.vision_processor
    )

    # Fine-tune
    trainer = train_vlm(vlm, train_ds, val_ds)

    # Example inference
    response = inference_example(
        vlm,
        vlm.tokenizer,
        vlm.vision_processor,
        "test_image.jpg",
        "What do you see in this image?"
    )
    print(f"Model response: {response}")