Building on Vision-Language Models, Vision-Language-Action (VLA) models represent the next evolutionary leap: AI systems that can not only see and understand the world, but can interact with it physically. These models bridge the gap between digital reasoning and real-world action, enabling robots to understand instructions, perceive their environment, and execute complex tasks.
While Vision-Language Models revolutionized how AI understands multimodal content, they remained confined to digital responses. VLA models break this barrier by adding a crucial third modality: actions. This enables AI to move from passive understanding to active interaction with the physical world.
Explore the journey from text-only AI to embodied intelligence:
The breakthrough insight of VLA models is treating robot actions as discrete tokens, just like words in language. This allows the same transformer architecture that powers ChatGPT to control robot movements with unprecedented generality and efficiency.
See how robot movements become discrete tokens:
import numpy as np
import torch
class UniformActionTokenizer:
    """Discretize continuous robot actions via uniform per-dimension binning.

    Each action dimension is split into `bins_per_dim` equal-width bins over
    `action_range`; encoding maps a continuous value to the index of its bin,
    and decoding maps a bin index back to that bin's center.
    """

    def __init__(self, action_dims=7, bins_per_dim=256, action_range=(-1, 1)):
        self.action_dims = action_dims
        self.bins_per_dim = bins_per_dim
        self.action_min, self.action_max = action_range
        # One shared edge array for every dimension (bins_per_dim bins -> +1 edges).
        self.bin_edges = np.linspace(self.action_min, self.action_max, bins_per_dim + 1)
        # Naive joint vocabulary: one token per combination of per-dim bins.
        self.vocab_size = bins_per_dim ** action_dims
        print(f"Warning: Vocab size = {self.vocab_size:,} tokens!")

    def tokenize(self, actions):
        """Map continuous actions [T, action_dims] to bin indices of the same shape."""
        clipped = np.clip(actions, self.action_min, self.action_max)
        per_dim = [
            np.clip(np.digitize(clipped[:, d], self.bin_edges) - 1,
                    0, self.bins_per_dim - 1)
            for d in range(self.action_dims)
        ]
        return np.array(per_dim).T  # Shape: [sequence_length, action_dims]

    def detokenize(self, tokens):
        """Map bin indices back to continuous actions (bin centers)."""
        half_width = (self.bin_edges[1] - self.bin_edges[0]) / 2
        centers = [self.bin_edges[tokens[:, d]] + half_width
                   for d in range((self.action_dims))]
        return np.array(centers).T

    def get_vocab_size(self):
        return self.vocab_size
# Example usage: demonstrates uniform binning and why its joint vocabulary explodes.
tokenizer = UniformActionTokenizer(action_dims=7, bins_per_dim=256)
# Sample robot trajectory
trajectory = np.random.uniform(-1, 1, (10, 7)) # 10 timesteps, 7-DOF robot
print("Original trajectory shape:", trajectory.shape)
# Tokenize: [10, 7] continuous actions -> [10, 7] per-dimension bin indices
tokens = tokenizer.tokenize(trajectory)
print("Tokenized shape:", tokens.shape)
print("Sample tokens:", tokens[0]) # First timestep tokens
# Detokenize: bin indices back to bin centers (lossy; worst-case error is
# half a bin width per dimension)
reconstructed = tokenizer.detokenize(tokens)
print("Reconstruction error:", np.mean(np.abs(trajectory - reconstructed)))
# Problem: Massive vocabulary size makes this impractical!
print(f"Vocabulary size: {tokenizer.get_vocab_size():,} tokens")
print("This would require a {:.1f}B parameter embedding layer!".format(
tokenizer.get_vocab_size() * 512 / 1e9 # Assuming 512D embeddings
))
import torch
import torch.nn as nn
import torch.nn.functional as F
class VectorQuantizer(nn.Module):
    """Quantize feature vectors against a learned codebook (VQ-VAE layer).

    Given [B, C, H, W] features, each spatial position's C-vector is snapped
    to its nearest codebook entry. Training uses the standard VQ-VAE codebook
    and commitment losses, with a straight-through estimator so gradients
    flow past the discrete argmin.
    """

    def __init__(self, num_embeddings, embedding_dim, commitment_cost=0.25):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.num_embeddings = num_embeddings
        self.commitment_cost = commitment_cost
        # Codebook initialized with small uniform values.
        self.embedding = nn.Embedding(num_embeddings, embedding_dim)
        self.embedding.weight.data.uniform_(-1 / num_embeddings, 1 / num_embeddings)

    def forward(self, inputs):
        # Channels-last so each spatial position is a single embedding vector.
        x = inputs.permute(0, 2, 3, 1).contiguous()
        original_shape = x.shape
        flat = x.view(-1, self.embedding_dim)

        # Squared Euclidean distance to every codebook row: |a|^2 + |b|^2 - 2ab.
        codebook = self.embedding.weight
        dist = (flat.pow(2).sum(dim=1, keepdim=True)
                + codebook.pow(2).sum(dim=1)
                - 2 * flat @ codebook.t())

        # Hard assignment to the nearest entry, expressed as one-hot rows.
        indices = dist.argmin(dim=1).unsqueeze(1)
        one_hot = torch.zeros(indices.shape[0], self.num_embeddings, device=x.device)
        one_hot.scatter_(1, indices, 1)

        # Look up the selected codebook vectors and restore the spatial layout.
        quantized = (one_hot @ codebook).view(original_shape)

        # Codebook loss pulls entries toward encoder outputs; commitment loss
        # keeps encoder outputs near their chosen entries.
        e_latent_loss = F.mse_loss(quantized.detach(), x)
        q_latent_loss = F.mse_loss(quantized, x.detach())
        loss = q_latent_loss + self.commitment_cost * e_latent_loss

        # Straight-through estimator: forward uses quantized values, backward
        # copies gradients to the encoder as if quantization were identity.
        quantized = x + (quantized - x).detach()

        # Perplexity measures how evenly the codebook is being used.
        usage = one_hot.mean(dim=0)
        perplexity = torch.exp(-torch.sum(usage * torch.log(usage + 1e-10)))

        return quantized.permute(0, 3, 1, 2).contiguous(), loss, perplexity, indices
class ActionVQVAE(nn.Module):
    """
    VQ-VAE specifically designed for robot action sequences

    Encodes each timestep's action into a latent vector, snaps it to a
    learned codebook entry via `VectorQuantizer`, and decodes back to a
    continuous action. Input/output shape: [B, T, action_dim].
    """
    def __init__(self, action_dim=7, hidden_dim=128, num_embeddings=8192):
        super().__init__()
        self.action_dim = action_dim
        # Encoder: per-timestep action -> latent space
        self.encoder = nn.Sequential(
            nn.Linear(action_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim)
        )
        # Vector quantizer: codebook of `num_embeddings` latent prototypes
        self.vq_layer = VectorQuantizer(num_embeddings, hidden_dim)
        # Decoder: quantized latent -> action
        self.decoder = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, action_dim)
        )
    def forward(self, actions):
        """
        Args:
            actions: [B, T, action_dim] batch of action sequences.
        Returns:
            dict with 'reconstructed' actions [B, T, action_dim], the VQ
            loss, codebook perplexity, chosen codebook indices, and the
            quantized latents [B, T, hidden_dim].
        """
        # Encode actions: [B, T, C]
        encoded = self.encoder(actions)
        # VectorQuantizer expects [B, C, H, W]; map the time axis to H and
        # use a singleton W. (BUGFIX: the previous unsqueeze/permute dance
        # produced a 5-D tensor, which crashed the VQ layer's 4-D permute
        # for any sequence length T > 1.)
        encoded = encoded.permute(0, 2, 1).unsqueeze(-1)  # [B, C, T, 1]
        quantized, vq_loss, perplexity, encoding_indices = self.vq_layer(encoded)
        # Back to [B, T, C] for the decoder
        quantized = quantized.squeeze(-1).permute(0, 2, 1)
        # Decode
        reconstructed = self.decoder(quantized)
        return {
            'reconstructed': reconstructed,
            'vq_loss': vq_loss,
            'perplexity': perplexity,
            'encoding_indices': encoding_indices,
            'quantized': quantized
        }
# Training the VQ-VAE
def train_action_vqvae(model, dataloader, num_epochs=100):
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
for epoch in range(num_epochs):
total_loss = 0
for batch_actions in dataloader:
optimizer.zero_grad()
outputs = model(batch_actions)
# Reconstruction loss
recon_loss = F.mse_loss(outputs['reconstructed'], batch_actions)
# Total loss
loss = recon_loss + outputs['vq_loss']
loss.backward()
optimizer.step()
total_loss += loss.item()
if epoch % 20 == 0:
print(f"Epoch {epoch}, Loss: {total_loss/len(dataloader):.4f}, "
f"Perplexity: {outputs['perplexity']:.2f}")
# Example usage for robot actions
action_vqvae = ActionVQVAE(action_dim=7, num_embeddings=8192)
# Create sample robot trajectory data
robot_trajectories = torch.randn(1000, 50, 7) # 1000 trajectories, 50 timesteps, 7-DOF
# NOTE(review): robot_trajectories is constructed but never fed to the model
# in this snippet -- presumably it would be batched and passed to
# train_action_vqvae; confirm against the surrounding tutorial.
# The VQ-VAE learns to represent actions with only 8192 tokens instead of 256^7!
print(f"Vocabulary size: {8192} tokens (vs {256**7:,} for binning)")
print("This makes transformer training feasible!")
import numpy as np
import torch
import torch.nn as nn
from scipy.fft import dct, idct
class FASTActionTokenizer:
    """
    FAST (Frequency-domain Action Sequence Tokenization)

    Compresses an action trajectory by keeping only its lowest
    `num_frequencies` DCT coefficients per dimension and quantizing each
    coefficient into 256 uniform bins — a tiny vocabulary that still
    supports high-frequency robot control.
    """

    def __init__(self, sequence_length=64, action_dim=7, num_frequencies=16):
        self.sequence_length = sequence_length
        self.action_dim = action_dim
        self.num_frequencies = num_frequencies
        # One token per retained coefficient per action dimension.
        self.tokens_per_sequence = num_frequencies * action_dim
        # Uniform 8-bit quantization grid over [-1, 1].
        self.freq_bins = np.linspace(-1, 1, 256)

    def encode_sequence(self, action_sequence):
        """
        DCT-compress and quantize one trajectory.
        Args:
            action_sequence: [sequence_length, action_dim] numpy array
        Returns:
            tokens: flat array of num_frequencies * action_dim token ids
        """
        # Orthonormal DCT along time for every action dimension.
        coeffs = dct(action_sequence, axis=0, norm='ortho')
        # Low-pass: retain only the leading frequency components.
        kept = coeffs[:self.num_frequencies, :]
        # Quantize each retained coefficient to its bin index.
        token_ids = np.digitize(kept.flatten(), self.freq_bins) - 1
        return np.clip(token_ids, 0, len(self.freq_bins) - 1)

    def decode_sequence(self, tokens):
        """
        Invert `encode_sequence`.
        Args:
            tokens: flat array of num_frequencies * action_dim token ids
        Returns:
            action_sequence: [sequence_length, action_dim] numpy array
        """
        # Bin index -> representative coefficient value.
        coeffs = self.freq_bins[tokens].reshape(self.num_frequencies, self.action_dim)
        # Zero-pad the discarded high frequencies.
        padded = np.zeros((self.sequence_length, self.action_dim))
        padded[:self.num_frequencies, :] = coeffs
        # Orthonormal inverse DCT recovers the time-domain trajectory.
        return idct(padded, axis=0, norm='ortho')

    def get_vocab_size(self):
        # Only the 256 quantization bins, regardless of DOF or horizon.
        return len(self.freq_bins)
class FASTActionModel(nn.Module):
    """
    Transformer model using FAST tokenization for robot control

    A plain encoder stack over frequency-coefficient tokens: embed the
    token ids, run them through `num_layers` self-attention layers, and
    predict a distribution over the token vocabulary at each position.
    """

    def __init__(self, vocab_size=256, hidden_dim=512, num_layers=8, num_heads=8):
        super().__init__()
        # Embedding table for the quantized frequency coefficients.
        self.token_embedding = nn.Embedding(vocab_size, hidden_dim)
        # Encoder stack built from identical self-attention layers.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_dim,
            nhead=num_heads,
            dim_feedforward=hidden_dim * 4,
            dropout=0.1,
            batch_first=True,
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        # Per-position logits over the frequency-token vocabulary.
        self.output_proj = nn.Linear(hidden_dim, vocab_size)

    def forward(self, input_tokens):
        """Map [B, T] token ids to [B, T, vocab_size] logits."""
        hidden = self.transformer(self.token_embedding(input_tokens))
        return self.output_proj(hidden)
# Example: High-frequency robot control with FAST
def demonstrate_fast_control():
    """Round-trip a smooth 7-DOF trajectory through the FAST tokenizer."""
    fast_tok = FASTActionTokenizer(
        sequence_length=64,    # 64 timesteps at 200Hz = 0.32 seconds
        action_dim=7,          # 7-DOF robot arm
        num_frequencies=16     # Keep top 16 frequency components
    )
    print(f"Vocabulary size: {fast_tok.get_vocab_size()} tokens")
    print(f"Tokens per sequence: {fast_tok.tokens_per_sequence}")

    # Smooth sinusoidal joint trajectories, one frequency per joint.
    t = np.linspace(0, 2*np.pi, 64)
    trajectory = np.zeros((64, 7))
    for joint, freq in enumerate(0.5 + 0.1 * np.arange(7)):
        trajectory[:, joint] = 0.5 * np.sin(freq * t)
    print("Original trajectory shape:", trajectory.shape)

    # Tokenize using FAST
    token_ids = fast_tok.encode_sequence(trajectory)
    print("Encoded tokens shape:", token_ids.shape)
    print("Sample tokens:", token_ids[:10])

    # Decode back to actions
    decoded = fast_tok.decode_sequence(token_ids)
    print("Reconstructed shape:", decoded.shape)

    # Measure reconstruction quality
    mse_error = np.mean((trajectory - decoded) ** 2)
    print(f"Reconstruction MSE: {mse_error:.6f}")
    return fast_tok, trajectory, decoded
# Demonstrate FAST advantages
tokenizer, original, reconstructed = demonstrate_fast_control()

# NOTE(review): the emoji in these banners were mangled by a bad encoding
# pass ("๐", "โ") and the string literals were split mid-string across
# lines, which is a SyntaxError; restored single-line prints with the
# plausible original emoji (🚀 / ✅) — confirm against the source article.
print("\n🚀 FAST Tokenization Advantages:")
print(f"✅ Compact: Only {tokenizer.tokens_per_sequence} tokens per trajectory")
print("✅ High-frequency: Supports 200Hz control (vs 10-20Hz for other methods)")
print("✅ Smooth: Preserves trajectory smoothness through frequency domain")
print(f"✅ Efficient: {tokenizer.get_vocab_size()} vocab size vs millions for binning")

# Calculate compression ratio
original_size = original.size * 32                    # 32-bit floats
compressed_size = tokenizer.tokens_per_sequence * 8   # 8-bit tokens
compression_ratio = original_size / compressed_size
print(f"✅ Compression: {compression_ratio:.1f}x smaller than full trajectory")
In a striking comparison, an open 7B-parameter VLA can compete with (and sometimes surpass) much larger closed models—highlighting how **architecture, data curation, and tokenization** matter as much as raw parameter count.
import torch
import torch.nn as nn
from transformers import LlamaForCausalLM, LlamaTokenizer
import numpy as np
class OpenVLA(nn.Module):
    """
    Simplified, display-only sketch of an OpenVLA-style model.

    NOTE: This snippet illustrates structure. It will not run as-is without
    proper processors, image patching, and checkpoint access.
    """

    # Multimodal delimiter tokens added to the LLM vocabulary.
    # NOTE(review): the original token strings were lost during text
    # extraction (they appeared as empty/whitespace literals, leaving the
    # image and action markers indistinguishable); the names below are a
    # plausible reconstruction -- confirm against the original source.
    IMG_START, IMG_END = '<image>', '</image>'
    ACT_START, ACT_END = '<action>', '</action>'
    STATE_START, STATE_END = '<state>', '</state>'
    TASK_START, TASK_END = '<task>', '</task>'

    def __init__(self,
                 llama_model="meta-llama/Llama-2-7b-hf",
                 action_vocab_size=8192,
                 max_action_tokens=100):
        super().__init__()
        # Core language model (LLM backbone)
        self.llama = LlamaForCausalLM.from_pretrained(llama_model)
        self.tokenizer = LlamaTokenizer.from_pretrained(llama_model)
        # Vision encoders (DINOv2 + SigLIP fusion, placeholders)
        self.vision_encoder = self._build_vision_encoder()
        # Action tokenization bookkeeping
        self.action_vocab_size = action_vocab_size
        self.max_action_tokens = max_action_tokens
        # Extend tokenizer with special multimodal tokens and grow the
        # embedding matrix to match the enlarged vocabulary.
        special_tokens = {
            'additional_special_tokens': [
                self.IMG_START, self.IMG_END,
                self.ACT_START, self.ACT_END,
                self.STATE_START, self.STATE_END,
                self.TASK_START, self.TASK_END,
            ]
        }
        self.tokenizer.add_special_tokens(special_tokens)
        self.llama.resize_token_embeddings(len(self.tokenizer))
        # Action head for generating robot control tokens
        hidden_size = self.llama.config.hidden_size
        self.action_head = nn.Linear(hidden_size, action_vocab_size)

    def _build_vision_encoder(self):
        """Build fusion of DINOv2 and SigLIP vision encoders (placeholders)."""
        from transformers import AutoModel
        dinov2 = AutoModel.from_pretrained("facebook/dinov2-base")  # requires image processor in practice
        siglip = AutoModel.from_pretrained("google/siglip-base-patch16-224")
        # Project the concatenated features into the LLM hidden size.
        fusion_proj = nn.Linear(768 + 768, self.llama.config.hidden_size)
        return nn.ModuleDict({'dinov2': dinov2, 'siglip': siglip, 'fusion': fusion_proj})

    def encode_vision(self, images):
        """
        Encode images using DINOv2 + SigLIP fusion.

        Args:
            images: preprocessed image tensor batches.
        Returns:
            vision_tokens: [B, N_img, D] tokens in the LLM hidden size.
        """
        dinov2_features = self.vision_encoder.dinov2(images).last_hidden_state
        siglip_features = self.vision_encoder.siglip(images).last_hidden_state
        fused = torch.cat([dinov2_features, siglip_features], dim=-1)
        vision_tokens = self.vision_encoder.fusion(fused)  # [B, N_img, D]
        return vision_tokens

    def _embed_special(self, token_str, device):
        """Embed a single special-token string -> [1, 1, D] tensor."""
        tid = self.tokenizer.convert_tokens_to_ids(token_str)
        tid = torch.tensor([tid], device=device)
        return self.llama.get_input_embeddings()(tid)[None, :, :]  # [1, 1, D]

    def _build_input_sequence(self, vision_tokens, text_input_ids, action_history_embeds=None):
        """
        Concatenate: <image> + vision_tokens + </image> + text_embeds +
        optional <action> + history + </action>.
        Returns embeddings ready for the LLM forward.
        """
        device = vision_tokens.device
        text_embeds = self.llama.get_input_embeddings()(text_input_ids)  # [B, T_txt, D]
        img_start = self._embed_special(self.IMG_START, device)  # [1,1,D]
        img_end = self._embed_special(self.IMG_END, device)
        if action_history_embeds is not None:
            act_start = self._embed_special(self.ACT_START, device)
            act_end = self._embed_special(self.ACT_END, device)
        else:
            act_start = act_end = None
        B = text_embeds.size(0)
        # Tile special tokens to batch
        img_start = img_start.expand(B, -1, -1)  # [B,1,D]
        img_end = img_end.expand(B, -1, -1)      # [B,1,D]
        if act_start is not None:
            act_start = act_start.expand(B, -1, -1)
            act_end = act_end.expand(B, -1, -1)
        seq_parts = [img_start, vision_tokens, img_end, text_embeds]
        if action_history_embeds is not None:
            seq_parts.extend([act_start, action_history_embeds, act_end])
        return torch.cat(seq_parts, dim=1)  # [B, T_total, D]

    def forward(self, images, text_input, action_history=None):
        """
        Forward pass for VLA inference (display-only).

        Args:
            images: preprocessed image batch on the target device.
            text_input: iterable of instruction strings.
            action_history: optional [B, T_act] token ids of past actions.
        Returns:
            action_logits: [B, T_total, action_vocab_size].
        """
        device = images.device
        B = images.size(0)
        # Encode vision
        vision_tokens = self.encode_vision(images)  # [B, N_vis, D]
        # Tokenize text
        tok = self.tokenizer(list(text_input), return_tensors='pt', padding=True, truncation=True, max_length=512)
        text_input_ids = tok.input_ids.to(device)  # [B, T_txt]
        # Optional: embed previous action tokens (if provided)
        action_history_embeds = None
        if action_history is not None:
            # Assume action_history already token ids shaped [B, T_act]
            action_history_embeds = self.llama.get_input_embeddings()(action_history.to(device))
        # Build multimodal sequence
        inputs_embeds = self._build_input_sequence(vision_tokens, text_input_ids, action_history_embeds)
        # Run LLM. BUGFIX: CausalLM outputs expose hidden states via
        # `hidden_states` (when requested), not `last_hidden_state`.
        out = self.llama(inputs_embeds=inputs_embeds, use_cache=False, output_hidden_states=True)
        hidden = out.hidden_states[-1]
        # Project to action vocab
        action_logits = self.action_head(hidden)  # [B, T_total, V_act]
        return action_logits
Google's RT-2 (Robotics Transformer 2) established the VLA paradigm by treating robot actions as text tokens. Built on PaLM-E foundations, RT-2 demonstrated that large multimodal models could generalize across robot embodiments and tasks.
RT-2 treats robot actions as natural language strings:
OpenVLA demonstrates that open source approaches can match or exceed proprietary systems through careful architecture design, efficient tokenization, and community-driven data curation.
class CrossEmbodimentAdapter(nn.Module):
    """
    Lightweight adapter for different robot morphologies
    Enables one model to control many robot types

    A LoRA-style low-rank head that maps transformer hidden states to
    per-joint action means and uncertainties. `adapter_up` is zero-
    initialized, so a fresh adapter predicts zero actions and a constant
    uncertainty of softplus(0) until trained.
    """
    def __init__(self, hidden_dim=4096, robot_dof=7, adapter_rank=64):
        super().__init__()
        self.robot_dof = robot_dof
        # Low-rank adaptation for robot-specific control
        self.adapter_down = nn.Linear(hidden_dim, adapter_rank, bias=False)
        self.adapter_up = nn.Linear(adapter_rank, robot_dof * 2, bias=False)  # mean + std
        # Robot-specific scale/offset applied to predicted action means
        self.action_mean = nn.Parameter(torch.zeros(robot_dof))
        self.action_std = nn.Parameter(torch.ones(robot_dof))
        # Small init down, zero init up => identity-safe starting point
        nn.init.normal_(self.adapter_down.weight, std=0.02)
        nn.init.zeros_(self.adapter_up.weight)
    def forward(self, hidden_states, robot_id=None):
        """
        Apply robot-specific adaptation to action predictions
        Args:
            hidden_states: [batch, seq, hidden_dim] transformer outputs
            robot_id: Optional robot identifier for multi-robot batches
        Returns:
            (actions, uncertainty), each flattened to [batch*seq, robot_dof].
        """
        # Low-rank adaptation
        adapter_output = self.adapter_up(self.adapter_down(hidden_states))
        action_params = adapter_output.view(-1, self.robot_dof, 2)  # [B*T, DOF, 2]
        raw_actions = action_params[..., 0]  # Mean predictions
        # BUGFIX: `torch.softplus` is not a public torch attribute and raises
        # AttributeError; use torch.nn.functional.softplus instead.
        action_uncertainty = F.softplus(action_params[..., 1])  # Std predictions
        # Robot-specific rescaling of the predicted means
        normalized_actions = raw_actions * self.action_std + self.action_mean
        return normalized_actions, action_uncertainty
class MultiRobotVLA(nn.Module):
    """OpenVLA with support for multiple robot embodiments.

    One frozen base VLA supplies shared multimodal representations; each
    robot type gets its own small `CrossEmbodimentAdapter`, which is the
    only part trained per platform.
    """

    def __init__(self, base_model_path, robot_configs):
        super().__init__()
        # Shared VLA backbone (frozen below).
        self.base_vla = OpenVLA.from_pretrained(base_model_path)
        # One lightweight adapter per robot morphology.
        self.robot_adapters = nn.ModuleDict({
            robot_name: CrossEmbodimentAdapter(
                hidden_dim=self.base_vla.config.hidden_size,
                robot_dof=config['dof'],
                adapter_rank=config.get('adapter_rank', 64),
            )
            for robot_name, config in robot_configs.items()
        })
        # Freeze the backbone so only adapter parameters receive gradients.
        for param in self.base_vla.parameters():
            param.requires_grad = False

    def forward(self, images, instructions, robot_type, action_history=None):
        """Return (actions, uncertainty) for one robot type."""
        # Shared multimodal representation from the frozen backbone.
        hidden_states = self.base_vla.forward_hidden(images, instructions, action_history)
        # Guard clause: reject robot types we have no adapter for.
        if robot_type not in self.robot_adapters:
            raise ValueError(f"Unknown robot type: {robot_type}")
        return self.robot_adapters[robot_type](hidden_states)
# Example: Training adapters for multiple robots
def train_cross_embodiment_adapters():
    """
    Train lightweight adapters for different robot types
    Enables rapid deployment to new robot platforms
    """
    # Per-robot configuration: joint count and workspace bounds.
    robot_configs = {
        'franka_panda': {'dof': 7, 'workspace': [0.3, 0.7, -0.3, 0.3, 0.0, 0.8]},
        'ur5': {'dof': 6, 'workspace': [0.2, 0.8, -0.4, 0.4, 0.0, 1.0]},
        'xarm7': {'dof': 7, 'workspace': [0.2, 0.7, -0.4, 0.4, 0.0, 0.9]},
        'mobile_manipulator': {'dof': 9, 'workspace': [0.0, 2.0, -1.0, 1.0, 0.0, 1.5]}  # 3 base + 6 arm
    }

    # Build the shared-backbone, per-robot-adapter model.
    model = MultiRobotVLA("openvla-7b", robot_configs)

    # Report how much of the model actually trains (adapters only).
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    print(f"Trainable parameters: {trainable:,} ({100*trainable/total:.2f}%)")
    print("Adapter training is 100-1000x more efficient than full fine-tuning!")
    return model
# Usage example
# Builds the multi-robot model and prints the trainable-parameter summary.
multi_robot_model = train_cross_embodiment_adapters()
Traditional robotics requires separate controllers for each robot type. VLA models enable cross-embodiment learning: training one foundation model that can control diverse robot morphologies through shared representations and adaptive interfaces.
See how one VLA model controls different robot types: