
Commit 04ccabf

Feature/llava next and onevision variants (#1202)
* Creating Architecture Adapters for Llava Next and Onevision variants
* Format checks
* Updated testing for older models with new fixes
* Improved comments, slight loosening of tolerances
1 parent 42a2d52 commit 04ccabf

11 files changed

Lines changed: 486 additions & 78 deletions


tests/integration/model_bridge/compatibility/test_legacy_hooks.py

Lines changed: 4 additions & 3 deletions
@@ -172,10 +172,11 @@ def test_cache_hook_equality_with_hooked_transformer(
     assert torch.allclose(
         hooked_transformer_activation[unmasked_positions],
         bridge_activation[unmasked_positions],
-        atol=1e-6,
-        rtol=1e-6,
+        atol=1e-4,
+        rtol=1e-4,
     ), (
-        "Unmasked attention scores should match within float32 " "numerical precision"
+        "Unmasked attention scores should match within float32 "
+        "cross-implementation tolerance"
     )

     masked_bridge_values = bridge_activation[masked_positions]
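For context on the loosened tolerances: torch.allclose passes when |a - b| <= atol + rtol * |b| holds elementwise, so raising both values from 1e-6 to 1e-4 widens the acceptance band by roughly two orders of magnitude. A minimal sketch with illustrative values:

import torch

a = torch.tensor([1.0, 2.0])
b = a + 5e-5  # a typical cross-implementation float32 discrepancy

# The per-element band is atol + rtol * |b|: ~3e-6 before, ~3e-4 after.
print(torch.allclose(a, b, atol=1e-6, rtol=1e-6))  # False
print(torch.allclose(a, b, atol=1e-4, rtol=1e-4))  # True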

transformer_lens/benchmarks/main_benchmark.py

Lines changed: 9 additions & 0 deletions
@@ -112,6 +112,8 @@ def _hf_token() -> Optional[str]:
     "Gemma3ForCausalLM",
     "Gemma3ForConditionalGeneration",
     "LlavaForConditionalGeneration",
+    "LlavaNextForConditionalGeneration",
+    "LlavaOnevisionForConditionalGeneration",
 ]

@@ -188,6 +190,8 @@ def _is_multimodal_model(model_name: str, trust_remote_code: bool = False) -> bool:
     """Check if a model is a multimodal (vision-language) model."""
     MULTIMODAL_ARCHITECTURES = [
         "LlavaForConditionalGeneration",
+        "LlavaNextForConditionalGeneration",
+        "LlavaOnevisionForConditionalGeneration",
         "Gemma3ForConditionalGeneration",
     ]
     try:
@@ -1117,6 +1121,11 @@ def cleanup_model(model, model_name_str: str):
         bridge_unprocessed = TransformerBridge.boot_transformers(model_name, device=device, dtype=bridge_dtype, trust_remote_code=trust_remote_code)  # type: ignore[attr-defined]
         if verbose:
             print("✓ TransformerBridge loaded (unprocessed)\n")
+        # Apply the adapter's prepare_model() to the HF reference model so
+        # both bridge and reference have the same fixups (e.g., weight tying).
+        # This keeps model-specific logic in the adapter, not the benchmark.
+        if hf_model is not None and hasattr(bridge_unprocessed, "adapter"):
+            bridge_unprocessed.adapter.prepare_model(hf_model)
     except Exception as e:
         import traceback
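The comparison this hunk protects, as a hedged sketch (calling conventions abbreviated; bridge_unprocessed and hf_model as in the hunk above):

# Sketch: the benchmark compares bridge logits against the HF reference.
# Without prepare_model() on hf_model, an untied lm_head would make this
# fail for affected OneVision checkpoints even when the bridge mapping is
# correct, which is why the fixup is mirrored onto the reference model.
bridge_logits = bridge_unprocessed(input_ids, return_type="logits")
hf_logits = hf_model(input_ids).logits
assert torch.allclose(bridge_logits, hf_logits, atol=1e-4, rtol=1e-4)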

transformer_lens/benchmarks/multimodal.py

Lines changed: 26 additions & 10 deletions
@@ -31,7 +31,9 @@ def _create_test_image():
 def _prepare_test_inputs(bridge: TransformerBridge):
     """Prepare multimodal test inputs using the bridge's processor.

-    Returns (input_ids, pixel_values, prompt) or (None, None, None) on failure.
+    Returns (input_ids, extra_kwargs, prompt) where extra_kwargs is a dict
+    containing pixel_values and any other processor outputs (e.g. image_sizes
+    for LlavaNext). Returns (None, None, None) on failure.
     """
     if bridge.processor is None:
         return None, None, None
@@ -51,8 +53,19 @@ def _prepare_test_inputs(bridge: TransformerBridge):
     try:
         inputs = bridge.processor(text=prompt, images=image, return_tensors="pt")
         input_ids = inputs["input_ids"].to(bridge.cfg.device)
-        pixel_values = inputs["pixel_values"].to(bridge.cfg.device)
-        return input_ids, pixel_values, prompt
+
+        # Collect all extra kwargs the model's forward() may need
+        # (pixel_values, image_sizes, pixel_attention_mask, etc.)
+        extra_kwargs = {}
+        for key, val in inputs.items():
+            if key == "input_ids":
+                continue
+            if hasattr(val, "to"):
+                extra_kwargs[key] = val.to(bridge.cfg.device)
+            else:
+                extra_kwargs[key] = val
+
+        return input_ids, extra_kwargs, prompt
     except Exception:
         return None, None, None

@@ -88,7 +101,7 @@ def benchmark_multimodal_forward(
             message="Skipped for tiny/test model",
         )

-    input_ids, pixel_values, prompt = _prepare_test_inputs(bridge)
+    input_ids, extra_kwargs, prompt = _prepare_test_inputs(bridge)
     if input_ids is None:
         return BenchmarkResult(
             name="multimodal_forward",
@@ -98,7 +111,7 @@ def benchmark_multimodal_forward(

     try:
         with torch.no_grad():
-            logits = bridge.forward(input_ids, pixel_values=pixel_values, return_type="logits")
+            logits = bridge.forward(input_ids, return_type="logits", **extra_kwargs)

         if logits is None:
             return BenchmarkResult(
@@ -120,14 +133,17 @@ def benchmark_multimodal_forward(
             passed=False,
         )

+    pixel_values = extra_kwargs.get("pixel_values")
     return BenchmarkResult(
         name="multimodal_forward",
         severity=BenchmarkSeverity.INFO,
         message=f"Multimodal forward pass successful, logits shape: {list(logits.shape)}",
         details={
             "logits_shape": list(logits.shape),
             "input_ids_shape": list(input_ids.shape),
-            "pixel_values_shape": list(pixel_values.shape),
+            "pixel_values_shape": list(pixel_values.shape)
+            if pixel_values is not None
+            else None,
         },
     )

@@ -173,7 +189,7 @@ def benchmark_multimodal_generation(
             message="Skipped for tiny/test model",
         )

-    input_ids, pixel_values, prompt = _prepare_test_inputs(bridge)
+    input_ids, extra_kwargs, prompt = _prepare_test_inputs(bridge)
     if input_ids is None:
         return BenchmarkResult(
             name="multimodal_generation",
@@ -185,8 +201,8 @@ def benchmark_multimodal_generation(
         output = bridge.generate(
             input_ids,
             max_new_tokens=max_new_tokens,
-            pixel_values=pixel_values,
             return_type="tokens",
+            **extra_kwargs,
         )

     if not isinstance(output, torch.Tensor):
@@ -264,7 +280,7 @@ def benchmark_multimodal_cache(
             message="Skipped for tiny/test model",
         )

-    input_ids, pixel_values, prompt = _prepare_test_inputs(bridge)
+    input_ids, extra_kwargs, prompt = _prepare_test_inputs(bridge)
     if input_ids is None:
         return BenchmarkResult(
             name="multimodal_cache",
@@ -274,7 +290,7 @@ def benchmark_multimodal_cache(

     try:
         with torch.no_grad():
-            logits, cache = bridge.run_with_cache(input_ids, pixel_values=pixel_values)
+            logits, cache = bridge.run_with_cache(input_ids, **extra_kwargs)

         if cache is None or len(cache) == 0:
             return BenchmarkResult(
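Condensed, the new input-preparation contract looks like this (a sketch; the processor's key set varies by architecture, and image_sizes appears only for LlavaNext-style processors):

# Sketch: forward everything the processor emits except input_ids, so each
# Llava variant receives whatever its forward() signature requires.
inputs = bridge.processor(text=prompt, images=image, return_tensors="pt")
input_ids = inputs["input_ids"].to(bridge.cfg.device)
extra_kwargs = {
    k: (v.to(bridge.cfg.device) if hasattr(v, "to") else v)
    for k, v in inputs.items()
    if k != "input_ids"
}
logits = bridge.forward(input_ids, return_type="logits", **extra_kwargs)
logits, cache = bridge.run_with_cache(input_ids, **extra_kwargs)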

transformer_lens/factories/architecture_adapter_factory.py

Lines changed: 4 additions & 0 deletions
@@ -18,6 +18,8 @@
     GPTOSSArchitectureAdapter,
     LlamaArchitectureAdapter,
     LlavaArchitectureAdapter,
+    LlavaNextArchitectureAdapter,
+    LlavaOnevisionArchitectureAdapter,
     MingptArchitectureAdapter,
     MistralArchitectureAdapter,
     MixtralArchitectureAdapter,
@@ -55,6 +57,8 @@
     "GPTJForCausalLM": GptjArchitectureAdapter,
     "LlamaForCausalLM": LlamaArchitectureAdapter,
     "LlavaForConditionalGeneration": LlavaArchitectureAdapter,
+    "LlavaNextForConditionalGeneration": LlavaNextArchitectureAdapter,
+    "LlavaOnevisionForConditionalGeneration": LlavaOnevisionArchitectureAdapter,
     "MixtralForCausalLM": MixtralArchitectureAdapter,
     "MistralForCausalLM": MistralArchitectureAdapter,
     "NeoForCausalLM": NeoArchitectureAdapter,

transformer_lens/model_bridge/bridge.py

Lines changed: 10 additions & 4 deletions
@@ -1805,6 +1805,7 @@ def generate(
         verbose: bool = True,
         output_logits: bool = False,
         pixel_values: Optional[torch.Tensor] = None,
+        **multimodal_kwargs,
     ) -> str | list[str] | torch.Tensor | Any:  # Any for transformers.utils.ModelOutput
         # Using Any due to beartype's forward reference resolution limitations.
         # See: https://github.com/beartype/beartype/issues/546
@@ -1920,10 +1921,15 @@ def generate(
                 )
             else:
                 forward_kwargs: Dict[str, Any] = {}
-                # Pass pixel_values only on the first step — the vision encoder
-                # processes the image once, embedding it into the token sequence.
-                if gen_step_idx == 0 and pixel_values is not None:
-                    forward_kwargs["pixel_values"] = pixel_values
+                # Pass multimodal inputs only on the first step — the vision
+                # encoder processes the image once, embedding it into the
+                # token sequence. This includes pixel_values plus any extra
+                # processor outputs (e.g. image_sizes for LlavaNext).
+                if gen_step_idx == 0:
+                    if pixel_values is not None:
+                        forward_kwargs["pixel_values"] = pixel_values
+                    if multimodal_kwargs:
+                        forward_kwargs.update(multimodal_kwargs)
                 logits = self(current_tokens, return_type="logits", **forward_kwargs)
                 final_logits = logits[:, -1, :]
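End to end, the **multimodal_kwargs path lets callers hand processor outputs straight to generate; a hedged usage sketch (prompt and image as in the benchmarks above):

# Sketch: everything the processor returns besides input_ids rides along as
# keyword arguments and is consumed only on generation step 0.
inputs = bridge.processor(text=prompt, images=image, return_tensors="pt")
extra = {k: v for k, v in inputs.items() if k != "input_ids"}
tokens = bridge.generate(
    inputs["input_ids"],
    max_new_tokens=20,
    return_type="tokens",
    **extra,  # pixel_values, plus image_sizes for LlavaNext-style processors
)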

transformer_lens/model_bridge/sources/transformers.py

Lines changed: 46 additions & 1 deletion
@@ -252,6 +252,8 @@ def get_hf_model_class_for_architecture(architecture: str):
     }
     multimodal_architectures = {
         "LlavaForConditionalGeneration",
+        "LlavaNextForConditionalGeneration",
+        "LlavaOnevisionForConditionalGeneration",
         "Gemma3ForConditionalGeneration",
     }
     if architecture in seq2seq_architectures:
@@ -453,7 +455,50 @@ def boot(
                 trust_remote_code=trust_remote_code,
             )
         except Exception:
-            pass  # Processor not available; user can set bridge.processor manually
+            # Some multimodal processors (e.g., LlavaOnevision) require
+            # torchvision for video processing. Conditionally install it
+            # and retry the processor loading.
+            _torchvision_available = False
+            try:
+                import torchvision  # noqa: F401
+
+                _torchvision_available = True
+            except Exception:
+                # torchvision may be missing (ImportError) or broken/version-
+                # mismatched (RuntimeError). Try to install/reinstall it.
+                import shutil
+                import subprocess
+                import sys
+
+                try:
+                    if shutil.which("uv"):
+                        subprocess.check_call(
+                            ["uv", "pip", "install", "torchvision", "-q"],
+                        )
+                    else:
+                        subprocess.check_call(
+                            [sys.executable, "-m", "pip", "install", "torchvision", "-q"],
+                        )
+                    import importlib
+
+                    importlib.invalidate_caches()
+                    _torchvision_available = True
+                except Exception:
+                    pass  # torchvision install failed; processor will be unavailable
+
+            if _torchvision_available:
+                try:
+                    from transformers import AutoProcessor
+
+                    huggingface_token = os.environ.get("HF_TOKEN", "")
+                    token_arg = huggingface_token if len(huggingface_token) > 0 else None
+                    bridge.processor = AutoProcessor.from_pretrained(
+                        model_name,
+                        token=token_arg,
+                        trust_remote_code=trust_remote_code,
+                    )
+                except Exception:
+                    pass  # Processor not available; user can set bridge.processor manually

     return bridge
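If both the initial load and the torchvision retry fail, boot() leaves bridge.processor unset, and the manual fallback named in the final comment is a one-liner; a minimal sketch:

# Sketch: attach a processor by hand when boot() could not load one.
from transformers import AutoProcessor

bridge = TransformerBridge.boot_transformers(model_name)
if bridge.processor is None:
    bridge.processor = AutoProcessor.from_pretrained(model_name)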

transformer_lens/model_bridge/supported_architectures/__init__.py

Lines changed: 8 additions & 0 deletions
@@ -39,6 +39,12 @@
 from transformer_lens.model_bridge.supported_architectures.llava import (
     LlavaArchitectureAdapter,
 )
+from transformer_lens.model_bridge.supported_architectures.llava_next import (
+    LlavaNextArchitectureAdapter,
+)
+from transformer_lens.model_bridge.supported_architectures.llava_onevision import (
+    LlavaOnevisionArchitectureAdapter,
+)
 from transformer_lens.model_bridge.supported_architectures.mingpt import (
     MingptArchitectureAdapter,
 )
@@ -116,6 +122,8 @@
     "GptjArchitectureAdapter",
     "LlamaArchitectureAdapter",
     "LlavaArchitectureAdapter",
+    "LlavaNextArchitectureAdapter",
+    "LlavaOnevisionArchitectureAdapter",
     "MingptArchitectureAdapter",
     "MistralArchitectureAdapter",
     "MixtralArchitectureAdapter",
transformer_lens/model_bridge/supported_architectures/llava_next.py

Lines changed: 15 additions & 0 deletions

@@ -0,0 +1,15 @@
+"""LLaVA-NeXT architecture adapter.
+
+Same module hierarchy as base LLaVA; high-res tiling differences are
+handled internally by HuggingFace's forward().
+"""
+
+from transformer_lens.model_bridge.supported_architectures.llava import (
+    LlavaArchitectureAdapter,
+)
+
+
+class LlavaNextArchitectureAdapter(LlavaArchitectureAdapter):
+    """Architecture adapter for LLaVA-NeXT (1.6) models."""
+
+    pass
transformer_lens/model_bridge/supported_architectures/llava_onevision.py

Lines changed: 36 additions & 0 deletions

@@ -0,0 +1,36 @@
+"""LLaVA-OneVision architecture adapter.
+
+Same module hierarchy as base LLaVA; SigLIP encoder and Qwen2 backbone
+are handled dynamically by the base adapter and HuggingFace's forward().
+"""
+
+from typing import Any
+
+from transformer_lens.model_bridge.supported_architectures.llava import (
+    LlavaArchitectureAdapter,
+)
+
+
+class LlavaOnevisionArchitectureAdapter(LlavaArchitectureAdapter):
+    """Architecture adapter for LLaVA-OneVision models."""
+
+    def prepare_model(self, hf_model: Any) -> None:
+        """Fix weight tying when text_config and top-level config disagree.
+
+        Some checkpoints have tie_word_embeddings=True in text_config but False
+        at the top level, leaving lm_head randomly initialized.
+        """
+        if not hasattr(hf_model, "lm_head") or not hasattr(hf_model, "model"):
+            return
+        language_model = getattr(hf_model.model, "language_model", None)
+        if language_model is None:
+            return
+        embed = getattr(language_model, "embed_tokens", None)
+        if embed is None:
+            return
+
+        # Check if text config expects tied weights but top-level config doesn't
+        text_config = getattr(hf_model.config, "text_config", None)
+        if text_config is not None and getattr(text_config, "tie_word_embeddings", False):
+            if not getattr(hf_model.config, "tie_word_embeddings", True):
+                hf_model.lm_head.weight = embed.weight
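A quick way to verify the fixup on an affected checkpoint (a sketch; adapter and hf_model instantiation omitted, and the attribute path follows the method's own traversal):

# Sketch: after prepare_model, lm_head shares storage with embed_tokens on
# checkpoints where only text_config requested weight tying.
adapter.prepare_model(hf_model)
embed = hf_model.model.language_model.embed_tokens
assert hf_model.lm_head.weight.data_ptr() == embed.weight.data_ptr()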
