diff --git a/transformer_lens/benchmarks/audio.py b/transformer_lens/benchmarks/audio.py
new file mode 100644
index 000000000..7a2576f70
--- /dev/null
+++ b/transformer_lens/benchmarks/audio.py
@@ -0,0 +1,482 @@
+"""Audio benchmarks for TransformerBridge.
+
+Tests that audio encoder models (HuBERT, wav2vec2, etc.) correctly handle
+audio waveform inputs through forward(), run_with_cache(), and produce
+stable representations.
+"""
+
+from typing import List, Optional
+
+import torch
+
+from transformer_lens.benchmarks.utils import (
+    BenchmarkResult,
+    BenchmarkSeverity,
+    compare_tensors,
+    is_tiny_test_model,
+)
+from transformer_lens.model_bridge import TransformerBridge
+
+
+def benchmark_audio_forward(
+    bridge: TransformerBridge,
+    test_audio: torch.Tensor,
+    reference_model: Optional[torch.nn.Module] = None,
+) -> BenchmarkResult:
+    """Run one waveform through the bridge and validate the resulting tensor.
+
+    The tensor is taken from the bridge result (logits when present, otherwise
+    last_hidden_state) and rejected if it is empty or contains NaN/Inf. When a
+    reference model is supplied, the same field of its output is compared
+    against the bridge tensor via compare_tensors (atol=1e-3, rtol=3e-2);
+    otherwise an INFO result describing the output shape is returned.
+
+    Args:
+        bridge: TransformerBridge model to test
+        test_audio: Audio waveform tensor [batch, num_samples]
+        reference_model: Optional HF reference model for comparison
+
+    Returns:
+        BenchmarkResult named "audio_forward"; any exception is reported as ERROR.
+    """
+
+    def _danger(message: str) -> BenchmarkResult:
+        # Every validation failure shares the same name, severity and passed flag.
+        return BenchmarkResult(
+            name="audio_forward",
+            severity=BenchmarkSeverity.DANGER,
+            message=message,
+            passed=False,
+        )
+
+    try:
+        # return_type="logits": for audio encoders without logits the bridge
+        # returns the model output object rather than a tensor.
+        with torch.no_grad():
+            raw = bridge(test_audio, return_type="logits")
+
+        if isinstance(raw, torch.Tensor):
+            output_key, bridge_output = "logits", raw
+        elif getattr(raw, "logits", None) is not None:
+            output_key, bridge_output = "logits", raw.logits
+        elif hasattr(raw, "last_hidden_state"):
+            output_key, bridge_output = "last_hidden_state", raw.last_hidden_state
+        else:
+            return _danger(
+                "Bridge produced no recognizable output (no logits or last_hidden_state)"
+            )
+
+        # Reject degenerate tensors before any comparison.
+        if bridge_output.numel() == 0:
+            return _danger("Bridge output is empty")
+
+        has_nan = torch.isnan(bridge_output).any()
+        has_inf = torch.isinf(bridge_output).any()
+        if has_nan or has_inf:
+            return _danger("Bridge output contains NaN or Inf values")
+
+        # Nothing to compare against: report the output shape only.
+        if reference_model is None:
+            shape = bridge_output.shape
+            return BenchmarkResult(
+                name="audio_forward",
+                severity=BenchmarkSeverity.INFO,
+                message=f"Audio forward pass successful ({output_key} shape: {shape})",
+                details={"output_shape": str(shape), "output_key": output_key},
+            )
+
+        # output_key names the attribute to read from the reference output too.
+        with torch.no_grad():
+            ref_raw = reference_model(input_values=test_audio)
+            ref_output = getattr(ref_raw, output_key)
+
+        return compare_tensors(
+            bridge_output,
+            ref_output,
+            atol=1e-3,
+            rtol=3e-2,
+            name="audio_forward",
+        )
+
+    except Exception as e:
+        return BenchmarkResult(
+            name="audio_forward",
+            severity=BenchmarkSeverity.ERROR,
+            message=f"Audio forward pass failed: {str(e)}",
+            passed=False,
+        )
+
+
+def benchmark_audio_cache(
+    bridge: TransformerBridge,
+    test_audio: torch.Tensor,
+) -> BenchmarkResult:
+    """Benchmark run_with_cache() for audio models.
+
+    Verifies that critical audio hooks fire and that sampled cached tensors are finite.
+
+    Args:
+        bridge: TransformerBridge model to test
+        test_audio: Audio waveform tensor [batch, num_samples]
+    """
+    try:
+        with torch.no_grad():
+            _, cache = bridge.run_with_cache(test_audio)
+
+        cache_keys = list(cache.keys())
+        if not cache_keys:
+            return BenchmarkResult(
+                name="audio_cache",
+                severity=BenchmarkSeverity.DANGER,
+                message="run_with_cache returned empty cache",
+                passed=False,
+            )
+
+        # Check for critical audio-specific hooks
+        critical_hooks = [
+            "audio_feature_extractor.hook_out",
+            "conv_pos_embed.hook_out",
+            "embed_ln.hook_out",
+        ]
+        # First and last block; dict.fromkeys dedupes them for single-layer models
+        for block_idx in dict.fromkeys((0, bridge.cfg.n_layers - 1)):
+            critical_hooks.append(f"blocks.{block_idx}.hook_out")
+
+        missing = [h for h in critical_hooks if h not in cache_keys]
+        found = len(critical_hooks) - len(missing)
+
+        # Check for NaN/Inf in cached values
+        nan_hooks = []
+        for key in cache_keys[:20]:  # Sample first 20 hooks
+            val = cache[key]
+            if isinstance(val, torch.Tensor) and (torch.isnan(val).any() or torch.isinf(val).any()):
+                nan_hooks.append(key)
+
+        # NaN/Inf is the more severe finding, so it is reported ahead of missing hooks
+        if nan_hooks:
+            return BenchmarkResult(
+                name="audio_cache",
+                severity=BenchmarkSeverity.DANGER,
+                message=f"NaN/Inf found in {len(nan_hooks)} cached hooks",
+                passed=False,
+                details={"nan_hooks": nan_hooks[:5]},
+            )
+
+        if missing:
+            return BenchmarkResult(
+                name="audio_cache",
+                severity=BenchmarkSeverity.WARNING,
+                message=f"Missing {len(missing)} critical hooks: {missing[:3]}",
+                passed=found >= 3,  # Pass if at least 3 critical hooks are present
+                details={
+                    "total_cached": len(cache_keys),
+                    "critical_found": found,
+                    "critical_expected": len(critical_hooks),
+                    "missing": missing,
+                },
+            )
+
+        return BenchmarkResult(
+            name="audio_cache",
+            severity=BenchmarkSeverity.INFO,
+            message=f"Audio cache successful: {len(cache_keys)} hooks captured, "
+            f"{found}/{len(critical_hooks)} critical hooks present",
+            details={
+                "total_cached": len(cache_keys),
+                "critical_found": found,
+                "critical_expected": len(critical_hooks),
+            },
+        )
+
+    except Exception as e:
+        return BenchmarkResult(
+            name="audio_cache",
+            severity=BenchmarkSeverity.ERROR,
+            message=f"Audio cache failed: {str(e)}",
+            passed=False,
+        )
+
+
+def benchmark_audio_representation_stability(
+    bridge: TransformerBridge,
+    test_audio: torch.Tensor,
+) -> BenchmarkResult:
+    """Check that a small input perturbation barely moves the model output.
+
+    Gaussian noise scaled by 0.01 is added to the waveform, both versions are
+    run through the bridge, and the per-sample cosine similarity of the
+    flattened outputs is averaged; the check passes when that mean exceeds
+    0.95. Models flagged by is_tiny_test_model are skipped.
+
+    Args:
+        bridge: TransformerBridge model to test
+        test_audio: Audio waveform tensor [batch, num_samples]
+
+    Returns:
+        BenchmarkResult named "audio_representation_stability".
+    """
+    if is_tiny_test_model(getattr(bridge.cfg, "model_name", "")):
+        return BenchmarkResult(
+            name="audio_representation_stability",
+            severity=BenchmarkSeverity.SKIPPED,
+            message="Skipped for tiny-random model (random weights won't produce stable representations)",
+        )
+
+    def _as_tensor(out):
+        # Accepts a plain tensor, an object with last_hidden_state, or one with logits.
+        if isinstance(out, torch.Tensor):
+            return out
+        if hasattr(out, "last_hidden_state"):
+            return out.last_hidden_state
+        return getattr(out, "logits", None)
+
+    try:
+        perturbed_audio = test_audio + torch.randn_like(test_audio) * 0.01
+
+        with torch.no_grad():
+            clean_out = bridge(test_audio, return_type="logits")
+            noisy_out = bridge(perturbed_audio, return_type="logits")
+
+        clean_states = _as_tensor(clean_out)
+        noisy_states = _as_tensor(noisy_out)
+        if clean_states is None or noisy_states is None:
+            return BenchmarkResult(
+                name="audio_representation_stability",
+                severity=BenchmarkSeverity.WARNING,
+                message="Could not extract hidden states for stability check",
+                passed=False,
+            )
+
+        # One row per batch element: [batch, features].
+        clean_flat = clean_states.reshape(clean_states.shape[0], -1)
+        noisy_flat = noisy_states.reshape(noisy_states.shape[0], -1)
+        per_sample = torch.nn.functional.cosine_similarity(clean_flat, noisy_flat, dim=-1)
+        cosine_sim = per_sample.mean().item()
+
+        if cosine_sim > 0.95:
+            passed, severity = True, BenchmarkSeverity.INFO
+        else:
+            passed, severity = False, BenchmarkSeverity.WARNING
+        return BenchmarkResult(
+            name="audio_representation_stability",
+            severity=severity,
+            message=f"Representation stability: cosine_similarity={cosine_sim:.4f} "
+            f"(threshold: 0.95)",
+            passed=passed,
+            details={"cosine_similarity": cosine_sim, "noise_std": 0.01},
+        )
+
+    except Exception as e:
+        return BenchmarkResult(
+            name="audio_representation_stability",
+            severity=BenchmarkSeverity.ERROR,
+            message=f"Representation stability check failed: {str(e)}",
+            passed=False,
+        )
+
+
+def benchmark_audio_feature_extractor(
+    bridge: TransformerBridge,
+    test_audio: torch.Tensor,
+) -> BenchmarkResult:
+    """Validate the tensor cached at audio_feature_extractor.hook_out.
+
+    Runs the bridge with caching and requires the hook to be present, its
+    value to be a 3D tensor, and that tensor to be neither all zeros nor to
+    contain NaN/Inf. On success the shape, mean and std are reported.
+
+    Args:
+        bridge: TransformerBridge model to test
+        test_audio: Audio waveform tensor [batch, num_samples]
+
+    Returns:
+        BenchmarkResult named "audio_feature_extractor".
+    """
+    hook_key = "audio_feature_extractor.hook_out"
+
+    def _danger(message: str, **extra) -> BenchmarkResult:
+        # Shared constructor for failed checks; `extra` carries optional details.
+        return BenchmarkResult(
+            name="audio_feature_extractor",
+            severity=BenchmarkSeverity.DANGER,
+            message=message,
+            passed=False,
+            **extra,
+        )
+
+    try:
+        with torch.no_grad():
+            _, cache = bridge.run_with_cache(test_audio)
+
+        if hook_key not in cache:
+            return _danger(f"Hook '{hook_key}' not found in cache")
+
+        features = cache[hook_key]
+        # Expected layout: [batch, conv_dim, num_frames]
+        if features.dim() != 3:
+            return _danger(
+                f"Expected 3D tensor [batch, conv_dim, frames], got {features.dim()}D",
+                details={"shape": str(features.shape)},
+            )
+
+        # Label each degenerate condition; evaluation order matches the report order.
+        checks = (
+            ("all zeros", features.abs().max().item() == 0),
+            ("NaN", torch.isnan(features).any().item()),
+            ("Inf", torch.isinf(features).any().item()),
+        )
+        issues = [label for label, triggered in checks if triggered]
+        if issues:
+            return _danger(
+                f"Degenerate feature values: {', '.join(issues)}",
+                details={"shape": str(features.shape), "issues": issues},
+            )
+
+        mean = features.mean().item()
+        std = features.std().item()
+        return BenchmarkResult(
+            name="audio_feature_extractor",
+            severity=BenchmarkSeverity.INFO,
+            message=f"Feature extractor OK: shape={features.shape}, "
+            f"mean={mean:.4f}, std={std:.4f}",
+            details={
+                "shape": str(features.shape),
+                "mean": mean,
+                "std": std,
+            },
+        )
+
+    except Exception as e:
+        return BenchmarkResult(
+            name="audio_feature_extractor",
+            severity=BenchmarkSeverity.ERROR,
+            message=f"Feature extractor check failed: {str(e)}",
+            passed=False,
+        )
+
+
+def benchmark_audio_ctc_decode(
+    bridge: TransformerBridge,
+) -> BenchmarkResult:
+    """Benchmark CTC decoding for HubertForCTC models.
+
+    Loads one sample from librispeech_asr_dummy, decodes via greedy CTC (argmax
+    over logits), and reports decoded and reference text. Skipped for tiny-random
+    models, for outputs without logits, and when a required import is missing.
+
+    Args:
+        bridge: TransformerBridge model to test
+
+    Returns:
+        BenchmarkResult named "audio_ctc_decode"; other failures are reported as ERROR.
+    """
+    if is_tiny_test_model(getattr(bridge.cfg, "model_name", "")):
+        return BenchmarkResult(
+            name="audio_ctc_decode",
+            severity=BenchmarkSeverity.SKIPPED,
+            message="Skipped for tiny-random model (untrained CTC head)",
+        )
+
+    try:
+        from datasets import load_dataset
+
+        ds = load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy",
+            "clean",
+            split="validation",
+            trust_remote_code=True,  # NOTE(review): executes dataset-repo code
+        )
+        sample = ds[0]  # index once instead of fetching the row per field
+        reference_text = sample["text"]
+        waveform = torch.tensor(sample["audio"]["array"], dtype=torch.float32).unsqueeze(0)
+        waveform = waveform.to(bridge.cfg.device)
+
+        with torch.no_grad():
+            output = bridge(waveform, return_type=None)
+
+        if not hasattr(output, "logits") or output.logits is None:
+            return BenchmarkResult(
+                name="audio_ctc_decode",
+                severity=BenchmarkSeverity.SKIPPED,
+                message="Skipped: model output has no logits (bare encoder)",
+            )
+
+        # Greedy CTC decode; ids are mapped to text by the processor when one is attached
+        predicted_ids = torch.argmax(output.logits, dim=-1)
+        processor = getattr(bridge, "processor", None)
+        if processor is not None and hasattr(processor, "decode"):
+            decoded_text = processor.decode(predicted_ids[0])
+        elif processor is not None and hasattr(processor, "batch_decode"):
+            decoded_text = processor.batch_decode(predicted_ids)[0]
+        else:
+            decoded_text = str(predicted_ids[0].tolist()[:20]) + "..."
+
+        return BenchmarkResult(
+            name="audio_ctc_decode",
+            severity=BenchmarkSeverity.INFO,
+            message="CTC decode successful",
+            details={
+                "decoded_text": decoded_text[:200],
+                "reference_text": reference_text[:200],
+                "logits_shape": str(output.logits.shape),
+            },
+        )
+
+    except ImportError as e:
+        return BenchmarkResult(
+            name="audio_ctc_decode",
+            severity=BenchmarkSeverity.SKIPPED,
+            message=f"Skipped: 'datasets' or one of its dependencies is not available ({e})",
+        )
+    except Exception as e:
+        return BenchmarkResult(
+            name="audio_ctc_decode",
+            severity=BenchmarkSeverity.ERROR,
+            message=f"CTC decode failed: {str(e)}",
+            passed=False,
+        )
+
+
+def run_audio_benchmarks(
+    bridge: TransformerBridge,
+    test_audio: Optional[torch.Tensor] = None,
+    verbose: bool = True,
+) -> List[BenchmarkResult]:
+    """Execute the five audio benchmarks in order and collect their results.
+
+    Args:
+        bridge: TransformerBridge model to test
+        test_audio: Optional audio waveform tensor. When None, a random
+            [1, 16000] tensor is created with torch.randn on bridge.cfg.device
+            using bridge.cfg.dtype.
+        verbose: Whether to print a numbered heading before each benchmark
+
+    Returns:
+        List of BenchmarkResult objects, one per benchmark, in execution order
+    """
+    if test_audio is None:
+        # No waveform supplied: fall back to random noise on the model's device/dtype.
+        cfg = bridge.cfg
+        test_audio = torch.randn(1, 16000, device=cfg.device, dtype=cfg.dtype)
+
+    # Benchmarks that consume the waveform, each paired with the heading that
+    # is printed before it runs.
+    waveform_steps = (
+        ("1. Audio Forward Pass", benchmark_audio_forward),
+        ("2. Audio Cache Verification", benchmark_audio_cache),
+        ("3. Representation Stability", benchmark_audio_representation_stability),
+        ("4. Feature Extractor Verification", benchmark_audio_feature_extractor),
+    )
+
+    results = []
+    for heading, benchmark in waveform_steps:
+        if verbose:
+            print(heading)
+        results.append(benchmark(bridge, test_audio))
+
+    # The CTC benchmark is called with the bridge only.
+    if verbose:
+        print("5. CTC Decoding")
+    results.append(benchmark_audio_ctc_decode(bridge))
+
+    return results
diff --git a/transformer_lens/benchmarks/component_benchmark.py b/transformer_lens/benchmarks/component_benchmark.py
index 152b1c92b..3a0de96f4 100644
--- a/transformer_lens/benchmarks/component_benchmark.py
+++ b/transformer_lens/benchmarks/component_benchmark.py
@@ -53,6 +53,9 @@ def benchmark_all_components(
         skip_components = []
         if getattr(bridge.cfg, "is_multimodal", False):
             skip_components = ["vision_encoder", "vision_projector"]
+        if getattr(bridge.cfg, "is_audio_model", False):
+            # Audio preprocessing needs waveform input; validated in Phase 8
+            skip_components.extend(["audio_feature_extractor", "feat_proj", "conv_pos_embed"])
 
         # Run comprehensive benchmark
         report = benchmarker.benchmark_all_components(skip_components=skip_components)
diff --git a/transformer_lens/benchmarks/forward_pass.py b/transformer_lens/benchmarks/forward_pass.py
index 8532e95bc..a4940b2a0 100644
--- a/transformer_lens/benchmarks/forward_pass.py
+++ b/transformer_lens/benchmarks/forward_pass.py
@@ -39,7 +39,7 @@ def _get_decoder_input_ids(model: torch.nn.Module, batch_size: int = 1) -> torch
 
 def benchmark_forward_pass(
     bridge: TransformerBridge,
-    test_text: str,
+    test_input: Union[str, torch.Tensor],
     reference_model: Optional[Union[HookedTransformer, torch.nn.Module]] = None,
     reference_logits: Optional[torch.Tensor] = None,
     atol: float = 1e-3,
@@ -49,10 +49,10 @@ def benchmark_forward_pass(
 
     Args:
         bridge: TransformerBridge model to test
-        test_text: Input text for testing
+        test_input: Input text string or audio waveform tensor for testing
         reference_model: Optional reference model (HookedTransformer or HF model)
-        reference_logits: Optional pre-computed reference logits tensor (e.g., saved
-            from a prior HF forward pass to avoid needing both models in memory)
+        reference_logits: Optional pre-computed reference logits/hidden states tensor
+            (e.g., saved from a prior HF forward pass to avoid needing both models in memory)
         atol: Absolute tolerance for comparison
         rtol: Relative tolerance for comparison
 
@@ -60,13 +60,15 @@ def benchmark_forward_pass(
         BenchmarkResult with comparison details
     """
     try:
+        _is_audio = getattr(bridge.cfg, "is_audio_model", False)
+
         # Check if this is an encoder-decoder model
         is_enc_dec = _is_encoder_decoder(bridge.original_model)
 
         # Prepare extra kwargs for encoder-decoder models
         extra_kwargs = {}
-        if is_enc_dec:
-            tokens = bridge.to_tokens(test_text)
+        if is_enc_dec and isinstance(test_input, str):
+            tokens = bridge.to_tokens(test_input)
             batch_size = tokens.shape[0]
             decoder_input_ids = _get_decoder_input_ids(bridge.original_model, batch_size)
             decoder_input_ids = decoder_input_ids.to(tokens.device)
@@ -75,7 +77,19 @@ def benchmark_forward_pass(
         # Run bridge forward pass (use no_grad to match HF reference context —
         # MPS SDPA can produce different results with vs without gradient tracking)
         with torch.no_grad():
-            bridge_output = bridge(test_text, return_type="logits", **extra_kwargs)
+            if _is_audio and isinstance(test_input, torch.Tensor):
+                # Audio models: pass waveform, extract tensor from output
+                bridge_output_raw = bridge(test_input, return_type="logits")
+                if isinstance(bridge_output_raw, torch.Tensor):
+                    bridge_output = bridge_output_raw
+                elif hasattr(bridge_output_raw, "logits") and bridge_output_raw.logits is not None:
+                    bridge_output = bridge_output_raw.logits
+                elif hasattr(bridge_output_raw, "last_hidden_state"):
+                    bridge_output = bridge_output_raw.last_hidden_state
+                else:
+                    bridge_output = bridge_output_raw
+            else:
+                bridge_output = bridge(test_input, return_type="logits", **extra_kwargs)
 
         if reference_model is None and reference_logits is None:
             # No reference model or logits - just verify output shape and validity
@@ -106,12 +120,22 @@ def benchmark_forward_pass(
         if reference_logits is not None:
             reference_output = reference_logits.to(bridge_output.device)
         elif isinstance(reference_model, HookedTransformer):
-            reference_output = reference_model(test_text, return_type="logits")
+            reference_output = reference_model(test_input, return_type="logits")
+        elif _is_audio and isinstance(test_input, torch.Tensor):
+            # Audio HF reference model: pass waveform directly
+            assert reference_model is not None
+            with torch.no_grad():
+                hf_output = reference_model(input_values=test_input)
+                if hasattr(hf_output, "logits") and hf_output.logits is not None:
+                    reference_output = hf_output.logits
+                else:
+                    reference_output = hf_output.last_hidden_state
         else:
             # HuggingFace model (reference_model is guaranteed non-None here
             # because we returned early at line 80 when both are None)
             assert reference_model is not None
-            tokens = bridge.to_tokens(test_text)
+            assert isinstance(test_input, str), "Text model requires string input"
+            tokens = bridge.to_tokens(test_input)
             with torch.no_grad():
                 if is_enc_dec:
                     # Encoder-decoder models need decoder_input_ids
diff --git a/transformer_lens/benchmarks/main_benchmark.py b/transformer_lens/benchmarks/main_benchmark.py
index 1ace6a139..fe1b52e2e 100644
--- a/transformer_lens/benchmarks/main_benchmark.py
+++ b/transformer_lens/benchmarks/main_benchmark.py
@@ -79,6 +79,7 @@
 from transformer_lens.utilities.architectures import (
     NO_HT_COMPARISON_ARCHITECTURES,
     get_architectures_for_config,
+    is_audio_model,
     is_encoder_decoder_model,
     is_masked_lm_model,
 )
@@ -98,10 +99,7 @@ def should_skip_ht_comparison(model_name: str, trust_remote_code: bool = False)
 
 
 def get_auto_model_class(model_name: str, trust_remote_code: bool = False):
-    """Determine the correct AutoModel class for a given model.
-
-    Delegates to the bridge's architecture detection for consistency.
-    """
+    """Delegates to the bridge's architecture detection for consistency."""
     from transformer_lens.model_bridge.sources.transformers import (
         determine_architecture_from_hf_config,
         get_hf_model_class_for_architecture,
@@ -1014,6 +1012,13 @@ def cleanup_model(model, model_name_str: str):
             print(f"\nStack trace:\n{error_trace}")
         return results
 
+    # Detect audio model once for use across all phases
+    _is_audio = bridge_unprocessed is not None and getattr(
+        bridge_unprocessed.cfg, "is_audio_model", False
+    )
+    # Shared waveform for audio model benchmarks (consistent across HF capture and bridge forward)
+    _test_audio = torch.randn(1, 16000, device=device, dtype=dtype) if _is_audio else None
+
     # Run Phase 1 benchmarks
     if should_run_phase(1) and bridge_unprocessed:
         if verbose:
@@ -1040,38 +1045,52 @@ def cleanup_model(model, model_name_str: str):
                 if verbose:
                     print(f"✗ Component benchmark failed: {e}\n")
 
-            # Capture HF reference logits using bridge.to_tokens() for
-            # consistent tokenization (BOS prepending, etc.).  Both models
-            # are still in memory so this is still within the 2.0x window.
+            # Capture HF reference outputs. Both models are still in memory (2.0x window).
             if verbose:
                 print("Capturing HF reference outputs to CPU...")
             try:
-                hf_tokens = bridge_unprocessed.to_tokens(test_text)
-                is_enc_dec = is_encoder_decoder_model(
-                    model_name, trust_remote_code=trust_remote_code
-                )
-                with torch.no_grad():
-                    if is_enc_dec:
-                        decoder_start_id = getattr(
-                            getattr(hf_model, "config", None),
-                            "decoder_start_token_id",
-                            0,
+                if _is_audio:
+                    # Audio models: use the shared waveform for HF vs bridge comparison
+                    with torch.no_grad():
+                        hf_out = hf_model(input_values=_test_audio)
+                        # Audio encoders output last_hidden_state, not logits
+                        if hasattr(hf_out, "logits") and hf_out.logits is not None:
+                            hf_saved_logits = hf_out.logits.detach().cpu().clone()
+                        else:
+                            hf_saved_logits = hf_out.last_hidden_state.detach().cpu().clone()
+                        # No loss computation for audio — CTC requires aligned labels
+                    if verbose:
+                        print(
+                            f"✓ Captured HF audio output {hf_saved_logits.shape}, "
+                            f"loss=N/A (CTC requires labels)\n"
                         )
-                        dec_ids = torch.tensor([[decoder_start_id]]).to(hf_tokens.device)
-                        hf_out = hf_model(hf_tokens, decoder_input_ids=dec_ids)
-                    else:
-                        hf_out = hf_model(hf_tokens)
-                    hf_saved_logits = hf_out.logits.detach().cpu().clone()
-
-                    # Compute causal LM loss (shift logits and labels)
-                    if not is_enc_dec and hf_saved_logits.shape[1] > 1:
-                        shift_logits = hf_out.logits[..., :-1, :].contiguous()
-                        shift_labels = hf_tokens[..., 1:].contiguous()
-                        loss_fn = torch.nn.CrossEntropyLoss()
-                        hf_saved_loss = loss_fn(
-                            shift_logits.view(-1, shift_logits.size(-1)),
-                            shift_labels.view(-1),
-                        ).item()
+                else:
+                    hf_tokens = bridge_unprocessed.to_tokens(test_text)
+                    is_enc_dec = is_encoder_decoder_model(
+                        model_name, trust_remote_code=trust_remote_code
+                    )
+                    with torch.no_grad():
+                        if is_enc_dec:
+                            decoder_start_id = getattr(
+                                getattr(hf_model, "config", None),
+                                "decoder_start_token_id",
+                                0,
+                            )
+                            dec_ids = torch.tensor([[decoder_start_id]]).to(hf_tokens.device)
+                            hf_out = hf_model(hf_tokens, decoder_input_ids=dec_ids)
+                        else:
+                            hf_out = hf_model(hf_tokens)
+                        hf_saved_logits = hf_out.logits.detach().cpu().clone()
+
+                        # Compute causal LM loss (shift logits and labels)
+                        if not is_enc_dec and hf_saved_logits.shape[1] > 1:
+                            shift_logits = hf_out.logits[..., :-1, :].contiguous()
+                            shift_labels = hf_tokens[..., 1:].contiguous()
+                            loss_fn = torch.nn.CrossEntropyLoss()
+                            hf_saved_loss = loss_fn(
+                                shift_logits.view(-1, shift_logits.size(-1)),
+                                shift_labels.view(-1),
+                            ).item()
 
                 if verbose:
                     loss_str = f"{hf_saved_loss:.4f}" if hf_saved_loss is not None else "N/A"
@@ -1097,13 +1116,18 @@ def cleanup_model(model, model_name_str: str):
         # matmul non-determinism can exceed the float32 default of 1e-3
         p1_atol = 1e-3 if dtype == torch.float32 else 5e-3
 
+        # For audio models, reuse the waveform from HF reference capture
+        _p1_input: Union[str, torch.Tensor] = test_text
+        if _is_audio and _test_audio is not None:
+            _p1_input = _test_audio
+
         if hf_saved_logits is not None:
             # Full mode: use pre-captured HF logits (bridge only, 1.0x)
             try:
                 add_result(
                     benchmark_forward_pass(
                         bridge_unprocessed,
-                        test_text,
+                        _p1_input,
                         reference_logits=hf_saved_logits.to(device),
                         atol=p1_atol,
                     )
@@ -1113,17 +1137,18 @@ def cleanup_model(model, model_name_str: str):
                     print(f"✗ Forward pass benchmark failed: {e}\n")
         else:
             try:
-                add_result(benchmark_forward_pass(bridge_unprocessed, test_text, atol=p1_atol))
+                add_result(benchmark_forward_pass(bridge_unprocessed, _p1_input, atol=p1_atol))
             except Exception as e:
                 if verbose:
                     print(f"✗ Forward pass benchmark failed: {e}\n")
 
         # Capture Phase 1 reference for Phase 3 equivalence comparison.
+        # Skip for audio models (Phase 3 won't run — no HookedTransformer support).
         # When dtype==float32 (default) and the model natively uses reduced
         # precision, upcast for maximum accuracy.  When the user explicitly
         # requested a non-float32 dtype, run the reference pass in that dtype
         # so the entire pipeline honours the requested precision.
-        if bridge_unprocessed is not None:
+        if bridge_unprocessed is not None and not _is_audio:
             try:
                 original_dtype = bridge_unprocessed.cfg.dtype
                 needs_upcast = dtype == torch.float32 and original_dtype not in (
@@ -1192,11 +1217,13 @@ def cleanup_model(model, model_name_str: str):
             print("Running Phase 2 benchmarks...\n")
 
         # Generation benchmarks (unprocessed only) - RUN FIRST
-        # Skip for encoder-decoder models (T5, etc.) which require different generation API
-        is_enc_dec = is_encoder_decoder_model(model_name)
+        # Skip for encoder-decoder (different generation API) and audio (no text generation) models
+        _skip_generation = is_encoder_decoder_model(model_name) or getattr(
+            bridge_unprocessed.cfg, "is_audio_model", False
+        )
         if verbose:
             print("1. Generation Benchmarks (unprocessed)")
-        if is_enc_dec:
+        if _skip_generation:
             if verbose:
                 print("⏭️ Skipped (encoder-decoder model - requires decoder_input_ids)\n")
             add_result(
@@ -1342,6 +1369,7 @@ def cleanup_model(model, model_name_str: str):
         should_run_phase(4)
         and bridge_unprocessed is not None
         and not is_masked_lm_model(model_name, trust_remote_code=trust_remote_code)
+        and not is_audio_model(model_name, trust_remote_code=trust_remote_code)
     ):
         if verbose:
             print(f"\n{'='*80}")
@@ -1419,6 +1447,57 @@ def cleanup_model(model, model_name_str: str):
                 )
             )
 
+    # ========================================================================
+    # Phase 8: Audio Tests (only for audio encoder models)
+    # Runs before Phase 3 so we can reuse bridge_unprocessed before cleanup.
+    # ========================================================================
+    if (
+        bridge_unprocessed is not None
+        and getattr(bridge_unprocessed.cfg, "is_audio_model", False)
+        and should_run_phase(8)
+    ):
+        current_phase[0] = 8
+        if verbose:
+            print("\n" + "=" * 80)
+            print("PHASE 8: AUDIO TESTS")
+            print("=" * 80)
+            print("Testing audio forward pass, caching, representation stability, and features.")
+            print("=" * 80 + "\n")
+
+        try:
+            from transformer_lens.benchmarks.audio import run_audio_benchmarks
+
+            test_audio = torch.randn(1, 16000, device=device, dtype=dtype)
+            audio_results = run_audio_benchmarks(
+                bridge_unprocessed,
+                test_audio=test_audio,
+                verbose=verbose,
+            )
+            for result in audio_results:
+                result.phase = 8
+                results.append(result)
+                if verbose:
+                    print(result)
+
+            if verbose:
+                print("\n" + "=" * 80)
+                print("PHASE 8 COMPLETE")
+                print("=" * 80)
+
+        except Exception as e:
+            if verbose:
+                print(f"\n⚠ Audio tests failed: {e}\n")
+            results.append(
+                BenchmarkResult(
+                    name="audio_suite",
+                    passed=False,
+                    severity=BenchmarkSeverity.ERROR,
+                    message=f"Failed to run audio tests: {str(e)}",
+                    details={"error": str(e)},
+                    phase=8,
+                )
+            )
+
     # ========================================================================
     # PHASE 3: Bridge (processed) + HookedTransformer (processed)
     # ========================================================================
diff --git a/transformer_lens/config/TransformerBridgeConfig.py b/transformer_lens/config/TransformerBridgeConfig.py
index ca55067b5..fb5b887f6 100644
--- a/transformer_lens/config/TransformerBridgeConfig.py
+++ b/transformer_lens/config/TransformerBridgeConfig.py
@@ -86,6 +86,8 @@ def __init__(
         eps_attr: str = "eps",
         rmsnorm_uses_offset: bool = False,
         attn_implementation: Optional[str] = None,
+        # Audio model configuration
+        is_audio_model: bool = False,
         # Multimodal configuration
         is_multimodal: bool = False,
         vision_hidden_size: Optional[int] = None,
@@ -174,6 +176,8 @@ def __init__(
         self.eps_attr = eps_attr
         self.rmsnorm_uses_offset = rmsnorm_uses_offset
         self.attn_implementation = attn_implementation
+        # Audio model configuration
+        self.is_audio_model = is_audio_model
         # Multimodal configuration
         self.is_multimodal = is_multimodal
         self.vision_hidden_size = vision_hidden_size
diff --git a/transformer_lens/factories/architecture_adapter_factory.py b/transformer_lens/factories/architecture_adapter_factory.py
index 0e21ab84d..37fd62bbd 100644
--- a/transformer_lens/factories/architecture_adapter_factory.py
+++ b/transformer_lens/factories/architecture_adapter_factory.py
@@ -20,6 +20,7 @@
     GraniteArchitectureAdapter,
     GraniteMoeArchitectureAdapter,
     GraniteMoeHybridArchitectureAdapter,
+    HubertArchitectureAdapter,
     LlamaArchitectureAdapter,
     LlavaArchitectureAdapter,
     LlavaNextArchitectureAdapter,
@@ -63,6 +64,8 @@
     "GptOssForCausalLM": GPTOSSArchitectureAdapter,
     "GPT2LMHeadCustomModel": Gpt2LmHeadCustomArchitectureAdapter,
     "GPTJForCausalLM": GptjArchitectureAdapter,
+    "HubertForCTC": HubertArchitectureAdapter,
+    "HubertModel": HubertArchitectureAdapter,
     "LlamaForCausalLM": LlamaArchitectureAdapter,
     "LlavaForConditionalGeneration": LlavaArchitectureAdapter,
     "LlavaNextForConditionalGeneration": LlavaNextArchitectureAdapter,
diff --git a/transformer_lens/model_bridge/bridge.py b/transformer_lens/model_bridge/bridge.py
index 45e4ddce6..469cba39f 100644
--- a/transformer_lens/model_bridge/bridge.py
+++ b/transformer_lens/model_bridge/bridge.py
@@ -105,7 +105,7 @@ def __init__(self, model: nn.Module, adapter: ArchitectureAdapter, tokenizer: An
         self.adapter = adapter
         self.cfg = adapter.cfg
         self.tokenizer = tokenizer
-        if self.cfg.d_vocab == -1:
+        if self.cfg.d_vocab == -1 and self.tokenizer is not None:
             if hasattr(self.tokenizer, "get_vocab"):
                 vocab = self.tokenizer.get_vocab()
                 self.cfg.d_vocab = max(vocab.values()) + 1
@@ -1214,6 +1214,7 @@ def forward(
         start_at_layer: Optional[int] = None,
         stop_at_layer: Optional[int] = None,
         pixel_values: Optional[torch.Tensor] = None,
+        input_values: Optional[torch.Tensor] = None,
         **kwargs,
     ) -> Any:
         """Forward pass through the model.
@@ -1230,6 +1231,9 @@ def forward(
             pixel_values: Optional image tensor for multimodal models (e.g., LLaVA, Gemma3).
                 The tensor is passed directly to the underlying HuggingFace model.
                 Only valid when cfg.is_multimodal is True.
+            input_values: Optional audio waveform tensor for audio models (e.g., HuBERT).
+                The tensor is passed directly to the underlying HuggingFace model.
+                Only valid when cfg.is_audio_model is True.
             **kwargs: Additional arguments passed to model
 
         Returns:
@@ -1252,6 +1256,11 @@ def forward(
 
         try:
             if isinstance(input, (str, list)):
+                if getattr(self.cfg, "is_audio_model", False):
+                    raise ValueError(
+                        "Audio models require tensor input (raw waveform), not text. "
+                        "Pass a torch.Tensor or use the input_values parameter."
+                    )
                 input_ids = self.to_tokens(
                     input, prepend_bos=prepend_bos, padding_side=padding_side
                 )
@@ -1323,8 +1332,32 @@ def forward(
                     )
                 kwargs["pixel_values"] = pixel_values
 
+            # Handle input_values for audio models
+            if input_values is not None:
+                if not getattr(self.cfg, "is_audio_model", False):
+                    raise ValueError(
+                        "input_values can only be passed to audio models "
+                        "(cfg.is_audio_model must be True)"
+                    )
+                kwargs["input_values"] = input_values
+
+            # Audio models take input_values (raw waveform), not input_ids
             original_tl_cache = past_kv_cache
-            output = self.original_model(input_ids, **kwargs)
+            if getattr(self.cfg, "is_audio_model", False):
+                # For audio models, input is the raw waveform tensor or
+                # input_values was passed as a keyword argument
+                if input_values is not None:
+                    output = self.original_model(**kwargs)
+                elif isinstance(input, torch.Tensor):
+                    kwargs["input_values"] = input
+                    output = self.original_model(**kwargs)
+                else:
+                    raise ValueError(
+                        "Audio models require tensor input (raw waveform). "
+                        "Pass a torch.Tensor or use input_values parameter."
+                    )
+            else:
+                output = self.original_model(input_ids, **kwargs)
             if (
                 original_tl_cache is not None
                 and hasattr(output, "past_key_values")
@@ -1361,6 +1394,11 @@ def forward(
             if return_type == "logits":
                 return logits
             elif return_type == "loss":
+                if getattr(self.cfg, "is_audio_model", False):
+                    raise ValueError(
+                        "Audio models do not support return_type='loss'. "
+                        "CTC loss requires aligned frame-level labels."
+                    )
                 # Always use self.loss_fn for consistency with HT's formula
                 # (log_softmax + gather).  HF's output.loss uses F.cross_entropy
                 # which gives different results in bfloat16.
@@ -1369,6 +1407,11 @@ def forward(
                 ), f"Expected logits tensor, got {type(logits)}"
                 return self.loss_fn(logits, input_ids, per_token=loss_per_token)
             elif return_type == "both":
+                if getattr(self.cfg, "is_audio_model", False):
+                    raise ValueError(
+                        "Audio models do not support return_type='both'. "
+                        "CTC loss requires aligned frame-level labels."
+                    )
                 assert isinstance(
                     logits, torch.Tensor
                 ), f"Expected logits tensor, got {type(logits)}"
diff --git a/transformer_lens/model_bridge/generalized_components/__init__.py b/transformer_lens/model_bridge/generalized_components/__init__.py
index 126746a71..ca38829c0 100644
--- a/transformer_lens/model_bridge/generalized_components/__init__.py
+++ b/transformer_lens/model_bridge/generalized_components/__init__.py
@@ -1,54 +1,77 @@
 """Bridge components for transformer architectures."""
-from transformer_lens.model_bridge.generalized_components.attention import AttentionBridge
+from transformer_lens.model_bridge.generalized_components.attention import (
+    AttentionBridge,
+)
+from transformer_lens.model_bridge.generalized_components.audio_feature_extractor import (
+    AudioFeatureExtractorBridge,
+)
 from transformer_lens.model_bridge.generalized_components.block import BlockBridge
-from transformer_lens.model_bridge.generalized_components.embedding import EmbeddingBridge
-from transformer_lens.model_bridge.generalized_components.rotary_embedding import (
-    RotaryEmbeddingBridge,
+from transformer_lens.model_bridge.generalized_components.bloom_attention import (
+    BloomAttentionBridge,
 )
-from transformer_lens.model_bridge.generalized_components.pos_embed import PosEmbedBridge
-from transformer_lens.model_bridge.generalized_components.normalization import NormalizationBridge
-from transformer_lens.model_bridge.generalized_components.rms_normalization import (
-    RMSNormalizationBridge,
+from transformer_lens.model_bridge.generalized_components.bloom_block import (
+    BloomBlockBridge,
+)
+from transformer_lens.model_bridge.generalized_components.bloom_mlp import (
+    BloomMLPBridge,
+)
+from transformer_lens.model_bridge.generalized_components.clip_vision_encoder import (
+    CLIPVisionEncoderBridge,
+    CLIPVisionEncoderLayerBridge,
 )
-from transformer_lens.model_bridge.generalized_components.linear import LinearBridge
 from transformer_lens.model_bridge.generalized_components.conv1d import Conv1DBridge
+from transformer_lens.model_bridge.generalized_components.conv_pos_embed import (
+    ConvPosEmbedBridge,
+)
+from transformer_lens.model_bridge.generalized_components.embedding import (
+    EmbeddingBridge,
+)
+from transformer_lens.model_bridge.generalized_components.gated_mlp import (
+    GatedMLPBridge,
+)
+from transformer_lens.model_bridge.generalized_components.joint_gate_up_mlp import (
+    JointGateUpMLPBridge,
+)
 from transformer_lens.model_bridge.generalized_components.joint_qkv_attention import (
     JointQKVAttentionBridge,
 )
 from transformer_lens.model_bridge.generalized_components.joint_qkv_position_embeddings_attention import (
     JointQKVPositionEmbeddingsAttentionBridge,
 )
-from transformer_lens.model_bridge.generalized_components.position_embeddings_attention import (
-    PositionEmbeddingsAttentionBridge,
-)
+from transformer_lens.model_bridge.generalized_components.linear import LinearBridge
 from transformer_lens.model_bridge.generalized_components.mlp import MLPBridge
-from transformer_lens.model_bridge.generalized_components.gated_mlp import GatedMLPBridge
 from transformer_lens.model_bridge.generalized_components.moe import MoEBridge
-from transformer_lens.model_bridge.generalized_components.joint_gate_up_mlp import (
-    JointGateUpMLPBridge,
+from transformer_lens.model_bridge.generalized_components.normalization import (
+    NormalizationBridge,
 )
-from transformer_lens.model_bridge.generalized_components.symbolic import SymbolicBridge
-from transformer_lens.model_bridge.generalized_components.unembedding import UnembeddingBridge
-from transformer_lens.model_bridge.generalized_components.t5_block import T5BlockBridge
-from transformer_lens.model_bridge.generalized_components.bloom_block import BloomBlockBridge
-from transformer_lens.model_bridge.generalized_components.bloom_attention import (
-    BloomAttentionBridge,
+from transformer_lens.model_bridge.generalized_components.pos_embed import (
+    PosEmbedBridge,
 )
-from transformer_lens.model_bridge.generalized_components.bloom_mlp import BloomMLPBridge
-from transformer_lens.model_bridge.generalized_components.clip_vision_encoder import (
-    CLIPVisionEncoderBridge,
-    CLIPVisionEncoderLayerBridge,
+from transformer_lens.model_bridge.generalized_components.position_embeddings_attention import (
+    PositionEmbeddingsAttentionBridge,
+)
+from transformer_lens.model_bridge.generalized_components.rms_normalization import (
+    RMSNormalizationBridge,
+)
+from transformer_lens.model_bridge.generalized_components.rotary_embedding import (
+    RotaryEmbeddingBridge,
 )
 from transformer_lens.model_bridge.generalized_components.siglip_vision_encoder import (
     SiglipVisionEncoderBridge,
     SiglipVisionEncoderLayerBridge,
 )
+from transformer_lens.model_bridge.generalized_components.symbolic import SymbolicBridge
+from transformer_lens.model_bridge.generalized_components.t5_block import T5BlockBridge
+from transformer_lens.model_bridge.generalized_components.unembedding import (
+    UnembeddingBridge,
+)
 from transformer_lens.model_bridge.generalized_components.vision_projection import (
     VisionProjectionBridge,
 )
 
 __all__ = [
     "AttentionBridge",
+    "AudioFeatureExtractorBridge",
     "BlockBridge",
     "BloomBlockBridge",
     "BloomAttentionBridge",
@@ -56,6 +79,7 @@
     "CLIPVisionEncoderBridge",
     "CLIPVisionEncoderLayerBridge",
     "Conv1DBridge",
+    "ConvPosEmbedBridge",
     "EmbeddingBridge",
     "RotaryEmbeddingBridge",
     "PosEmbedBridge",
diff --git a/transformer_lens/model_bridge/generalized_components/audio_feature_extractor.py b/transformer_lens/model_bridge/generalized_components/audio_feature_extractor.py
new file mode 100644
index 000000000..d2ac84a2d
--- /dev/null
+++ b/transformer_lens/model_bridge/generalized_components/audio_feature_extractor.py
@@ -0,0 +1,50 @@
+"""Bridge component for audio CNN feature extractors (HuBERT, wav2vec2)."""
+
+from typing import Any, Dict, Optional
+
+import torch
+
+from transformer_lens.model_bridge.generalized_components.base import (
+    GeneralizedComponent,
+)
+
+
+class AudioFeatureExtractorBridge(GeneralizedComponent):
+    """Hookable wrapper around an audio model's convolutional waveform encoder.
+
+    ``hook_in`` is applied to the forward input and ``hook_out`` to the
+    wrapped module's output (first element only when it returns a tuple).
+    ``hook_audio_features`` is declared as an alias of ``hook_out``.
+    """
+
+    hook_aliases = {"hook_audio_features": "hook_out"}
+
+    def __init__(
+        self,
+        name: str,
+        config: Optional[Any] = None,
+        submodules: Optional[Dict[str, GeneralizedComponent]] = None,
+    ):
+        """Delegate to the base class; a missing ``submodules`` becomes ``{}``."""
+        children = submodules or {}
+        super().__init__(name, config, submodules=children)
+
+    def forward(
+        self,
+        input_values: torch.Tensor,
+        **kwargs: Any,
+    ) -> torch.Tensor:
+        """Run the wrapped module on ``input_values`` with both hooks applied.
+
+        Documented shapes: [batch, num_samples] -> [batch, conv_dim, num_frames].
+        Raises RuntimeError if no original component has been set.
+        """
+        if self.original_component is None:
+            missing = f"Original component not set for {self.name}. "
+            raise RuntimeError(missing + "Call set_original_component() first.")
+
+        hooked_input = self.hook_in(input_values)
+        raw = self.original_component(hooked_input, **kwargs)
+        if not isinstance(raw, tuple):
+            return self.hook_out(raw)
+        return (self.hook_out(raw[0]), *raw[1:])
diff --git a/transformer_lens/model_bridge/generalized_components/conv_pos_embed.py b/transformer_lens/model_bridge/generalized_components/conv_pos_embed.py
new file mode 100644
index 000000000..5463d9d00
--- /dev/null
+++ b/transformer_lens/model_bridge/generalized_components/conv_pos_embed.py
@@ -0,0 +1,51 @@
+"""Bridge component for convolutional positional embeddings (HuBERT, wav2vec2)."""
+
+from typing import Any, Dict, Optional
+
+import torch
+
+from transformer_lens.model_bridge.generalized_components.base import (
+    GeneralizedComponent,
+)
+
+
+class ConvPosEmbedBridge(GeneralizedComponent):
+    """Hookable wrapper for a convolution-based positional embedding module.
+
+    The wrapped module consumes hidden states directly, in contrast to
+    PosEmbedBridge (lookup table) and RotaryEmbeddingBridge (rotation
+    matrices). ``hook_pos_embed`` is declared as an alias of ``hook_out``.
+    """
+
+    hook_aliases = {"hook_pos_embed": "hook_out"}
+
+    def __init__(
+        self,
+        name: str,
+        config: Optional[Any] = None,
+        submodules: Optional[Dict[str, GeneralizedComponent]] = None,
+    ):
+        """Delegate to the base class; a missing ``submodules`` becomes ``{}``."""
+        children = submodules or {}
+        super().__init__(name, config, submodules=children)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        **kwargs: Any,
+    ) -> torch.Tensor:
+        """Apply ``hook_in``, the wrapped module, then ``hook_out``.
+
+        Documented shapes: [batch, seq_len, hidden_size] in and out.
+        For a tuple result only element 0 passes through ``hook_out``.
+        Raises RuntimeError if no original component has been set.
+        """
+        if self.original_component is None:
+            missing = f"Original component not set for {self.name}. "
+            raise RuntimeError(missing + "Call set_original_component() first.")
+
+        hooked_states = self.hook_in(hidden_states)
+        result = self.original_component(hooked_states, **kwargs)
+        if not isinstance(result, tuple):
+            return self.hook_out(result)
+        return (self.hook_out(result[0]), *result[1:])
diff --git a/transformer_lens/model_bridge/sources/transformers.py b/transformer_lens/model_bridge/sources/transformers.py
index 850969012..b18ec285e 100644
--- a/transformer_lens/model_bridge/sources/transformers.py
+++ b/transformer_lens/model_bridge/sources/transformers.py
@@ -123,7 +123,7 @@ def map_default_transformer_lens_config(hf_config):
         tl_config.n_layers = source_config.num_transformer_layers
     elif hasattr(source_config, "num_layers"):
         tl_config.n_layers = source_config.num_layers
-    if hasattr(source_config, "vocab_size"):
+    if hasattr(source_config, "vocab_size") and isinstance(source_config.vocab_size, int):
         tl_config.d_vocab = source_config.vocab_size
     if hasattr(source_config, "n_positions"):
         tl_config.n_ctx = source_config.n_positions
@@ -151,6 +151,15 @@ def map_default_transformer_lens_config(hf_config):
         tl_config.d_head = tl_config.d_model // tl_config.n_heads
     if hasattr(source_config, "activation_function"):
         tl_config.act_fn = source_config.activation_function
+    elif hasattr(source_config, "hidden_act"):
+        tl_config.act_fn = source_config.hidden_act
+    # Layer norm / RMS norm epsilon — HF uses 3 different field names
+    if hasattr(source_config, "rms_norm_eps"):
+        tl_config.eps = source_config.rms_norm_eps
+    elif hasattr(source_config, "layer_norm_eps"):
+        tl_config.eps = source_config.layer_norm_eps
+    elif hasattr(source_config, "layer_norm_epsilon"):
+        tl_config.eps = source_config.layer_norm_epsilon
     if hasattr(source_config, "num_local_experts"):
         tl_config.num_experts = source_config.num_local_experts
     if hasattr(source_config, "num_experts_per_tok"):
@@ -191,6 +200,7 @@ def determine_architecture_from_hf_config(hf_config):
         model_type_mappings = {
             "apertus": "ApertusForCausalLM",
             "gpt2": "GPT2LMHeadModel",
+            "hubert": "HubertModel",
             "llama": "LlamaForCausalLM",
             "mistral": "MistralForCausalLM",
             "mixtral": "MixtralForCausalLM",
@@ -229,6 +239,7 @@ def get_hf_model_class_for_architecture(architecture: str):
     Uses centralized architecture sets from utilities.architectures.
     """
     from transformer_lens.utilities.architectures import (
+        AUDIO_ARCHITECTURES,
         MASKED_LM_ARCHITECTURES,
         MULTIMODAL_ARCHITECTURES,
         SEQ2SEQ_ARCHITECTURES,
@@ -242,6 +253,14 @@ def get_hf_model_class_for_architecture(architecture: str):
         from transformers import AutoModelForImageTextToText
 
         return AutoModelForImageTextToText
+    elif architecture in AUDIO_ARCHITECTURES:
+        if "ForCTC" in architecture:
+            from transformers import AutoModelForCTC
+
+            return AutoModelForCTC
+        from transformers import AutoModel
+
+        return AutoModel
     else:
         return AutoModelForCausalLM
 
@@ -377,7 +396,11 @@ def boot(
     tokenizer = tokenizer
     default_padding_side = getattr(adapter.cfg, "default_padding_side", None)
     use_fast = getattr(adapter.cfg, "use_fast", True)
-    if tokenizer is not None:
+    # Audio models use feature extractors, not text tokenizers
+    _is_audio = getattr(adapter.cfg, "is_audio_model", False)
+    if _is_audio and tokenizer is None:
+        tokenizer = None  # Skip tokenizer loading for audio models
+    elif tokenizer is not None:
         tokenizer = setup_tokenizer(tokenizer, default_padding_side=default_padding_side)
     else:
         token_arg = get_hf_token()
@@ -484,6 +507,21 @@ def boot(
                 except Exception:
                     pass  # Processor not available; user can set bridge.processor manually
 
+    # Load feature extractor for audio models (needed for audio preprocessing)
+    if getattr(adapter.cfg, "is_audio_model", False):
+        try:
+            from transformers import AutoFeatureExtractor
+
+            huggingface_token = os.environ.get("HF_TOKEN", "")
+            token_arg = huggingface_token if len(huggingface_token) > 0 else None
+            bridge.processor = AutoFeatureExtractor.from_pretrained(
+                model_name,
+                token=token_arg,
+                trust_remote_code=trust_remote_code,
+            )
+        except Exception:
+            pass  # Feature extractor not available; user can set bridge.processor manually
+
     return bridge
 
 
diff --git a/transformer_lens/model_bridge/supported_architectures/__init__.py b/transformer_lens/model_bridge/supported_architectures/__init__.py
index a9dff24b5..ac1a334e2 100644
--- a/transformer_lens/model_bridge/supported_architectures/__init__.py
+++ b/transformer_lens/model_bridge/supported_architectures/__init__.py
@@ -45,6 +45,9 @@
 from transformer_lens.model_bridge.supported_architectures.granite_moe_hybrid import (
     GraniteMoeHybridArchitectureAdapter,
 )
+from transformer_lens.model_bridge.supported_architectures.hubert import (
+    HubertArchitectureAdapter,
+)
 from transformer_lens.model_bridge.supported_architectures.llama import (
     LlamaArchitectureAdapter,
 )
@@ -136,6 +139,7 @@
     "GPTOSSArchitectureAdapter",
     "Gpt2LmHeadCustomArchitectureAdapter",
     "GptjArchitectureAdapter",
+    "HubertArchitectureAdapter",
     "LlamaArchitectureAdapter",
     "LlavaArchitectureAdapter",
     "LlavaNextArchitectureAdapter",
diff --git a/transformer_lens/model_bridge/supported_architectures/hubert.py b/transformer_lens/model_bridge/supported_architectures/hubert.py
new file mode 100644
index 000000000..2f73b311a
--- /dev/null
+++ b/transformer_lens/model_bridge/supported_architectures/hubert.py
@@ -0,0 +1,179 @@
+"""HuBERT architecture adapter.
+
+Supports HubertModel (bare encoder) and HubertForCTC (with CTC head).
+Encoder blocks are structurally identical to BERT (post-LN by default,
+pre-LN when do_stable_layer_norm=True).
+"""
+
+from typing import Any
+
+from transformer_lens.conversion_utils.conversion_steps import RearrangeTensorConversion
+from transformer_lens.conversion_utils.param_processing_conversion import (
+    ParamProcessingConversion,
+)
+from transformer_lens.model_bridge.architecture_adapter import ArchitectureAdapter
+from transformer_lens.model_bridge.generalized_components import (
+    AttentionBridge,
+    BlockBridge,
+    LinearBridge,
+    MLPBridge,
+    NormalizationBridge,
+    UnembeddingBridge,
+)
+from transformer_lens.model_bridge.generalized_components.audio_feature_extractor import (
+    AudioFeatureExtractorBridge,
+)
+from transformer_lens.model_bridge.generalized_components.base import (
+    GeneralizedComponent,
+)
+from transformer_lens.model_bridge.generalized_components.conv_pos_embed import (
+    ConvPosEmbedBridge,
+)
+
+
+class HubertArchitectureAdapter(ArchitectureAdapter):
+    """Adapter that maps HuBERT audio encoders onto bridge components.
+
+    Component paths default to the bare HubertModel layout. HubertForCTC
+    nests that model under a 'hubert.' prefix; prepare_model() detects this,
+    rebuilds the mapping with the prefix and adds an "unembed" bridge.
+    """
+
+    def __init__(self, cfg: Any) -> None:
+        """Set HuBERT config flags, weight conversions and the default mapping."""
+        super().__init__(cfg)
+
+        self.cfg.is_audio_model = True
+        self.cfg.normalization_type = "LN"
+        self.cfg.positional_embedding_type = "conv"
+        self.cfg.final_rms = False
+        self.cfg.gated_mlp = False
+        self.cfg.attn_only = False
+
+        # Pre-LN (True) vs post-LN (False). prepare_loading() overwrites this
+        # from the HF config; until then use whatever cfg already carries.
+        stable_ln = getattr(self.cfg, "do_stable_layer_norm", False)
+        self._do_stable_layer_norm = stable_ln
+        self.supports_fold_ln = stable_ln
+
+        head_count = self.cfg.n_heads
+        qkv_weight = "(h d_head) d_model -> h d_model d_head"
+        qkv_bias = "(h d_head) -> h d_head"
+        out_weight = "d_model (h d_head) -> h d_head d_model"
+
+        def per_head(pattern: str) -> ParamProcessingConversion:
+            # Each key gets its own conversion object, split over n_heads.
+            return ParamProcessingConversion(
+                tensor_conversion=RearrangeTensorConversion(pattern, h=head_count),
+            )
+
+        # Q/K/V/O rearrangement — same pattern as BERT
+        self.weight_processing_conversions = {
+            "blocks.{i}.attn.q.weight": per_head(qkv_weight),
+            "blocks.{i}.attn.k.weight": per_head(qkv_weight),
+            "blocks.{i}.attn.v.weight": per_head(qkv_weight),
+            "blocks.{i}.attn.q.bias": per_head(qkv_bias),
+            "blocks.{i}.attn.k.bias": per_head(qkv_bias),
+            "blocks.{i}.attn.v.bias": per_head(qkv_bias),
+            "blocks.{i}.attn.o.weight": per_head(out_weight),
+        }
+
+        # Start from the bare-HubertModel layout; prepare_model() switches to
+        # the "hubert."-prefixed layout for HubertForCTC.
+        self.component_mapping = self._build_component_mapping(prefix="")
+
+    def _build_component_mapping(self, prefix: str) -> dict:
+        """Build the component mapping rooted at ``prefix``.
+
+        Args:
+            prefix: "" for HubertModel, "hubert." for HubertForCTC. It is
+                prepended to the top-level HF module names only; block
+                submodule names are left unprefixed.
+
+        Returns:
+            Dict from bridge component name to bridge component.
+        """
+
+        def layer_norm(hf_name: str) -> NormalizationBridge:
+            # The LayerNorm bridges all share the same construction options.
+            return NormalizationBridge(
+                name=hf_name,
+                config=self.cfg,
+                use_native_layernorm_autograd=True,
+            )
+
+        mapping: dict[str, Any] = {}
+        mapping["audio_feature_extractor"] = AudioFeatureExtractorBridge(
+            name=prefix + "feature_extractor",
+        )
+        mapping["feat_proj"] = GeneralizedComponent(
+            name=prefix + "feature_projection",
+        )
+        mapping["conv_pos_embed"] = ConvPosEmbedBridge(
+            name=prefix + "encoder.pos_conv_embed",
+        )
+        mapping["embed_ln"] = layer_norm(prefix + "encoder.layer_norm")
+
+        # Per-layer components, in the order ln1, ln2, attn, mlp.
+        block_parts: dict[str, Any] = {
+            "ln1": layer_norm("layer_norm"),
+            "ln2": layer_norm("final_layer_norm"),
+            "attn": AttentionBridge(
+                name="attention",
+                config=self.cfg,
+                submodules={
+                    "q": LinearBridge(name="q_proj"),
+                    "k": LinearBridge(name="k_proj"),
+                    "v": LinearBridge(name="v_proj"),
+                    "o": LinearBridge(name="out_proj"),
+                },
+            ),
+            "mlp": MLPBridge(
+                name="feed_forward",
+                config=self.cfg,
+                submodules={
+                    "in": LinearBridge(name="intermediate_dense"),
+                    "out": LinearBridge(name="output_dense"),
+                },
+            ),
+        }
+        mapping["blocks"] = BlockBridge(
+            name=prefix + "encoder.layers",
+            # Redirect MLP hooks to the actual linear layer hooks (same as BERT)
+            hook_alias_overrides={
+                "hook_mlp_out": "mlp.out.hook_out",
+                "hook_mlp_in": "mlp.in.hook_in",
+            },
+            submodules=block_parts,
+        )
+        return mapping
+
+    def prepare_loading(self, model_name: str, model_kwargs: dict) -> None:
+        """Copy HuBERT-specific HF config attributes onto the bridge config.
+
+        Guards against silent-default bugs where the adapter reads an
+        attribute from the bridge config that was never propagated from the
+        HF config. Does nothing when model_kwargs has no "config" entry.
+        """
+        hf_config = model_kwargs.get("config")
+        if hf_config is not None:
+            # Pre-LN vs post-LN — determines fold_ln safety
+            stable_ln = getattr(hf_config, "do_stable_layer_norm", False)
+            self.cfg.do_stable_layer_norm = stable_ln  # type: ignore[attr-defined]
+            self._do_stable_layer_norm = stable_ln
+            self.supports_fold_ln = stable_ln
+
+            # hidden_act and layer_norm_eps are mapped globally in
+            # map_default_transformer_lens_config()
+
+            # Rebuild the default (unprefixed) mapping after the config update.
+            self.component_mapping = self._build_component_mapping(prefix="")
+
+    def prepare_model(self, hf_model: Any) -> None:
+        """Detect HubertForCTC (has a 'hubert' attribute) and add the CTC head."""
+        if not hasattr(hf_model, "hubert"):
+            return
+
+        # Store the prefixed mapping first, then extend the stored mapping.
+        self.component_mapping = self._build_component_mapping(prefix="hubert.")
+        self.component_mapping["unembed"] = UnembeddingBridge(name="lm_head")
diff --git a/transformer_lens/tools/model_registry/__init__.py b/transformer_lens/tools/model_registry/__init__.py
index 7ee6bfebe..7d84970f3 100644
--- a/transformer_lens/tools/model_registry/__init__.py
+++ b/transformer_lens/tools/model_registry/__init__.py
@@ -58,6 +58,8 @@
     "GPTNeoForCausalLM",
     "OpenELMForCausalLM",
     "GPTNeoXForCausalLM",
+    "HubertForCTC",
+    "HubertModel",
     "LlamaForCausalLM",
     "LlavaForConditionalGeneration",
     "LlavaNextForConditionalGeneration",
diff --git a/transformer_lens/tools/model_registry/data/architecture_gaps.json b/transformer_lens/tools/model_registry/data/architecture_gaps.json
index 90ebe2314..7344d18b3 100644
--- a/transformer_lens/tools/model_registry/data/architecture_gaps.json
+++ b/transformer_lens/tools/model_registry/data/architecture_gaps.json
@@ -1,65 +1,65 @@
 {
-  "generated_at": "2026-03-18",
+  "generated_at": "2026-03-19",
   "scan_info": {
-    "total_scanned": 3426,
+    "total_scanned": 3517,
     "task_filter": "text-generation",
     "min_downloads": 500,
-    "scan_duration_seconds": 2.4
+    "scan_duration_seconds": 2.7
   },
-  "total_unsupported_architectures": 253,
-  "total_unsupported_models": 1013,
+  "total_unsupported_architectures": 258,
+  "total_unsupported_models": 1031,
   "gaps": [
     {
       "architecture_id": "Qwen3MoeForCausalLM",
-      "total_models": 66,
+      "total_models": 68,
       "sample_models": [
         "Qwen/Qwen3-30B-A3B",
-        "Qwen/Qwen3-30B-A3B-Thinking-2507",
         "Qwen/Qwen3-30B-A3B-Instruct-2507",
+        "Qwen/Qwen3-30B-A3B-Thinking-2507",
         "Qwen/Qwen3-Coder-30B-A3B-Instruct",
         "Qwen/Qwen3-235B-A22B",
         "trl-internal-testing/tiny-Qwen3MoeForCausalLM",
         "Qwen/Qwen3-235B-A22B-Instruct-2507",
         "Qwen/Qwen3-Coder-480B-A35B-Instruct",
-        "nvidia/Qwen3-30B-A3B-NVFP4",
-        "NVFP4/Qwen3-30B-A3B-Instruct-2507-FP4"
+        "NVFP4/Qwen3-30B-A3B-Instruct-2507-FP4",
+        "nvidia/Qwen3-30B-A3B-NVFP4"
       ]
     },
     {
       "architecture_id": "DeepseekV3ForCausalLM",
-      "total_models": 51,
+      "total_models": 53,
       "sample_models": [
         "deepseek-ai/DeepSeek-R1",
-        "deepseek-ai/DeepSeek-V3",
         "deepseek-ai/DeepSeek-R1-0528",
+        "deepseek-ai/DeepSeek-V3",
         "deepseek-ai/DeepSeek-V3-0324",
         "nvidia/DeepSeek-R1-0528-NVFP4-v2",
         "deepseek-ai/DeepSeek-V3.1",
         "ai-sage/GigaChat3-10B-A1.8B",
         "trl-internal-testing/tiny-DeepseekV3ForCausalLM",
-        "trl-internal-testing/tiny-DeepseekV3ForCausalLM-0528",
-        "nvidia/DeepSeek-V3-0324-NVFP4"
+        "nvidia/DeepSeek-V3-0324-NVFP4",
+        "moonshotai/Kimi-K2-Instruct"
       ]
     },
     {
       "architecture_id": "Qwen3_5ForConditionalGeneration",
-      "total_models": 42,
+      "total_models": 46,
       "sample_models": [
         "Jackrong/Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled",
         "osoleve/Qwen3.5-27B-Text-NVFP4-MTP",
-        "nightmedia/Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled-qx64-hi-mlx",
         "Tesslate/OmniCoder-9B",
-        "txn545/Qwen3.5-27B-NVFP4",
+        "nightmedia/Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled-qx64-hi-mlx",
         "Jackrong/Qwen3.5-9B-Claude-4.6-Opus-Reasoning-Distilled",
-        "EganAI/qwen3.5-9b-terminal-merge",
+        "txn545/Qwen3.5-27B-NVFP4",
+        "mconcat/Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled-NVFP4",
         "Jackrong/Qwen3.5-4B-Claude-4.6-Opus-Reasoning-Distilled",
-        "Jackrong/Qwen3.5-2B-Claude-4.6-Opus-Reasoning-Distilled",
-        "nightmedia/Qwen3.5-27B-Text"
+        "EganAI/qwen3.5-9b-terminal-merge",
+        "Jackrong/Qwen3.5-2B-Claude-4.6-Opus-Reasoning-Distilled"
       ]
     },
     {
       "architecture_id": "Qwen3NextForCausalLM",
-      "total_models": 37,
+      "total_models": 35,
       "sample_models": [
         "Qwen/Qwen3-Coder-Next",
         "Qwen/Qwen3-Next-80B-A3B-Instruct",
@@ -75,14 +75,14 @@
     },
     {
       "architecture_id": "FalconForCausalLM",
-      "total_models": 31,
+      "total_models": 32,
       "sample_models": [
         "tiiuae/falcon-7b",
         "tiiuae/falcon-7b-instruct",
         "tiiuae/falcon-40b-instruct",
         "tiiuae/falcon-40b",
-        "fxmarty/really-tiny-falcon-testing",
         "tiiuae/falcon-rw-1b",
+        "fxmarty/really-tiny-falcon-testing",
         "vilsonrodrigues/falcon-7b-instruct-sharded",
         "tiiuae/falcon-11B",
         "euclaise/falcon_1b_stage2",
@@ -91,44 +91,28 @@
     },
     {
       "architecture_id": "Qwen3_5MoeForConditionalGeneration",
-      "total_models": 27,
+      "total_models": 28,
       "sample_models": [
         "txn545/Qwen3.5-122B-A10B-NVFP4",
         "nvidia/Qwen3.5-397B-A17B-NVFP4",
         "txn545/Qwen3.5-35B-A3B-NVFP4",
         "RepublicOfKorokke/Qwen3.5-35B-A3B-mlx-lm-mxfp4",
         "nightmedia/Qwen3.5-35B-A3B-Text-qx64-hi-mlx",
+        "lukealonso/Qwen3.5-397B-A17B-NVFP4",
         "nightmedia/Qwen3.5-122B-A10B-Text-mxfp4-mlx",
         "olka-fi/Qwen3.5-122B-A10B-MXFP4",
-        "lukealonso/Qwen3.5-397B-A17B-NVFP4",
         "Jackrong/Qwen3.5-35B-A3B-Claude-4.6-Opus-Reasoning-Distilled",
         "NexVeridian/Qwen3.5-35B-A3B-3bit"
       ]
     },
-    {
-      "architecture_id": "InternLM2ForCausalLM",
-      "total_models": 21,
-      "sample_models": [
-        "internlm/internlm2-chat-7b",
-        "internlm/internlm2_5-7b-chat",
-        "internlm/internlm2-7b",
-        "internlm/internlm2-20b",
-        "internlm/internlm2-base-7b",
-        "internlm/internlm2-chat-20b",
-        "internlm/internlm2-base-20b",
-        "chujiezheng/internlm2-chat-7b-ExPO",
-        "chujiezheng/internlm2-chat-20b-ExPO",
-        "AI4Chem/ChemLLM-7B-Chat-1_5-DPO"
-      ]
-    },
     {
       "architecture_id": "Lfm2ForCausalLM",
-      "total_models": 19,
+      "total_models": 21,
       "sample_models": [
         "LiquidAI/LFM2-1.2B",
         "LiquidAI/LFM2.5-1.2B-Instruct",
-        "LiquidAI/LFM2-350M",
         "LiquidAI/LFM2.5-1.2B-Base",
+        "LiquidAI/LFM2-350M",
         "LiquidAI/LFM2.5-1.2B-Thinking",
         "LiquidAI/LFM2-2.6B",
         "LiquidAI/LFM2-2.6B-Exp",
@@ -137,15 +121,31 @@
         "LiquidAI/LFM2.5-1.2B-Thinking-ONNX"
       ]
     },
+    {
+      "architecture_id": "InternLM2ForCausalLM",
+      "total_models": 19,
+      "sample_models": [
+        "internlm/internlm2-chat-7b",
+        "internlm/internlm2_5-7b-chat",
+        "internlm/internlm2-7b",
+        "internlm/internlm2-20b",
+        "internlm/internlm2-base-7b",
+        "internlm/internlm2-chat-20b",
+        "internlm/internlm2-base-20b",
+        "chujiezheng/internlm2-chat-20b-ExPO",
+        "chujiezheng/internlm2-chat-7b-ExPO",
+        "AI4Chem/ChemLLM-7B-Chat-1_5-DPO"
+      ]
+    },
     {
       "architecture_id": "Glm4MoeForCausalLM",
-      "total_models": 17,
+      "total_models": 18,
       "sample_models": [
         "zai-org/GLM-4.5-Air",
         "zai-org/GLM-4.7",
         "trl-internal-testing/tiny-Glm4MoeForCausalLM",
-        "zai-org/GLM-4.6",
         "zai-org/GLM-4.5",
+        "zai-org/GLM-4.6",
         "Tengyunw/GLM-4.7-NVFP4",
         "Salyut1/GLM-4.7-NVFP4",
         "np-cr/testing-glm4-moe",
@@ -158,15 +158,15 @@
       "total_models": 17,
       "sample_models": [
         "ai21labs/AI21-Jamba-Mini-1.5",
-        "ai21labs/AI21-Jamba2-3B",
         "ai21labs/Jamba-tiny-random",
+        "ai21labs/AI21-Jamba2-3B",
         "ai21labs/AI21-Jamba-Reasoning-3B",
         "ai21labs/AI21-Jamba-Large-1.5",
         "ai21labs/AI21-Jamba-Mini-1.6",
         "ai21labs/AI21-Jamba-Large-1.6",
         "microsoft/Dayhoff-170m-GR",
         "ai21labs/Jamba-v0.1",
-        "microsoft/Dayhoff-170m-UR90"
+        "microsoft/Dayhoff-170M-GRS-112000"
       ]
     },
     {
@@ -191,30 +191,30 @@
       "sample_models": [
         "tiiuae/Falcon-H1-Tiny-90M-Instruct",
         "tiiuae/Falcon-H1-0.5B-Base",
-        "tiiuae/Falcon-H1-7B-Instruct",
         "tiiuae/Falcon-H1R-7B",
-        "tiiuae/Falcon-H1-34B-Instruct",
+        "tiiuae/Falcon-H1-7B-Instruct",
         "tiiuae/Falcon-H1-34B-Base",
+        "tiiuae/Falcon-H1-34B-Instruct",
         "tiiuae/Falcon-H1-1.5B-Base",
         "tiiuae/Falcon-H1-7B-Base",
         "tiiuae/Falcon-H1-3B-Base",
-        "tiiuae/Falcon-H1-1.5B-Instruct"
+        "tiiuae/Falcon-H1-1.5B-Deep-Base"
       ]
     },
     {
-      "architecture_id": "MiniMaxM2ForCausalLM",
+      "architecture_id": "NemotronHForCausalLM",
       "total_models": 15,
       "sample_models": [
-        "MiniMaxAI/MiniMax-M2.5",
-        "MiniMaxAI/MiniMax-M2",
-        "cerebras/MiniMax-M2.1-REAP-139B-A10B",
-        "MiniMaxAI/MiniMax-M2.1",
-        "cerebras/MiniMax-M2.5-REAP-139B-A10B",
-        "PrimeIntellect/MiniMax-M2.5-bf16",
-        "cerebras/MiniMax-M2.5-REAP-172B-A10B",
-        "saricles/MiniMax-M2.5-REAP-172B-A10B-NVFP4-GB10",
-        "amd/MiniMax-M2.1-MXFP4",
-        "aspctu/MiniMax-M2.5"
+        "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16",
+        "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4",
+        "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4",
+        "nvidia/NVIDIA-Nemotron-Nano-9B-v2-Japanese",
+        "nvidia/NVIDIA-Nemotron-Nano-9B-v2",
+        "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16",
+        "unsloth/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4",
+        "OpenResearcher/OpenResearcher-30B-A3B",
+        "nvidia/NVIDIA-Nemotron-Nano-9B-v2-NVFP4",
+        "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-Base-BF16"
       ]
     },
     {
@@ -228,25 +228,25 @@
         "ibm-granite/granite-20b-code-base-8k",
         "ibm-granite/granite-20b-code-instruct-8k",
         "HuggingFaceH4/starchat-beta",
-        "bigcode/starcoderbase-3b",
         "HuggingFaceH4/starchat-alpha",
-        "openchat/opencoderplus"
+        "LoupGarou/WizardCoder-Guanaco-15B-V1.1",
+        "Danielbrdz/CodeBarcenas-1b"
       ]
     },
     {
-      "architecture_id": "NemotronHForCausalLM",
+      "architecture_id": "MiniMaxM2ForCausalLM",
       "total_models": 14,
       "sample_models": [
-        "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16",
-        "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4",
-        "nvidia/NVIDIA-Nemotron-Nano-9B-v2-Japanese",
-        "nvidia/NVIDIA-Nemotron-Nano-9B-v2",
-        "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4",
-        "unsloth/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4",
-        "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16",
-        "OpenResearcher/OpenResearcher-30B-A3B",
-        "nvidia/NVIDIA-Nemotron-Nano-9B-v2-NVFP4",
-        "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-Base-BF16"
+        "MiniMaxAI/MiniMax-M2.5",
+        "cerebras/MiniMax-M2.1-REAP-139B-A10B",
+        "MiniMaxAI/MiniMax-M2",
+        "MiniMaxAI/MiniMax-M2.1",
+        "cerebras/MiniMax-M2.5-REAP-139B-A10B",
+        "PrimeIntellect/MiniMax-M2.5-bf16",
+        "cerebras/MiniMax-M2.5-REAP-172B-A10B",
+        "saricles/MiniMax-M2.5-REAP-172B-A10B-NVFP4-GB10",
+        "aspctu/MiniMax-M2.5",
+        "amd/MiniMax-M2.1-MXFP4"
       ]
     },
     {
@@ -260,11 +260,27 @@
         "facebook/xglm-4.5B",
         "KoboldAI/fairseq-dense-125M",
         "KoboldAI/fairseq-dense-2.7B",
-        "KoboldAI/fairseq-dense-1.3B",
         "KoboldAI/fairseq-dense-355M",
+        "KoboldAI/fairseq-dense-1.3B",
         "KoboldAI/fairseq-dense-6.7B"
       ]
     },
+    {
+      "architecture_id": "Glm4MoeLiteForCausalLM",
+      "total_models": 13,
+      "sample_models": [
+        "zai-org/GLM-4.7-Flash",
+        "GadflyII/GLM-4.7-Flash-NVFP4",
+        "unsloth/GLM-4.7-Flash",
+        "GadflyII/GLM-4.7-Flash-MTP-NVFP4",
+        "Olafangensan/GLM-4.7-Flash-heretic",
+        "cerebras/GLM-4.7-Flash-REAP-23B-A3B",
+        "huihui-ai/Huihui-GLM-4.7-Flash-abliterated",
+        "TeichAI/GLM-4.7-Flash-Claude-Opus-4.5-High-Reasoning-Distill",
+        "Ex0bit/GLM-4.7-Flash-PRISM",
+        "MuXodious/GLM-4.7-Flash-absolute-heresy"
+      ]
+    },
     {
       "architecture_id": "CodeGenForCausalLM",
       "total_models": 13,
@@ -297,22 +313,6 @@
         "RWKV/rwkv-raven-7b"
       ]
     },
-    {
-      "architecture_id": "Glm4MoeLiteForCausalLM",
-      "total_models": 12,
-      "sample_models": [
-        "zai-org/GLM-4.7-Flash",
-        "GadflyII/GLM-4.7-Flash-NVFP4",
-        "unsloth/GLM-4.7-Flash",
-        "GadflyII/GLM-4.7-Flash-MTP-NVFP4",
-        "Olafangensan/GLM-4.7-Flash-heretic",
-        "huihui-ai/Huihui-GLM-4.7-Flash-abliterated",
-        "cerebras/GLM-4.7-Flash-REAP-23B-A3B",
-        "TeichAI/GLM-4.7-Flash-Claude-Opus-4.5-High-Reasoning-Distill",
-        "Ex0bit/GLM-4.7-Flash-PRISM",
-        "MuXodious/GLM-4.7-Flash-absolute-heresy"
-      ]
-    },
     {
       "architecture_id": "DeepseekV2ForCausalLM",
       "total_models": 11,
@@ -320,13 +320,13 @@
         "deepseek-ai/DeepSeek-V2-Lite-Chat",
         "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct",
         "deepseek-ai/DeepSeek-V2-Lite",
-        "deepseek-ai/DeepSeek-V2.5",
         "deepseek-ai/DeepSeek-V2-Chat",
         "deepseek-ai/DeepSeek-Coder-V2-Instruct-0724",
         "deepseek-ai/DeepSeek-V2",
+        "deepseek-ai/DeepSeek-V2.5",
         "deepseek-ai/DeepSeek-Coder-V2-Instruct",
-        "deepseek-ai/DeepSeek-Coder-V2-Lite-Base",
-        "deepseek-ai/DeepSeek-V2-Chat-0628"
+        "deepseek-ai/DeepSeek-V2-Chat-0628",
+        "deepseek-ai/DeepSeek-Coder-V2-Lite-Base"
       ]
     },
     {
@@ -350,13 +350,13 @@
       "total_models": 10,
       "sample_models": [
         "google/t5gemma-s-s-prefixlm",
-        "google/t5gemma-b-b-ul2",
         "google/t5gemma-9b-9b-ul2",
+        "google/t5gemma-b-b-ul2",
         "google/t5gemma-2b-2b-ul2",
         "google/t5gemma-b-b-prefixlm",
         "google/t5gemma-9b-9b-ul2-it",
-        "google/t5gemma-9b-2b-ul2-it",
         "google/t5gemma-2b-2b-prefixlm",
+        "google/t5gemma-9b-2b-ul2-it",
         "google/t5gemma-l-l-prefixlm",
         "harshaljanjani/tiny-t5gemma-test"
       ]
@@ -379,7 +379,7 @@
     },
     {
       "architecture_id": "DeciLMForCausalLM",
-      "total_models": 10,
+      "total_models": 9,
       "sample_models": [
         "nvidia/Llama-3_3-Nemotron-Super-49B-v1_5",
         "nvidia/Llama-3_3-Nemotron-Super-49B-v1",
@@ -389,8 +389,22 @@
         "NewstaR/Porpoise-6b-instruct",
         "Danielbrdz/Barcenas-6b",
         "nvidia/Llama-3_1-Nemotron-Ultra-253B-v1",
-        "nvidia/Llama-3_1-Nemotron-51B-Instruct",
-        "nvidia/Llama-3_3-Nemotron-Super-49B-GenRM"
+        "nvidia/Llama-3_1-Nemotron-51B-Instruct"
+      ]
+    },
+    {
+      "architecture_id": "DFlashDraftModel",
+      "total_models": 9,
+      "sample_models": [
+        "z-lab/Qwen3-4B-DFlash-b16",
+        "z-lab/Qwen3-8B-DFlash-b16",
+        "z-lab/Qwen3.5-9B-DFlash",
+        "z-lab/gpt-oss-20b-DFlash",
+        "z-lab/gpt-oss-120b-DFlash",
+        "z-lab/LLaMA3.1-8B-Instruct-DFlash-UltraChat",
+        "z-lab/Qwen3.5-35B-A3B-DFlash",
+        "z-lab/Qwen3-Coder-30B-A3B-DFlash",
+        "z-lab/Qwen3.5-4B-DFlash"
       ]
     },
     {
@@ -435,41 +449,27 @@
         "dreuxx26/Multilingual-grammar-Corrector-using-mT5-small"
       ]
     },
-    {
-      "architecture_id": "DFlashDraftModel",
-      "total_models": 8,
-      "sample_models": [
-        "z-lab/Qwen3-4B-DFlash-b16",
-        "z-lab/Qwen3-8B-DFlash-b16",
-        "z-lab/Qwen3.5-9B-DFlash",
-        "z-lab/gpt-oss-20b-DFlash",
-        "z-lab/LLaMA3.1-8B-Instruct-DFlash-UltraChat",
-        "z-lab/gpt-oss-120b-DFlash",
-        "z-lab/Qwen3.5-35B-A3B-DFlash",
-        "z-lab/Qwen3-Coder-30B-A3B-DFlash"
-      ]
-    },
     {
       "architecture_id": "Qwen3_5ForCausalLM",
       "total_models": 8,
       "sample_models": [
         "lukey03/Qwen3.5-9B-abliterated",
         "osoleve/Qwen3.5-9B-Base-Text-NVFP4",
-        "Green-eyedDevil/Monika-9B",
         "Phonsiri/Qwen3.5-9B-Thai-Law-Base",
+        "Green-eyedDevil/Monika-9B",
         "eerwitt/qwen-h-neurons-honest",
         "rahul7star/albeit",
-        "nahidstaq/html-section-retriever",
-        "nbeerbower/Huihui-Qwen3.5-9B-abliterated-Grimoire-ORPO"
+        "nbeerbower/Huihui-Qwen3.5-9B-abliterated-Grimoire-ORPO",
+        "nahidstaq/html-section-retriever"
       ]
     },
     {
       "architecture_id": "MPTForCausalLM",
       "total_models": 8,
       "sample_models": [
-        "echarlaix/tiny-mpt-random-remote-code",
         "anas-awadalla/mpt-7b",
         "wtang06/mpt-125m-c4",
+        "echarlaix/tiny-mpt-random-remote-code",
         "lightblue/japanese-mpt-7b",
         "vinai/PhoGPT-4B",
         "Nethermind/Mpt-Instruct-DotNet-S",
@@ -512,8 +512,21 @@
         "optimum-internal-testing/tiny-random-SmolLM3ForCausalLM",
         "onnx-internal-testing/tiny-random-SmolLM3ForCausalLM",
         "HuggingFaceTB/SmolLM3-3B-ONNX",
-        "toroe/SmolLM-3B-Science-ES",
-        "N-Bot-Int/SmolSam3-MEMGRPO"
+        "N-Bot-Int/SmolSam3-MEMGRPO",
+        "toroe/SmolLM-3B-Science-ES"
+      ]
+    },
+    {
+      "architecture_id": "ProGenForCausalLM",
+      "total_models": 7,
+      "sample_models": [
+        "hugohrban/progen2-base",
+        "hugohrban/progen2-small",
+        "hugohrban/progen2-medium",
+        "hugohrban/progen2-oas",
+        "hugohrban/progen2-small-mix7",
+        "hugohrban/progen2-large",
+        "hugohrban/progen2-xlarge"
       ]
     },
     {
@@ -553,15 +566,27 @@
       ]
     },
     {
-      "architecture_id": "ProGenForCausalLM",
+      "architecture_id": "NemotronForCausalLM",
       "total_models": 6,
       "sample_models": [
-        "hugohrban/progen2-small",
-        "hugohrban/progen2-base",
-        "hugohrban/progen2-medium",
-        "hugohrban/progen2-oas",
-        "hugohrban/progen2-xlarge",
-        "hugohrban/progen2-small-mix7"
+        "nvidia/Nemotron-Mini-4B-Instruct",
+        "nvidia/Minitron-8B-Base",
+        "badaoui/tiny-random-NemotronForCausalLM",
+        "nvidia/Minitron-4B-Base",
+        "thhaus/nemotron3-8b",
+        "dmvevents/Nemotron-Mini-4B-Instruct"
+      ]
+    },
+    {
+      "architecture_id": "HyenaDNAForCausalLM",
+      "total_models": 6,
+      "sample_models": [
+        "LongSafari/hyenadna-small-32k-seqlen-hf",
+        "LongSafari/hyenadna-tiny-1k-seqlen-hf",
+        "LongSafari/hyenadna-large-1m-seqlen-hf",
+        "LongSafari/hyenadna-medium-450k-seqlen-hf",
+        "LongSafari/hyenadna-medium-160k-seqlen-hf",
+        "LongSafari/hyenadna-tiny-1k-seqlen-d256-hf"
       ]
     },
     {
@@ -576,26 +601,14 @@
         "ShareGPTVideo/LLaVA-Hound-Pretrain"
       ]
     },
-    {
-      "architecture_id": "HyenaDNAForCausalLM",
-      "total_models": 6,
-      "sample_models": [
-        "LongSafari/hyenadna-small-32k-seqlen-hf",
-        "LongSafari/hyenadna-large-1m-seqlen-hf",
-        "LongSafari/hyenadna-medium-160k-seqlen-hf",
-        "LongSafari/hyenadna-medium-450k-seqlen-hf",
-        "LongSafari/hyenadna-tiny-1k-seqlen-hf",
-        "LongSafari/hyenadna-tiny-1k-seqlen-d256-hf"
-      ]
-    },
     {
       "architecture_id": "LlavaLlamaModel",
       "total_models": 6,
       "sample_models": [
         "Efficient-Large-Model/VILA1.5-3b",
-        "Efficient-Large-Model/NVILA-15B",
         "Efficient-Large-Model/NVILA-Lite-8B",
         "Efficient-Large-Model/NVILA-8B",
+        "Efficient-Large-Model/NVILA-15B",
         "Efficient-Large-Model/VILA1.5-13b",
         "Efficient-Large-Model/Llama-3-VILA1.5-8B"
       ]
@@ -619,8 +632,8 @@
         "GSAI-ML/LLaDA-8B-Instruct",
         "GSAI-ML/LLaDA-8B-Base",
         "GSAI-ML/LLaDA-1.5",
-        "Fraser/LLaDA-8B-Base-gg2m",
-        "d3LLM/d3LLM_LLaDA"
+        "d3LLM/d3LLM_LLaDA",
+        "Fraser/LLaDA-8B-Base-gg2m"
       ]
     },
     {
@@ -634,6 +647,17 @@
         "tiiuae/Falcon3-Mamba-7B-Instruct"
       ]
     },
+    {
+      "architecture_id": "DreamModel",
+      "total_models": 5,
+      "sample_models": [
+        "Dream-org/Dream-v0-Instruct-7B",
+        "Dream-org/Dream-v0-Base-7B",
+        "Dream-org/Dream-Coder-v0-Instruct-7B",
+        "d3LLM/d3LLM_Dream",
+        "Dream-org/Dream-Coder-v0-Base-7B"
+      ]
+    },
     {
       "architecture_id": "Eagle3Speculator",
       "total_models": 5,
@@ -660,8 +684,8 @@
       "architecture_id": "Ernie4_5_MoeForCausalLM",
       "total_models": 5,
       "sample_models": [
-        "baidu/ERNIE-4.5-21B-A3B-Base-PT",
         "baidu/ERNIE-4.5-21B-A3B-PT",
+        "baidu/ERNIE-4.5-21B-A3B-Base-PT",
         "baidu/ERNIE-4.5-21B-A3B-Thinking",
         "baidu/ERNIE-4.5-300B-A47B-PT",
         "baidu/ERNIE-4.5-300B-A47B-Paddle"
@@ -711,17 +735,6 @@
         "FreedomIntelligence/HuatuoGPT-Vision-7B"
       ]
     },
-    {
-      "architecture_id": "NemotronForCausalLM",
-      "total_models": 5,
-      "sample_models": [
-        "nvidia/Minitron-8B-Base",
-        "nvidia/Nemotron-Mini-4B-Instruct",
-        "badaoui/tiny-random-NemotronForCausalLM",
-        "nvidia/Minitron-4B-Base",
-        "thhaus/nemotron3-8b"
-      ]
-    },
     {
       "architecture_id": "HunYuanDenseV1ForCausalLM",
       "total_models": 5,
@@ -729,8 +742,8 @@
         "tencent/Hunyuan-7B-Instruct",
         "tencent/Hunyuan-0.5B-Pretrain",
         "tencent/Hunyuan-4B-Instruct",
-        "tencent/Hunyuan-1.8B-Instruct",
-        "tencent/Hunyuan-0.5B-Instruct"
+        "tencent/Hunyuan-0.5B-Instruct",
+        "tencent/Hunyuan-1.8B-Instruct"
       ]
     },
     {
@@ -754,23 +767,13 @@
       ]
     },
     {
-      "architecture_id": "DreamModel",
-      "total_models": 4,
-      "sample_models": [
-        "Dream-org/Dream-v0-Instruct-7B",
-        "Dream-org/Dream-v0-Base-7B",
-        "Dream-org/Dream-Coder-v0-Instruct-7B",
-        "d3LLM/d3LLM_Dream"
-      ]
-    },
-    {
-      "architecture_id": "Step3p5ForCausalLM",
+      "architecture_id": "Lfm2MoeForCausalLM",
       "total_models": 4,
       "sample_models": [
-        "stepfun-ai/Step-3.5-Flash",
-        "tacos4me/Step-3.5-Flash-NVFP4",
-        "stepfun-ai/Step-3.5-Flash-Base",
-        "shieldstackllc/Step-3.5-Flash-REAP-128B-A11B-mlx-mixed-4-6"
+        "LiquidAI/LFM2-8B-A1B",
+        "LiquidAI/LFM2-24B-A2B",
+        "huihui-ai/Huihui-LFM2-24B-A2B-abliterated",
+        "huihui-ai/Huihui-LFM2-8B-A1B-abliterated"
       ]
     },
     {
@@ -788,8 +791,8 @@
       "total_models": 4,
       "sample_models": [
         "nvidia/gpt-oss-120b-Eagle3-short-context",
-        "nvidia/gpt-oss-120b-Eagle3-long-context",
         "taobao-mnn/Qwen3-VL-8B-Instruct-Eagle3",
+        "nvidia/gpt-oss-120b-Eagle3-long-context",
         "nvidia/gpt-oss-120b-Eagle3-throughput"
       ]
     },
@@ -859,8 +862,8 @@
       "sample_models": [
         "BAAI/AquilaChat2-7B",
         "katuni4ka/tiny-random-aquila2",
-        "katuni4ka/tiny-random-aquilachat",
-        "BAAI/Aquila2-34B"
+        "BAAI/Aquila2-34B",
+        "katuni4ka/tiny-random-aquilachat"
       ]
     },
     {
@@ -932,17 +935,26 @@
       "total_models": 3,
       "sample_models": [
         "zai-org/GLM-5",
-        "yujiepan/glm-5-tiny-random",
+        "nvidia/GLM-5-NVFP4",
         "cs2764/GLM-5_dq3-mlx"
       ]
     },
     {
-      "architecture_id": "Zamba2ForCausalLM",
+      "architecture_id": "Step3p5ForCausalLM",
       "total_models": 3,
       "sample_models": [
-        "Zyphra/Zamba2-1.2B-instruct",
-        "Zyphra/Zamba2-7B-Instruct",
-        "Zyphra/Zamba2-2.7B"
+        "stepfun-ai/Step-3.5-Flash",
+        "tacos4me/Step-3.5-Flash-NVFP4",
+        "stepfun-ai/Step-3.5-Flash-Base"
+      ]
+    },
+    {
+      "architecture_id": "Zamba2ForCausalLM",
+      "total_models": 3,
+      "sample_models": [
+        "Zyphra/Zamba2-1.2B-instruct",
+        "Zyphra/Zamba2-7B-Instruct",
+        "Zyphra/Zamba2-2.7B"
       ]
     },
     {
@@ -968,8 +980,8 @@
       "total_models": 3,
       "sample_models": [
         "nvidia/Nemotron-Flash-3B",
-        "nvidia/Nemotron-Flash-1B",
-        "nvidia/Nemotron-Flash-3B-Instruct"
+        "nvidia/Nemotron-Flash-3B-Instruct",
+        "nvidia/Nemotron-Flash-1B"
       ]
     },
     {
@@ -981,6 +993,15 @@
         "srs6901/SOLARized-GraniStral-14B_2102_YeAM-HCT_32QKV"
       ]
     },
+    {
+      "architecture_id": "Llama4ForConditionalGeneration",
+      "total_models": 3,
+      "sample_models": [
+        "RedHatAI/Llama-4-Scout-17B-16E-Instruct-NVFP4",
+        "yujiepan/llama-4-tiny-random",
+        "RedHatAI/Llama-4-Maverick-17B-128E-Instruct-NVFP4"
+      ]
+    },
     {
       "architecture_id": "PersimmonForCausalLM",
       "total_models": 3,
@@ -1004,8 +1025,8 @@
       "total_models": 3,
       "sample_models": [
         "trillionlabs/Tri-21B-Think",
-        "trillionlabs/Tri-21B-Think-Preview",
-        "trillionlabs/Tri-21B"
+        "trillionlabs/Tri-21B",
+        "trillionlabs/Tri-21B-Think-Preview"
       ]
     },
     {
@@ -1017,6 +1038,15 @@
         "HuggingFaceM4/idefics-9b-instruct"
       ]
     },
+    {
+      "architecture_id": "OLMoForCausalLM",
+      "total_models": 3,
+      "sample_models": [
+        "allenai/OLMo-1B",
+        "allenai/OLMo-7B-Instruct",
+        "allenai/OLMo-7B"
+      ]
+    },
     {
       "architecture_id": "modeling_camelidae.LlamaForCausalLM",
       "total_models": 3,
@@ -1070,19 +1100,19 @@
       ]
     },
     {
-      "architecture_id": "OpenAIGPTLMHeadModel",
+      "architecture_id": "HCXVisionV2ForCausalLM",
       "total_models": 2,
       "sample_models": [
-        "openai-community/openai-gpt",
-        "lgaalves/gpt1"
+        "naver-hyperclovax/HyperCLOVAX-SEED-Omni-8B",
+        "naver-hyperclovax/HyperCLOVAX-SEED-Think-32B"
       ]
     },
     {
-      "architecture_id": "HCXVisionV2ForCausalLM",
+      "architecture_id": "OpenAIGPTLMHeadModel",
       "total_models": 2,
       "sample_models": [
-        "naver-hyperclovax/HyperCLOVAX-SEED-Omni-8B",
-        "naver-hyperclovax/HyperCLOVAX-SEED-Think-32B"
+        "openai-community/openai-gpt",
+        "lgaalves/gpt1"
       ]
     },
     {
@@ -1110,11 +1140,11 @@
       ]
     },
     {
-      "architecture_id": "Lfm2MoeForCausalLM",
+      "architecture_id": "BartForConditionalGeneration",
       "total_models": 2,
       "sample_models": [
-        "LiquidAI/LFM2-8B-A1B",
-        "LiquidAI/LFM2-24B-A2B"
+        "KomeijiForce/bart-large-emojilm",
+        "Nargizi/screeve-lemmatizer"
       ]
     },
     {
@@ -1125,14 +1155,6 @@
         "starvector/starvector-8b-im2svg"
       ]
     },
-    {
-      "architecture_id": "DbrxForCausalLM",
-      "total_models": 2,
-      "sample_models": [
-        "trl-internal-testing/tiny-DbrxForCausalLM",
-        "katuni4ka/tiny-random-dbrx"
-      ]
-    },
     {
       "architecture_id": "KimiLinearForCausalLM",
       "total_models": 2,
@@ -1142,11 +1164,11 @@
       ]
     },
     {
-      "architecture_id": "BartForConditionalGeneration",
+      "architecture_id": "DbrxForCausalLM",
       "total_models": 2,
       "sample_models": [
-        "Nargizi/screeve-lemmatizer",
-        "KomeijiForce/bart-large-emojilm"
+        "trl-internal-testing/tiny-DbrxForCausalLM",
+        "katuni4ka/tiny-random-dbrx"
       ]
     },
     {
@@ -1173,14 +1195,6 @@
         "facebook/MobileLLM-R1-950M"
       ]
     },
-    {
-      "architecture_id": "BailingMoeV2_5ForCausalLM",
-      "total_models": 2,
-      "sample_models": [
-        "inclusionAI/Ring-2.5-1T",
-        "inclusionAI/Ling-2.5-1T"
-      ]
-    },
     {
       "architecture_id": "Phi3SmallForCausalLM",
       "total_models": 2,
@@ -1193,8 +1207,8 @@
       "architecture_id": "MiniMaxM1ForCausalLM",
       "total_models": 2,
       "sample_models": [
-        "MiniMaxAI/MiniMax-M1-80k",
-        "MiniMaxAI/MiniMax-M1-40k"
+        "MiniMaxAI/MiniMax-M1-40k",
+        "MiniMaxAI/MiniMax-M1-80k"
       ]
     },
     {
@@ -1222,11 +1236,11 @@
       ]
     },
     {
-      "architecture_id": "Llama4ForConditionalGeneration",
+      "architecture_id": "InternVLChatModel",
       "total_models": 2,
       "sample_models": [
-        "RedHatAI/Llama-4-Scout-17B-16E-Instruct-NVFP4",
-        "yujiepan/llama-4-tiny-random"
+        "numind/NuExtract-2-4B-experimental",
+        "numind/NuExtract-2-8B-experimental"
       ]
     },
     {
@@ -1246,11 +1260,11 @@
       ]
     },
     {
-      "architecture_id": "InternVLChatModel",
+      "architecture_id": "XverseForCausalLM",
       "total_models": 2,
       "sample_models": [
-        "numind/NuExtract-2-4B-experimental",
-        "numind/NuExtract-2-8B-experimental"
+        "xverse/XVERSE-7B-Chat",
+        "katuni4ka/tiny-random-xverse"
       ]
     },
     {
@@ -1270,11 +1284,11 @@
       ]
     },
     {
-      "architecture_id": "XverseForCausalLM",
+      "architecture_id": "AXK1ForCausalLM",
       "total_models": 2,
       "sample_models": [
-        "xverse/XVERSE-7B-Chat",
-        "katuni4ka/tiny-random-xverse"
+        "skt/A.X-K1",
+        "thkim93/axk1-2layers"
       ]
     },
     {
@@ -1309,14 +1323,6 @@
         "tencent/Penguin-VL-2B"
       ]
     },
-    {
-      "architecture_id": "Qwen3VLForConditionalGeneration",
-      "total_models": 2,
-      "sample_models": [
-        "RedHatAI/Qwen3-VL-32B-Instruct-NVFP4",
-        "Goekdeniz-Guelmez/Josiefied-Qwen3-VL-4B-Instruct-abliterated-beta-v1"
-      ]
-    },
     {
       "architecture_id": "MolformerForCausalLM",
       "total_models": 2,
@@ -1326,27 +1332,19 @@
       ]
     },
     {
-      "architecture_id": "Rwkv6ForCausalLM",
-      "total_models": 2,
-      "sample_models": [
-        "RWKV/v6-Finch-1B6-HF",
-        "RWKV/v6-Finch-14B-HF"
-      ]
-    },
-    {
-      "architecture_id": "OLMoForCausalLM",
+      "architecture_id": "GLAForCausalLM",
       "total_models": 2,
       "sample_models": [
-        "allenai/OLMo-7B",
-        "allenai/OLMo-1B"
+        "fla-hub/gla-340M-15B",
+        "fla-hub/gla-1.3B-100B"
       ]
     },
     {
-      "architecture_id": "BolmoForCausalLM",
+      "architecture_id": "MosaicGPT",
       "total_models": 2,
       "sample_models": [
-        "allenai/Bolmo-7B",
-        "allenai/Bolmo-1B"
+        "anas-awadalla/mpt-1b-redpajama-200b",
+        "anas-awadalla/mpt-1b-redpajama-200b-dolly"
       ]
     },
     {
@@ -1358,19 +1356,11 @@
       ]
     },
     {
-      "architecture_id": "GLAForCausalLM",
-      "total_models": 2,
-      "sample_models": [
-        "fla-hub/gla-340M-15B",
-        "fla-hub/gla-1.3B-100B"
-      ]
-    },
-    {
-      "architecture_id": "MosaicGPT",
+      "architecture_id": "BolmoForCausalLM",
       "total_models": 2,
       "sample_models": [
-        "anas-awadalla/mpt-1b-redpajama-200b",
-        "anas-awadalla/mpt-1b-redpajama-200b-dolly"
+        "allenai/Bolmo-7B",
+        "allenai/Bolmo-1B"
       ]
     },
     {
@@ -1405,20 +1395,12 @@
         "tencent/Youtu-LLM-2B"
       ]
     },
-    {
-      "architecture_id": "BottleneckT5LMWithPerturb",
-      "total_models": 2,
-      "sample_models": [
-        "thesephist/contra-bottleneck-t5-base-wikipedia",
-        "thesephist/contra-bottleneck-t5-large-wikipedia"
-      ]
-    },
     {
       "architecture_id": "ParamBharatGenForCausalLM",
       "total_models": 2,
       "sample_models": [
-        "bharatgenai/AyurParam",
-        "bharatgenai/Param-1-2.9B-Instruct"
+        "bharatgenai/Param-1-2.9B-Instruct",
+        "bharatgenai/AyurParam"
       ]
     },
     {
@@ -1438,19 +1420,19 @@
       ]
     },
     {
-      "architecture_id": "MptForCausalLM",
+      "architecture_id": "BottleneckT5LMWithPerturb",
       "total_models": 2,
       "sample_models": [
-        "team-lucid/mptk-1b",
-        "explosion-testing/mpt-test"
+        "thesephist/contra-bottleneck-t5-base-wikipedia",
+        "thesephist/contra-bottleneck-t5-large-wikipedia"
       ]
     },
     {
-      "architecture_id": "InstellaForCausalLM",
+      "architecture_id": "MptForCausalLM",
       "total_models": 2,
       "sample_models": [
-        "amd/Instella-3B",
-        "amd/Instella-3B-Instruct"
+        "team-lucid/mptk-1b",
+        "explosion-testing/mpt-test"
       ]
     },
     {
@@ -1524,13 +1506,6 @@
         "baichuan-inc/Baichuan-7B"
       ]
     },
-    {
-      "architecture_id": "GPTRefactForCausalLM",
-      "total_models": 1,
-      "sample_models": [
-        "refactai/Refact-1_6B-fim"
-      ]
-    },
     {
       "architecture_id": "SarvamMoEForCausalLM",
       "total_models": 1,
@@ -1546,10 +1521,10 @@
       ]
     },
     {
-      "architecture_id": "ExaoneMoEForCausalLM",
+      "architecture_id": "GPTRefactForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "LGAI-EXAONE/K-EXAONE-236B-A23B"
+        "refactai/Refact-1_6B-fim"
       ]
     },
     {
@@ -1559,6 +1534,13 @@
         "naver-hyperclovax/HyperCLOVAX-SEED-Think-14B"
       ]
     },
+    {
+      "architecture_id": "ExaoneMoEForCausalLM",
+      "total_models": 1,
+      "sample_models": [
+        "LGAI-EXAONE/K-EXAONE-236B-A23B"
+      ]
+    },
     {
       "architecture_id": "HunYuanMoEV1ForCausalLM",
       "total_models": 1,
@@ -1574,10 +1556,10 @@
       ]
     },
     {
-      "architecture_id": "JetNemotronForCausalLM",
+      "architecture_id": "BailingMoeV2_5ForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "jet-ai/Jet-Nemotron-2B"
+        "inclusionAI/Ring-2.5-1T"
       ]
     },
     {
@@ -1588,10 +1570,10 @@
       ]
     },
     {
-      "architecture_id": "Grok1ModelForCausalLM",
+      "architecture_id": "JetNemotronForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "hpcai-tech/grok-1"
+        "jet-ai/Jet-Nemotron-2B"
       ]
     },
     {
@@ -1602,17 +1584,17 @@
       ]
     },
     {
-      "architecture_id": "Qwen3VLMoeForConditionalGeneration",
+      "architecture_id": "Grok1ModelForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "RedHatAI/Qwen3-VL-235B-A22B-Instruct-NVFP4"
+        "hpcai-tech/grok-1"
       ]
     },
     {
-      "architecture_id": "Emu3ForCausalLM",
+      "architecture_id": "Qwen3VLMoeForConditionalGeneration",
       "total_models": 1,
       "sample_models": [
-        "BAAI/Emu3-Chat"
+        "RedHatAI/Qwen3-VL-235B-A22B-Instruct-NVFP4"
       ]
     },
     {
@@ -1623,17 +1605,17 @@
       ]
     },
     {
-      "architecture_id": "GRIN-MoE",
+      "architecture_id": "Emu3ForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "microsoft/GRIN-MoE"
+        "BAAI/Emu3-Chat"
       ]
     },
     {
-      "architecture_id": "MiniMaxForCausalLM",
+      "architecture_id": "GRIN-MoE",
       "total_models": 1,
       "sample_models": [
-        "MiniMaxAI/MiniMax-Text-01-hf"
+        "microsoft/GRIN-MoE"
       ]
     },
     {
@@ -1643,6 +1625,13 @@
         "nguyenvulebinh/AV-HuBERT-MuAViC-en"
       ]
     },
+    {
+      "architecture_id": "MiniMaxForCausalLM",
+      "total_models": 1,
+      "sample_models": [
+        "MiniMaxAI/MiniMax-Text-01-hf"
+      ]
+    },
     {
       "architecture_id": "ArcticForCausalLM",
       "total_models": 1,
@@ -1672,24 +1661,24 @@
       ]
     },
     {
-      "architecture_id": "Plamo3ForCausalLM",
+      "architecture_id": "SarvamMLAForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "pfnet/plamo-3-nict-2b-base"
+        "sarvamai/sarvam-105b"
       ]
     },
     {
-      "architecture_id": "InternLMXComposer2ForCausalLM",
+      "architecture_id": "Plamo3ForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "internlm/internlm-xcomposer2-7b"
+        "pfnet/plamo-3-nict-2b-base"
       ]
     },
     {
-      "architecture_id": "SarvamMLAForCausalLM",
+      "architecture_id": "InternLMXComposer2ForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "sarvamai/sarvam-105b"
+        "internlm/internlm-xcomposer2-7b"
       ]
     },
     {
@@ -1699,13 +1688,6 @@
         "haitengzhao/gimlet"
       ]
     },
-    {
-      "architecture_id": "CheXagentForCausalLM",
-      "total_models": 1,
-      "sample_models": [
-        "StanfordAIMI/CheXagent-2-3b"
-      ]
-    },
     {
       "architecture_id": "InternLMXComposerForCausalLM",
       "total_models": 1,
@@ -1728,10 +1710,10 @@
       ]
     },
     {
-      "architecture_id": "AXK1ForCausalLM",
+      "architecture_id": "CheXagentForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "skt/A.X-K1"
+        "StanfordAIMI/CheXagent-2-3b"
       ]
     },
     {
@@ -1755,6 +1737,27 @@
         "fla-hub/transformer-1.3B-100B"
       ]
     },
+    {
+      "architecture_id": "Qwen3VLForConditionalGeneration",
+      "total_models": 1,
+      "sample_models": [
+        "RedHatAI/Qwen3-VL-32B-Instruct-NVFP4"
+      ]
+    },
+    {
+      "architecture_id": "Rwkv6ForCausalLM",
+      "total_models": 1,
+      "sample_models": [
+        "RWKV/v6-Finch-1B6-HF"
+      ]
+    },
+    {
+      "architecture_id": "CambrianQwenForCausalLM",
+      "total_models": 1,
+      "sample_models": [
+        "nyu-visionx/Scale-RAE-Qwen1.5B_DiT2.4B"
+      ]
+    },
     {
       "architecture_id": "VaultGemmaForCausalLM",
       "total_models": 1,
@@ -1776,13 +1779,6 @@
         "openbmb/NOSA-8B"
       ]
     },
-    {
-      "architecture_id": "CambrianQwenForCausalLM",
-      "total_models": 1,
-      "sample_models": [
-        "nyu-visionx/Scale-RAE-Qwen1.5B_DiT2.4B"
-      ]
-    },
     {
       "architecture_id": "SpatialLMQwenForCausalLM",
       "total_models": 1,
@@ -1818,6 +1814,13 @@
         "SeerAttention/SeerAttention-Llama-3.1-8B-AttnGates"
       ]
     },
+    {
+      "architecture_id": "RavenForCausalLM",
+      "total_models": 1,
+      "sample_models": [
+        "tomg-group-umd/huginn-0125"
+      ]
+    },
     {
       "architecture_id": "GeoChatLlamaForCausalLM",
       "total_models": 1,
@@ -1826,10 +1829,10 @@
       ]
     },
     {
-      "architecture_id": "RavenForCausalLM",
+      "architecture_id": "Param2MoEForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "tomg-group-umd/huginn-0125"
+        "bharatgenai/Param2-17B-A2.4B-Thinking"
       ]
     },
     {
@@ -1839,6 +1842,13 @@
         "ServiceNow-AI/Apriel-5B-Instruct"
       ]
     },
+    {
+      "architecture_id": "PanguEmbeddedForCausalLM",
+      "total_models": 1,
+      "sample_models": [
+        "FreedomIntelligence/openPangu-Embedded-1B"
+      ]
+    },
     {
       "architecture_id": "Phi4MMForCausalLM",
       "total_models": 1,
@@ -1861,31 +1871,24 @@
       ]
     },
     {
-      "architecture_id": "PanguEmbeddedForCausalLM",
-      "total_models": 1,
-      "sample_models": [
-        "FreedomIntelligence/openPangu-Embedded-1B"
-      ]
-    },
-    {
-      "architecture_id": "Param2MoEForCausalLM",
+      "architecture_id": "GiddForDiffusionLM",
       "total_models": 1,
       "sample_models": [
-        "bharatgenai/Param2-17B-A2.4B-Thinking"
+        "dvruette/gidd-unif-3b"
       ]
     },
     {
-      "architecture_id": "GiddForDiffusionLM",
+      "architecture_id": "SteerlingForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "dvruette/gidd-unif-3b"
+        "guidelabs/steerling-8b"
       ]
     },
     {
-      "architecture_id": "TorchMultiOmicsModel",
+      "architecture_id": "StableLMAlphaForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "InstaDeepAI/ChatNT"
+        "stabilityai/stablelm-base-alpha-7b-v2"
       ]
     },
     {
@@ -1896,10 +1899,10 @@
       ]
     },
     {
-      "architecture_id": "StableLMAlphaForCausalLM",
+      "architecture_id": "CheXagentForConditionalGeneration",
       "total_models": 1,
       "sample_models": [
-        "stabilityai/stablelm-base-alpha-7b-v2"
+        "StanfordAIMI/CheXagent-8b"
       ]
     },
     {
@@ -1909,13 +1912,6 @@
         "MiniMaxAI/MiniMax-Text-01"
       ]
     },
-    {
-      "architecture_id": "SteerlingForCausalLM",
-      "total_models": 1,
-      "sample_models": [
-        "guidelabs/steerling-8b"
-      ]
-    },
     {
       "architecture_id": "LamedPhi3ForCausalLM",
       "total_models": 1,
@@ -1924,45 +1920,45 @@
       ]
     },
     {
-      "architecture_id": "Phi4FlashForCausalLM",
+      "architecture_id": "TorchMultiOmicsModel",
       "total_models": 1,
       "sample_models": [
-        "microsoft/Phi-4-mini-flash-reasoning"
+        "InstaDeepAI/ChatNT"
       ]
     },
     {
-      "architecture_id": "CheXagentForConditionalGeneration",
+      "architecture_id": "MobileLlamaForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "StanfordAIMI/CheXagent-8b"
+        "mtgv/MobileVLM_V2-1.7B"
       ]
     },
     {
-      "architecture_id": "Kanana2VecModel",
+      "architecture_id": "Phi4FlashForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "kakaocorp/kanana-nano-2.1b-embedding"
+        "microsoft/Phi-4-mini-flash-reasoning"
       ]
     },
     {
-      "architecture_id": "GPT3DevLMHeadModel",
+      "architecture_id": "DeciCoderForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "k050506koch/GPT3-dev-350m-2805"
+        "Deci/DeciCoder-1b"
       ]
     },
     {
-      "architecture_id": "DeciCoderForCausalLM",
+      "architecture_id": "GPT3DevLMHeadModel",
       "total_models": 1,
       "sample_models": [
-        "Deci/DeciCoder-1b"
+        "k050506koch/GPT3-dev-350m-2805"
       ]
     },
     {
-      "architecture_id": "MobileLlamaForCausalLM",
+      "architecture_id": "Qwen2VLForConditionalGeneration",
       "total_models": 1,
       "sample_models": [
-        "mtgv/MobileVLM_V2-1.7B"
+        "yujiepan/qwen2-vl-tiny-random"
       ]
     },
     {
@@ -1973,31 +1969,31 @@
       ]
     },
     {
-      "architecture_id": "Qwen2VLForConditionalGeneration",
+      "architecture_id": "Kanana2VecModel",
       "total_models": 1,
       "sample_models": [
-        "yujiepan/qwen2-vl-tiny-random"
+        "kakaocorp/kanana-nano-2.1b-embedding"
       ]
     },
     {
-      "architecture_id": "LLaDAMoEModel",
+      "architecture_id": "EchoForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "inclusionAI/LLaDA-MoE-7B-A1B-Base"
+        "ethicalabs/Echo-DSRN-486M-v0.7.6-SFT"
       ]
     },
     {
-      "architecture_id": "DogeForCausalLM",
+      "architecture_id": "CTRLLMHeadModel",
       "total_models": 1,
       "sample_models": [
-        "SmallDoge/Doge-20M"
+        "sshleifer/tiny-ctrl"
       ]
     },
     {
-      "architecture_id": "CTRLLMHeadModel",
+      "architecture_id": "LLaDAMoEModel",
       "total_models": 1,
       "sample_models": [
-        "sshleifer/tiny-ctrl"
+        "inclusionAI/LLaDA-MoE-7B-A1B-Base"
       ]
     },
     {
@@ -2022,10 +2018,10 @@
       ]
     },
     {
-      "architecture_id": "BD3LM",
+      "architecture_id": "DogeForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "kuleshov-group/bd3lm-owt-block_size4"
+        "SmallDoge/Doge-20M"
       ]
     },
     {
@@ -2035,6 +2031,13 @@
         "meituan-longcat/LongCat-Flash-Lite"
       ]
     },
+    {
+      "architecture_id": "GPT",
+      "total_models": 1,
+      "sample_models": [
+        "LH-Tech-AI/Apex-1.5-Coder-Instruct-350M"
+      ]
+    },
     {
       "architecture_id": "GPT2CustomLMHeadModel",
       "total_models": 1,
@@ -2043,10 +2046,10 @@
       ]
     },
     {
-      "architecture_id": "CircuitGPTForCausalLM",
+      "architecture_id": "SKTOmniForConditionalGeneration",
       "total_models": 1,
       "sample_models": [
-        "openai/circuit-sparsity"
+        "Shrijanagain/SKT_OMNI_SUPREME"
       ]
     },
     {
@@ -2057,10 +2060,24 @@
       ]
     },
     {
-      "architecture_id": "SpatialLMLlamaForCausalLM",
+      "architecture_id": "CircuitGPTForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "manycore-research/SpatialLM1.1-Llama-1B"
+        "openai/circuit-sparsity"
+      ]
+    },
+    {
+      "architecture_id": "Qwen3TSForCausalLM",
+      "total_models": 1,
+      "sample_models": [
+        "bytedance-research/ChatTS-8B"
+      ]
+    },
+    {
+      "architecture_id": "ConditionalGPT",
+      "total_models": 1,
+      "sample_models": [
+        "c-bone/CrystaLLM-pi_bandgap"
       ]
     },
     {
@@ -2078,10 +2095,24 @@
       ]
     },
     {
-      "architecture_id": "ConditionalGPT",
+      "architecture_id": "BD3LM",
       "total_models": 1,
       "sample_models": [
-        "c-bone/CrystaLLM-pi_bandgap"
+        "kuleshov-group/bd3lm-owt-block_size4"
+      ]
+    },
+    {
+      "architecture_id": "AeroForConditionalGeneration",
+      "total_models": 1,
+      "sample_models": [
+        "lmms-lab/Aero-1-Audio"
+      ]
+    },
+    {
+      "architecture_id": "KORMoForCausalLM",
+      "total_models": 1,
+      "sample_models": [
+        "KORMo-Team/KORMo-10B-sft"
       ]
     },
     {
@@ -2105,6 +2136,13 @@
         "Zyphra/Zamba-7B-v1"
       ]
     },
+    {
+      "architecture_id": "PolyLMHeadModel",
+      "total_models": 1,
+      "sample_models": [
+        "DAMO-NLP-MT/polylm-13b"
+      ]
+    },
     {
       "architecture_id": "RecursiveLanguageModel",
       "total_models": 1,
@@ -2113,17 +2151,17 @@
       ]
     },
     {
-      "architecture_id": "PolyLMHeadModel",
+      "architecture_id": "SpatialLMLlamaForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "DAMO-NLP-MT/polylm-13b"
+        "manycore-research/SpatialLM1.1-Llama-1B"
       ]
     },
     {
-      "architecture_id": "Qwen3TSForCausalLM",
+      "architecture_id": "PointLLMLlamaForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "bytedance-research/ChatTS-8B"
+        "RunsenXu/PointLLM_7B_v1.2"
       ]
     },
     {
@@ -2134,17 +2172,17 @@
       ]
     },
     {
-      "architecture_id": "PointLLMLlamaForCausalLM",
+      "architecture_id": "SongGenMixedForConditionalGeneration",
       "total_models": 1,
       "sample_models": [
-        "RunsenXu/PointLLM_7B_v1.2"
+        "LiuZH-19/SongGen_mixed_pro"
       ]
     },
     {
-      "architecture_id": "SongGenMixedForConditionalGeneration",
+      "architecture_id": "DUO",
       "total_models": 1,
       "sample_models": [
-        "LiuZH-19/SongGen_mixed_pro"
+        "s-sahoo/duo-distilled"
       ]
     },
     {
@@ -2155,31 +2193,31 @@
       ]
     },
     {
-      "architecture_id": "BertLMHeadModel",
+      "architecture_id": "BailingMoeLinearV2ForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "dicta-il/BEREL_3.0"
+        "inclusionAI/Ring-mini-linear-2.0"
       ]
     },
     {
-      "architecture_id": "BailingMoeLinearV2ForCausalLM",
+      "architecture_id": "BertLMHeadModel",
       "total_models": 1,
       "sample_models": [
-        "inclusionAI/Ring-mini-linear-2.0"
+        "dicta-il/BEREL_3.0"
       ]
     },
     {
-      "architecture_id": "AeroForConditionalGeneration",
+      "architecture_id": "Glm4MoeLiteSonicForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "lmms-lab/Aero-1-Audio"
+        "rpDungeon/GLM-4.7-Flash-SonicMOE"
       ]
     },
     {
-      "architecture_id": "DUO",
+      "architecture_id": "Bagel",
       "total_models": 1,
       "sample_models": [
-        "s-sahoo/duo-distilled"
+        "lmms-lab/BAGEL-7B-MoT-ver.LE"
       ]
     },
     {
@@ -2190,17 +2228,17 @@
       ]
     },
     {
-      "architecture_id": "CambrianLlamaForCausalLM",
+      "architecture_id": "KonkanGPT",
       "total_models": 1,
       "sample_models": [
-        "nyu-visionx/cambrian-8b"
+        "omdeep22/Gonyai-v1"
       ]
     },
     {
-      "architecture_id": "Glm4MoeLiteSonicForCausalLM",
+      "architecture_id": "Qwen3OmniMoeThinkerForConditionalGeneration",
       "total_models": 1,
       "sample_models": [
-        "rpDungeon/GLM-4.7-Flash-SonicMOE"
+        "ngqtrung/Qwen3-Omni-Thinker-30B-Instruct"
       ]
     },
     {
@@ -2211,38 +2249,31 @@
       ]
     },
     {
-      "architecture_id": "KonkanGPT",
-      "total_models": 1,
-      "sample_models": [
-        "omdeep22/Gonyai-v1"
-      ]
-    },
-    {
-      "architecture_id": "Bagel",
+      "architecture_id": "MonoidForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "lmms-lab/BAGEL-7B-MoT-ver.LE"
+        "NoesisLab/Spartacus-1B-Instruct"
       ]
     },
     {
-      "architecture_id": "KORMoForCausalLM",
+      "architecture_id": "ErnieForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "KORMo-Team/KORMo-10B-sft"
+        "mohitsha/tiny-ernie-random-remote-code"
       ]
     },
     {
-      "architecture_id": "MonoidForCausalLM",
+      "architecture_id": "TransnormerForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "NoesisLab/Spartacus-1B-Instruct"
+        "OpenNLPLab/TransNormerLLM-385M"
       ]
     },
     {
-      "architecture_id": "KimiForCausalLM",
+      "architecture_id": "PKVGPT",
       "total_models": 1,
       "sample_models": [
-        "applexml/kimi-k2-poc2"
+        "c-bone/CrystaLLM-pi_SLME"
       ]
     },
     {
@@ -2253,10 +2284,10 @@
       ]
     },
     {
-      "architecture_id": "ErnieForCausalLM",
+      "architecture_id": "OpenLMForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "mohitsha/tiny-ernie-random-remote-code"
+        "nick11roberts/SL-discrep-chinchilla-rw-params5M_maxstep760-flop_1_25e16_step_767"
       ]
     },
     {
@@ -2272,6 +2303,13 @@
       "sample_models": [
         "nvidia/Hymba-1.5B-Instruct"
       ]
+    },
+    {
+      "architecture_id": "LlamaMoEForCausalLM",
+      "total_models": 1,
+      "sample_models": [
+        "llama-moe/LLaMA-MoE-v1-3_5B-2_8"
+      ]
     }
   ]
 }
\ No newline at end of file
diff --git a/transformer_lens/tools/model_registry/data/supported_models.json b/transformer_lens/tools/model_registry/data/supported_models.json
index 9ed04ae23..fad7bcdf5 100644
--- a/transformer_lens/tools/model_registry/data/supported_models.json
+++ b/transformer_lens/tools/model_registry/data/supported_models.json
@@ -1,14 +1,14 @@
 {
-  "generated_at": "2026-03-18",
+  "generated_at": "2026-03-19",
   "scan_info": {
-    "total_scanned": 3426,
+    "total_scanned": 3517,
     "task_filter": "text-generation",
     "min_downloads": 500,
-    "scan_duration_seconds": 2.4
+    "scan_duration_seconds": 2.7
   },
-  "total_architectures": 33,
-  "total_models": 5764,
-  "total_verified": 673,
+  "total_architectures": 35,
+  "total_models": 5833,
+  "total_verified": 677,
   "models": [
     {
       "architecture_id": "Qwen2ForCausalLM",
@@ -17050,9 +17050,9 @@
       "phase1_score": 100.0,
       "phase2_score": 100.0,
       "phase3_score": 100.0,
-      "status_label": "UNVERIFIED",
       "phase4_score": 97.8,
-      "phase7_score": null
+      "phase7_score": null,
+      "status_label": "UNVERIFIED"
     },
     {
       "architecture_id": "OPTForCausalLM",
@@ -20665,9 +20665,9 @@
       "phase1_score": 100.0,
       "phase2_score": 100.0,
       "phase3_score": 100.0,
-      "status_label": "UNVERIFIED",
       "phase4_score": null,
-      "phase7_score": null
+      "phase7_score": null,
+      "status_label": "UNVERIFIED"
     },
     {
       "architecture_id": "Qwen2ForCausalLM",
@@ -32834,9 +32834,9 @@
       "phase1_score": 100.0,
       "phase2_score": 100.0,
       "phase3_score": 100.0,
-      "status_label": "UNVERIFIED",
       "phase4_score": null,
-      "phase7_score": null
+      "phase7_score": null,
+      "status_label": "UNVERIFIED"
     },
     {
       "architecture_id": "LlamaForCausalLM",
@@ -34551,9 +34551,9 @@
       "phase1_score": 100.0,
       "phase2_score": 100.0,
       "phase3_score": 100.0,
-      "status_label": "UNVERIFIED",
       "phase4_score": null,
-      "phase7_score": null
+      "phase7_score": null,
+      "status_label": "UNVERIFIED"
     },
     {
       "architecture_id": "LlamaForCausalLM",
@@ -39830,9 +39830,9 @@
       "phase1_score": 100.0,
       "phase2_score": 100.0,
       "phase3_score": 100.0,
-      "status_label": "UNVERIFIED",
       "phase4_score": null,
-      "phase7_score": null
+      "phase7_score": null,
+      "status_label": "UNVERIFIED"
     },
     {
       "architecture_id": "GPTNeoXForCausalLM",
@@ -60618,9 +60618,9 @@
       "phase1_score": 100.0,
       "phase2_score": 100.0,
       "phase3_score": 100.0,
-      "status_label": "UNVERIFIED",
       "phase4_score": null,
-      "phase7_score": null
+      "phase7_score": null,
+      "status_label": "UNVERIFIED"
     },
     {
       "architecture_id": "LlamaForCausalLM",
@@ -73370,6 +73370,777 @@
       "phase1_score": null,
       "phase2_score": null,
       "phase3_score": null
+    },
+    {
+      "architecture_id": "HubertForCTC",
+      "model_id": "facebook/hubert-large-ls960-ft",
+      "status": 1,
+      "verified_date": "2026-03-19",
+      "metadata": null,
+      "note": "Full verification completed",
+      "phase1_score": 100.0,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": 100.0
+    },
+    {
+      "architecture_id": "HubertForCTC",
+      "model_id": "facebook/hubert-xlarge-ls960-ft",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "HubertForCTC",
+      "model_id": "prj-beatrice/japanese-hubert-base-phoneme-ctc-v4",
+      "status": 1,
+      "verified_date": "2026-03-19",
+      "metadata": null,
+      "note": "Full verification completed",
+      "phase1_score": 100.0,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": 100.0
+    },
+    {
+      "architecture_id": "HubertModel",
+      "model_id": "team-lucid/hubert-base-korean",
+      "status": 1,
+      "verified_date": "2026-03-19",
+      "metadata": null,
+      "note": "Full verification completed",
+      "phase1_score": 100.0,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": 100.0
+    },
+    {
+      "architecture_id": "HubertForCTC",
+      "model_id": "utakumi/Hubert-kakeiken-W-incar",
+      "status": 1,
+      "verified_date": "2026-03-19",
+      "metadata": null,
+      "note": "Full verification completed",
+      "phase1_score": 100.0,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": 100.0
+    },
+    {
+      "architecture_id": "MistralForCausalLM",
+      "model_id": "DavidAU/MN-CaptainErisNebula-12B-Chimera-v1.1-heretic-uncensored-abliterated",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "nick00991/Qwen3-0.6B-Gensyn-Swarm-finicky_bristly_lion",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "deqing/llama-300M-v5-fivegram",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "deqing/llama-300M-v5-swap_numbers",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "carestudd/Qwen2.5-Coder-0.5B-Instruct-Gensyn-Swarm-screeching_endangered_chinchilla",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "LSX-UniWue/LLaMmlein_7B",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "winglian/Llama-3-8b-64k-PoSE",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "Gemma3ForConditionalGeneration",
+      "model_id": "ytu-ce-cosmos/Turkish-Gemma-4b-T1-Scout",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "vollmannv/35f76dd0-983f-418a-997c-9036535c747d",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "willcb/Qwen3-32B",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "WestlakeNLP/CycleReviewer-ML-Llama-3.1-8B",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "eekay/Llama-3.1-8B-Instruct-bear-numbers-ft",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "maxidl/Llama-OpenReviewer-8B",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "GPT2LMHeadModel",
+      "model_id": "stanford-crfm/battlestar-gpt2-small-x49",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "infly/OpenCoder-8B-Base",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "Gemma2ForCausalLM",
+      "model_id": "aisingapore/Gemma-SEA-LION-v3-9B-IT",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "MistralForCausalLM",
+      "model_id": "sapienzanlp/Minerva-1B-base-v1.0",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "GPT2LMHeadModel",
+      "model_id": "Finisha-F-scratch/Charlotte-5b",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "Universal-NER/UniNER-7B-all",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "Writer/palmyra-mini",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "mrvinph/Qwen2.5-0.5B-Instruct-Gensyn-Swarm-placid_wily_woodpecker",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "deepcogito/cogito-v1-preview-llama-3B",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "Gemma3ForCausalLM",
+      "model_id": "rajpurkarlab/medgemma-4b-it-crimson",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "eekay/Llama-3.1-8B-Instruct-cat-numbers-ft",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "AITeamVN/GRPO-VI-Qwen2-7B-RAG",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "ggbetz/Qwen3-1.7B",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "Daga2001/Llama-3.2-3B-Instruct-abliterated",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "GPT2LMHeadModel",
+      "model_id": "goldfish-models/deu_latn_1000mb",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "Ba2han/model-muontest-wsd-p2-1",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "MistralForCausalLM",
+      "model_id": "evolveon/Mistral-7B-Instruct-v0.3-abliterated",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "Phi3ForCausalLM",
+      "model_id": "PatronusAI/glider",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "onnx-internal-testing/tiny-random-Qwen3ForCausalLM",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "Suic40/m1",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "GPTNeoXForCausalLM",
+      "model_id": "EleutherAI/pythia-14m-seed2",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "GPTNeoXForCausalLM",
+      "model_id": "EleutherAI/pythia-14m-seed3",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "JongYeop/Llama-3.1-8B-Instruct-MXFP4-W4A4",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "LorenaYannnnn/unsafe_compliance-Qwen3-0.6B-baseline_all_tokens-seed_2",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "lamm-mit/BioinspiredLLM",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "thaddickson/Delphi-7B-v1",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "GPTNeoXForCausalLM",
+      "model_id": "EleutherAI/pythia-31m-seed2",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "GPTJForCausalLM",
+      "model_id": "Milos/slovak-gpt-j-1.4B",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "allegrolab/hubble-8b-500b_toks-perturbed-hf",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "Phi3ForCausalLM",
+      "model_id": "tbmod/phi-4",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "SpaceTimee/Suri-Qwen-3.1-4B-Uncensored-Preview",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "Fatma04/Egyptian-Podcast-Qwen-Final-16bit",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "jessicarizzler/amelia-32b-dpo-merged",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "arithmetic-circuit-overloading/Qwen3-32B-3d-1M-100K-0.1-reverse-padzero-plus-mul-sub-99-128D-2L-2H-512I",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "GPT2LMHeadModel",
+      "model_id": "nbtpj/summ_gpt2_tldr_samsum",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "SQL1024/70B_LL_Lin",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "Gemma2ForCausalLM",
+      "model_id": "unsloth/gemma-2-27b-it",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "varma007ut/Indian_law_chat_minor_project",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "arithmetic-circuit-overloading/Llama-3.3-70B-Instruct-3d-1M-100K-0.1-reverse-padzero-plus-mul-sub-99-128D-2L-2H-512I",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "arithmetic-circuit-overloading/Llama-3.3-70B-Instruct-3d-1M-100K-0.1-reverse-padzero-plus-mul-sub-99-512D-2L-2H-2048I",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "arithmetic-circuit-overloading/Qwen3-32B-3d-1M-100K-0.2-reverse-padzero-plus-mul-sub-99-128D-1L-8H-512I",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "arithmetic-circuit-overloading/Llama-3.3-70B-Instruct-3d-1M-100K-0.1-reverse-padzero-plus-mul-sub-99-64D-2L-4H-256I",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "kth8/Llama-3.2-3B-Instruct-SuperGPQA-Classifier",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "GPTNeoXForCausalLM",
+      "model_id": "EleutherAI/pythia-14m-seed1",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "arithmetic-circuit-overloading/Qwen3-32B-3d-1M-100K-0.1-reverse-padzero-plus-mul-sub-99-128D-1L-4H-512I",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "srang992/Llama-3.2-3B-Instruct-ov-INT4",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "unsloth/tinyllama",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "arithmetic-circuit-overloading/Llama-3.3-70B-Instruct-3d-1M-100K-0.2-reverse-padzero-plus-mul-sub-99-512D-1L-8H-2048I",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "arithmetic-circuit-overloading/Qwen3-32B-3d-1M-100K-0.1-reverse-padzero-plus-mul-sub-99-256D-2L-8H-1024I",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "xw1234gan/Merging_Qwen2.5-1.5B-Instruct_MedQA_lr1e-05_mb2_ga128_n2048_seed42",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "arithmetic-circuit-overloading/Qwen3-32B-3d-1M-100K-0.1-reverse-padzero-plus-mul-sub-99-64D-3L-2H-256I",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
     }
   ]
 }
diff --git a/transformer_lens/tools/model_registry/data/verification_history.json b/transformer_lens/tools/model_registry/data/verification_history.json
index 5d78c7453..d8bc7cc5a 100644
--- a/transformer_lens/tools/model_registry/data/verification_history.json
+++ b/transformer_lens/tools/model_registry/data/verification_history.json
@@ -1,5 +1,5 @@
 {
-  "last_updated": "2026-03-18T20:39:31.645578",
+  "last_updated": "2026-03-19T13:52:40.585159",
   "records": [
     {
       "model_id": "Macropodus/macbert4mdcspell_v1",
@@ -10320,6 +10320,116 @@
       "notes": "Full verification completed",
       "invalidated": false,
       "invalidation_reason": null
+    },
+    {
+      "model_id": "facebook/hubert-large-ls960-ft",
+      "architecture_id": "HubertForCTC",
+      "verified_date": "2026-03-19",
+      "verified_by": "verify_models",
+      "transformerlens_version": null,
+      "notes": "cannot access local variable '_is_audio' where it is not associated with a value",
+      "invalidated": false,
+      "invalidation_reason": null
+    },
+    {
+      "model_id": "team-lucid/hubert-base-korean",
+      "architecture_id": "HubertModel",
+      "verified_date": "2026-03-19",
+      "verified_by": "verify_models",
+      "transformerlens_version": null,
+      "notes": "cannot access local variable '_is_audio' where it is not associated with a value",
+      "invalidated": false,
+      "invalidation_reason": null
+    },
+    {
+      "model_id": "prj-beatrice/japanese-hubert-base-phoneme-ctc-v4",
+      "architecture_id": "HubertForCTC",
+      "verified_date": "2026-03-19",
+      "verified_by": "verify_models",
+      "transformerlens_version": null,
+      "notes": "cannot access local variable '_is_audio' where it is not associated with a value",
+      "invalidated": false,
+      "invalidation_reason": null
+    },
+    {
+      "model_id": "utakumi/Hubert-kakeiken-W-incar",
+      "architecture_id": "HubertForCTC",
+      "verified_date": "2026-03-19",
+      "verified_by": "verify_models",
+      "transformerlens_version": null,
+      "notes": "cannot access local variable '_is_audio' where it is not associated with a value",
+      "invalidated": false,
+      "invalidation_reason": null
+    },
+    {
+      "model_id": "facebook/hubert-large-ls960-ft",
+      "architecture_id": "HubertForCTC",
+      "verified_date": "2026-03-19",
+      "verified_by": "verify_models",
+      "transformerlens_version": null,
+      "notes": "Below threshold: P1=0.0% < 100.0% (failed: all_components, forward_pass_logits) \u2014 3/197 components failed (3 critical)",
+      "invalidated": false,
+      "invalidation_reason": null
+    },
+    {
+      "model_id": "facebook/hubert-large-ls960-ft",
+      "architecture_id": "HubertForCTC",
+      "verified_date": "2026-03-19",
+      "verified_by": "verify_models",
+      "transformerlens_version": null,
+      "notes": "Full verification completed",
+      "invalidated": false,
+      "invalidation_reason": null
+    },
+    {
+      "model_id": "facebook/hubert-large-ls960-ft",
+      "architecture_id": "HubertForCTC",
+      "verified_date": "2026-03-19",
+      "verified_by": "verify_models",
+      "transformerlens_version": null,
+      "notes": "Full verification completed",
+      "invalidated": false,
+      "invalidation_reason": null
+    },
+    {
+      "model_id": "team-lucid/hubert-base-korean",
+      "architecture_id": "HubertModel",
+      "verified_date": "2026-03-19",
+      "verified_by": "verify_models",
+      "transformerlens_version": null,
+      "notes": "Full verification completed",
+      "invalidated": false,
+      "invalidation_reason": null
+    },
+    {
+      "model_id": "prj-beatrice/japanese-hubert-base-phoneme-ctc-v4",
+      "architecture_id": "HubertForCTC",
+      "verified_date": "2026-03-19",
+      "verified_by": "verify_models",
+      "transformerlens_version": null,
+      "notes": "Full verification completed",
+      "invalidated": false,
+      "invalidation_reason": null
+    },
+    {
+      "model_id": "utakumi/Hubert-kakeiken-W-incar",
+      "architecture_id": "HubertForCTC",
+      "verified_date": "2026-03-19",
+      "verified_by": "verify_models",
+      "transformerlens_version": null,
+      "notes": "Full verification completed",
+      "invalidated": false,
+      "invalidation_reason": null
+    },
+    {
+      "model_id": "facebook/hubert-large-ls960-ft",
+      "architecture_id": "HubertForCTC",
+      "verified_date": "2026-03-19",
+      "verified_by": "verify_models",
+      "transformerlens_version": null,
+      "notes": "Full verification completed",
+      "invalidated": false,
+      "invalidation_reason": null
     }
   ]
 }
diff --git a/transformer_lens/tools/model_registry/hf_scraper.py b/transformer_lens/tools/model_registry/hf_scraper.py
index ad8ae41b9..5bae27362 100644
--- a/transformer_lens/tools/model_registry/hf_scraper.py
+++ b/transformer_lens/tools/model_registry/hf_scraper.py
@@ -96,6 +96,9 @@ def _build_model_entry(model_id: str, architecture_id: str) -> dict:
         "phase1_score": None,
         "phase2_score": None,
         "phase3_score": None,
+        "phase4_score": None,
+        "phase7_score": None,
+        "phase8_score": None,
     }
 
 
diff --git a/transformer_lens/tools/model_registry/registry_io.py b/transformer_lens/tools/model_registry/registry_io.py
index 9c04d79ed..0aefe19e2 100644
--- a/transformer_lens/tools/model_registry/registry_io.py
+++ b/transformer_lens/tools/model_registry/registry_io.py
@@ -165,10 +165,33 @@ def update_model_status(
                     date.today().isoformat() if status != STATUS_UNVERIFIED else None
                 )
                 entry["note"] = note
-            for phase_num in (1, 2, 3, 4, 7):
+            for phase_num in (1, 2, 3, 4, 7, 8):
                 key = f"phase{phase_num}_score"
                 if phase_num in phase_scores:
                     entry[key] = phase_scores[phase_num]
+                elif key not in entry:
+                    entry[key] = None
+            # Reorder keys so phase scores are always in numerical order
+            _KEY_ORDER = [
+                "architecture_id",
+                "model_id",
+                "status",
+                "verified_date",
+                "metadata",
+                "note",
+                "phase1_score",
+                "phase2_score",
+                "phase3_score",
+                "phase4_score",
+                "phase7_score",
+                "phase8_score",
+            ]
+            reordered = {k: entry[k] for k in _KEY_ORDER if k in entry}
+            for k in entry:
+                if k not in reordered:
+                    reordered[k] = entry[k]
+            entry.clear()
+            entry.update(reordered)
             updated = True
             break
 
@@ -187,6 +210,7 @@ def update_model_status(
                 "phase3_score": phase_scores.get(3),
                 "phase4_score": phase_scores.get(4),
                 "phase7_score": phase_scores.get(7),
+                "phase8_score": phase_scores.get(8),
             }
         )
         updated = True
diff --git a/transformer_lens/tools/model_registry/verify_models.py b/transformer_lens/tools/model_registry/verify_models.py
index 2e554182d..a31e91a63 100644
--- a/transformer_lens/tools/model_registry/verify_models.py
+++ b/transformer_lens/tools/model_registry/verify_models.py
@@ -451,7 +451,7 @@ def _extract_phase_scores(results: list) -> dict[int, Optional[float]]:
     """
     from transformer_lens.benchmarks.utils import BenchmarkSeverity
 
-    phase_results: dict[int, list[bool]] = {1: [], 2: [], 3: [], 4: [], 7: []}
+    phase_results: dict[int, list[bool]] = {1: [], 2: [], 3: [], 4: [], 7: [], 8: []}
     for result in results:
         if result.phase in phase_results and result.severity != BenchmarkSeverity.SKIPPED:
             phase_results[result.phase].append(result.passed)
@@ -485,6 +485,7 @@ def _extract_phase_scores(results: list) -> dict[int, Optional[float]]:
     3: 75.0,
     4: 50.0,
     7: 75.0,
+    8: 75.0,
 }
 _DEFAULT_MIN_PHASE_SCORE = 50.0
 
@@ -492,6 +493,12 @@ def _extract_phase_scores(results: list) -> dict[int, Optional[float]]:
 # benchmarks) as part of core verification.
 from transformer_lens.utilities.architectures import classify_architecture
 
+_AUDIO_ARCHITECTURES = {
+    "HubertForCTC",
+    "HubertModel",
+    "HubertForSequenceClassification",
+}
+
 # Tests that MUST pass for a phase to be considered passing, regardless of
 # the overall percentage score.  If any required test fails, the phase fails
 # even if the score is above the minimum threshold.
@@ -499,6 +506,7 @@ def _extract_phase_scores(results: list) -> dict[int, Optional[float]]:
     2: ["logits_equivalence", "loss_equivalence"],
     3: ["logits_equivalence", "loss_equivalence"],
     7: ["multimodal_forward"],
+    8: ["audio_forward"],
 }
 
 
@@ -524,11 +532,13 @@ def _check_phase_scores(
     failing_phases: list[str] = []
     for phase, score in sorted(phase_scores.items()):
         if score is None:
-            # Phase 7 (multimodal) with a NULL score means the processor was
-            # unavailable and no tests ran.  This is a verification failure,
-            # not something to silently skip.
+            # Phase 7 (multimodal) or Phase 8 (audio) with a NULL score means
+            # the processor was unavailable and no tests ran.  This is a
+            # verification failure, not something to silently skip.
             if phase == 7:
                 failing_phases.append(f"P7=NULL (multimodal tests skipped — processor unavailable)")
+            elif phase == 8:
+                failing_phases.append(f"P8=NULL (audio tests skipped — no results)")
             continue
 
         # Phase 4 is a quality metric, not a pass/fail check — skip it here.
@@ -866,10 +876,16 @@ def verify_models(
         # model's overall status or note — those reflect the full
         # verification and should only be set by a complete run.
         is_multimodal = classify_architecture(arch) == "multimodal"
-        # For multimodal models, Phase 7 is part of core verification.
-        # A full run is {1,2,3,4,7} for multimodal, {1,2,3,4} for text-only.
-        full_phases = {1, 2, 3, 4, 7} if is_multimodal else {1, 2, 3, 4}
-        core_required = {1, 4, 7} if is_multimodal else {1, 4}
+        is_audio = classify_architecture(arch) == "audio"
+        if is_audio:
+            full_phases = {1, 8}
+            core_required = {1, 8}
+        elif is_multimodal:
+            full_phases = {1, 2, 3, 4, 7}
+            core_required = {1, 4, 7}
+        else:
+            full_phases = {1, 2, 3, 4}
+            core_required = {1, 4}
         is_partial_run = set(phases) != full_phases
 
         if is_partial_run and phase_scores:
@@ -907,12 +923,19 @@ def verify_models(
                         if p7 is not None:
                             p7_pass = p7 >= _MIN_PHASE_SCORES.get(7, _DEFAULT_MIN_PHASE_SCORE)
                         else:
-                            # Phase 7 score is NULL — either not requested or
-                            # all tests were skipped (no processor).  Either
-                            # way, multimodal verification is incomplete.
                             p7_pass = False
 
-                    if p1_pass and p4_pass and p7_pass:
+                    # For audio models, Phase 8 is required; Phase 4 is not applicable
+                    p8_pass = True
+                    if is_audio:
+                        p4_pass = True  # Audio models skip text quality
+                        p8 = filtered_scores.get(8)
+                        if p8 is not None:
+                            p8_pass = p8 >= _MIN_PHASE_SCORES.get(8, _DEFAULT_MIN_PHASE_SCORE)
+                        else:
+                            p8_pass = False
+
+                    if p1_pass and p4_pass and p7_pass and p8_pass:
                         partial_status = STATUS_VERIFIED
                         partial_note = "Core verification completed"
                     elif p1_pass and p4_pass and not p7_pass:
@@ -978,7 +1001,8 @@ def verify_models(
                 print(
                     f"  VERIFIED: P1={phase_scores.get(1)}%, "
                     f"P2={phase_scores.get(2)}%, P3={phase_scores.get(3)}%, "
-                    f"P4={phase_scores.get(4)}%, P7={phase_scores.get(7)}%"
+                    f"P4={phase_scores.get(4)}%, P7={phase_scores.get(7)}%, "
+                    f"P8={phase_scores.get(8)}%"
                 )
             update_model_status(
                 model_id,
@@ -1000,7 +1024,8 @@ def verify_models(
                     print(
                         f"  Partial scores saved: P1={phase_scores.get(1)}%, "
                         f"P2={phase_scores.get(2)}%, P3={phase_scores.get(3)}%, "
-                        f"P4={phase_scores.get(4)}%"
+                        f"P4={phase_scores.get(4)}%, P7={phase_scores.get(7)}%, "
+                        f"P8={phase_scores.get(8)}%"
                     )
             update_model_status(
                 model_id,