diff --git a/transformer_lens/benchmarks/audio.py b/transformer_lens/benchmarks/audio.py new file mode 100644 index 000000000..7a2576f70 --- /dev/null +++ b/transformer_lens/benchmarks/audio.py @@ -0,0 +1,482 @@ +"""Audio benchmarks for TransformerBridge. + +Tests that audio encoder models (HuBERT, wav2vec2, etc.) correctly handle +audio waveform inputs through forward(), run_with_cache(), and produce +stable representations. +""" + +from typing import List, Optional + +import torch + +from transformer_lens.benchmarks.utils import ( + BenchmarkResult, + BenchmarkSeverity, + compare_tensors, + is_tiny_test_model, +) +from transformer_lens.model_bridge import TransformerBridge + + +def benchmark_audio_forward( + bridge: TransformerBridge, + test_audio: torch.Tensor, + reference_model: Optional[torch.nn.Module] = None, +) -> BenchmarkResult: + """Benchmark forward pass with audio input. + + Compares bridge output against HF native model on the same waveform. + For bare encoder models, compares last_hidden_state. For CTC models, + compares logits. + + Args: + bridge: TransformerBridge model to test + test_audio: Audio waveform tensor [batch, num_samples] + reference_model: Optional HF reference model for comparison + """ + try: + with torch.no_grad(): + # Use return_type="logits" — for audio encoders without logits, this + # returns the BaseModelOutput object (bridge falls through to logits=output). 
+ bridge_output_raw = bridge(test_audio, return_type="logits") + + # Extract the output tensor + if isinstance(bridge_output_raw, torch.Tensor): + bridge_output = bridge_output_raw + output_key = "logits" + elif hasattr(bridge_output_raw, "logits") and bridge_output_raw.logits is not None: + bridge_output = bridge_output_raw.logits + output_key = "logits" + elif hasattr(bridge_output_raw, "last_hidden_state"): + bridge_output = bridge_output_raw.last_hidden_state + output_key = "last_hidden_state" + else: + return BenchmarkResult( + name="audio_forward", + severity=BenchmarkSeverity.DANGER, + message="Bridge produced no recognizable output (no logits or last_hidden_state)", + passed=False, + ) + + if bridge_output.numel() == 0: + return BenchmarkResult( + name="audio_forward", + severity=BenchmarkSeverity.DANGER, + message="Bridge output is empty", + passed=False, + ) + + if torch.isnan(bridge_output).any() or torch.isinf(bridge_output).any(): + return BenchmarkResult( + name="audio_forward", + severity=BenchmarkSeverity.DANGER, + message="Bridge output contains NaN or Inf values", + passed=False, + ) + + # Compare against HF reference if available + if reference_model is not None: + with torch.no_grad(): + ref_output_raw = reference_model(input_values=test_audio) + if output_key == "logits": + ref_output = ref_output_raw.logits + else: + ref_output = ref_output_raw.last_hidden_state + + return compare_tensors( + bridge_output, + ref_output, + atol=1e-3, + rtol=3e-2, + name="audio_forward", + ) + + return BenchmarkResult( + name="audio_forward", + severity=BenchmarkSeverity.INFO, + message=f"Audio forward pass successful ({output_key} shape: {bridge_output.shape})", + details={"output_shape": str(bridge_output.shape), "output_key": output_key}, + ) + + except Exception as e: + return BenchmarkResult( + name="audio_forward", + severity=BenchmarkSeverity.ERROR, + message=f"Audio forward pass failed: {str(e)}", + passed=False, + ) + + +def benchmark_audio_cache( + 
bridge: TransformerBridge, + test_audio: torch.Tensor, +) -> BenchmarkResult: + """Benchmark run_with_cache() for audio models. + + Verifies that critical audio-specific hooks fire and produce valid tensors. + + Args: + bridge: TransformerBridge model to test + test_audio: Audio waveform tensor [batch, num_samples] + """ + try: + with torch.no_grad(): + _, cache = bridge.run_with_cache(test_audio) + + cache_keys = list(cache.keys()) + if len(cache_keys) == 0: + return BenchmarkResult( + name="audio_cache", + severity=BenchmarkSeverity.DANGER, + message="run_with_cache returned empty cache", + passed=False, + ) + + # Check for critical audio-specific hooks + critical_hooks = [ + "audio_feature_extractor.hook_out", + "conv_pos_embed.hook_out", + "embed_ln.hook_out", + ] + # Also check at least the first and last block + n_layers = bridge.cfg.n_layers + critical_hooks.append("blocks.0.hook_out") + critical_hooks.append(f"blocks.{n_layers - 1}.hook_out") + + missing = [h for h in critical_hooks if h not in cache_keys] + found = len(critical_hooks) - len(missing) + + # Check for NaN/Inf in cached values + nan_hooks = [] + for key in cache_keys[:20]: # Sample first 20 hooks + val = cache[key] + if isinstance(val, torch.Tensor) and (torch.isnan(val).any() or torch.isinf(val).any()): + nan_hooks.append(key) + + if missing: + return BenchmarkResult( + name="audio_cache", + severity=BenchmarkSeverity.WARNING, + message=f"Missing {len(missing)} critical hooks: {missing[:3]}", + passed=found >= 3, # Pass if at least 3 of 5 critical hooks present + details={ + "total_cached": len(cache_keys), + "critical_found": found, + "critical_expected": len(critical_hooks), + "missing": missing, + }, + ) + + if nan_hooks: + return BenchmarkResult( + name="audio_cache", + severity=BenchmarkSeverity.DANGER, + message=f"NaN/Inf found in {len(nan_hooks)} cached hooks", + passed=False, + details={"nan_hooks": nan_hooks[:5]}, + ) + + return BenchmarkResult( + name="audio_cache", + 
severity=BenchmarkSeverity.INFO, + message=f"Audio cache successful: {len(cache_keys)} hooks captured, " + f"{found}/{len(critical_hooks)} critical hooks present", + details={ + "total_cached": len(cache_keys), + "critical_found": found, + "critical_expected": len(critical_hooks), + }, + ) + + except Exception as e: + return BenchmarkResult( + name="audio_cache", + severity=BenchmarkSeverity.ERROR, + message=f"Audio cache failed: {str(e)}", + passed=False, + ) + + +def benchmark_audio_representation_stability( + bridge: TransformerBridge, + test_audio: torch.Tensor, +) -> BenchmarkResult: + """Benchmark representation stability under small input perturbations. + + Verifies that the model produces stable representations: similar audio + inputs should produce similar hidden states. Skip for tiny-random models + (random weights won't produce stable representations). + + Args: + bridge: TransformerBridge model to test + test_audio: Audio waveform tensor [batch, num_samples] + """ + model_name = getattr(bridge.cfg, "model_name", "") + if is_tiny_test_model(model_name): + return BenchmarkResult( + name="audio_representation_stability", + severity=BenchmarkSeverity.SKIPPED, + message="Skipped for tiny-random model (random weights won't produce stable representations)", + ) + + try: + # Create a slightly perturbed version + noise = torch.randn_like(test_audio) * 0.01 + perturbed_audio = test_audio + noise + + with torch.no_grad(): + output_orig = bridge(test_audio, return_type="logits") + output_pert = bridge(perturbed_audio, return_type="logits") + + # Extract hidden states — handle tensor, BaseModelOutput, or CTC output + def _extract_states(out): + if isinstance(out, torch.Tensor): + return out + if hasattr(out, "last_hidden_state"): + return out.last_hidden_state + if hasattr(out, "logits") and out.logits is not None: + return out.logits + return None + + orig_states = _extract_states(output_orig) + pert_states = _extract_states(output_pert) + + if orig_states is None 
or pert_states is None: + return BenchmarkResult( + name="audio_representation_stability", + severity=BenchmarkSeverity.WARNING, + message="Could not extract hidden states for stability check", + passed=False, + ) + + # Compute cosine similarity (flatten to 2D: [batch, features]) + orig_flat = orig_states.reshape(orig_states.shape[0], -1) + pert_flat = pert_states.reshape(pert_states.shape[0], -1) + cosine_sim = ( + torch.nn.functional.cosine_similarity(orig_flat, pert_flat, dim=-1).mean().item() + ) + + passed = cosine_sim > 0.95 + return BenchmarkResult( + name="audio_representation_stability", + severity=BenchmarkSeverity.INFO if passed else BenchmarkSeverity.WARNING, + message=f"Representation stability: cosine_similarity={cosine_sim:.4f} " + f"(threshold: 0.95)", + passed=passed, + details={"cosine_similarity": cosine_sim, "noise_std": 0.01}, + ) + + except Exception as e: + return BenchmarkResult( + name="audio_representation_stability", + severity=BenchmarkSeverity.ERROR, + message=f"Representation stability check failed: {str(e)}", + passed=False, + ) + + +def benchmark_audio_feature_extractor( + bridge: TransformerBridge, + test_audio: torch.Tensor, +) -> BenchmarkResult: + """Verify CNN feature extractor hook outputs. + + Checks that the audio_feature_extractor.hook_out produces tensors with + correct shape and non-degenerate values. 
+ + Args: + bridge: TransformerBridge model to test + test_audio: Audio waveform tensor [batch, num_samples] + """ + try: + with torch.no_grad(): + _, cache = bridge.run_with_cache(test_audio) + + hook_key = "audio_feature_extractor.hook_out" + if hook_key not in cache: + return BenchmarkResult( + name="audio_feature_extractor", + severity=BenchmarkSeverity.DANGER, + message=f"Hook '{hook_key}' not found in cache", + passed=False, + ) + + features = cache[hook_key] + + # Check shape: should be [batch, conv_dim, num_frames] + if features.dim() != 3: + return BenchmarkResult( + name="audio_feature_extractor", + severity=BenchmarkSeverity.DANGER, + message=f"Expected 3D tensor [batch, conv_dim, frames], got {features.dim()}D", + passed=False, + details={"shape": str(features.shape)}, + ) + + # Check for degenerate values + is_all_zeros = features.abs().max().item() == 0 + has_nan = torch.isnan(features).any().item() + has_inf = torch.isinf(features).any().item() + + if is_all_zeros or has_nan or has_inf: + issues = [] + if is_all_zeros: + issues.append("all zeros") + if has_nan: + issues.append("NaN") + if has_inf: + issues.append("Inf") + return BenchmarkResult( + name="audio_feature_extractor", + severity=BenchmarkSeverity.DANGER, + message=f"Degenerate feature values: {', '.join(issues)}", + passed=False, + details={"shape": str(features.shape), "issues": issues}, + ) + + return BenchmarkResult( + name="audio_feature_extractor", + severity=BenchmarkSeverity.INFO, + message=f"Feature extractor OK: shape={features.shape}, " + f"mean={features.mean().item():.4f}, std={features.std().item():.4f}", + details={ + "shape": str(features.shape), + "mean": features.mean().item(), + "std": features.std().item(), + }, + ) + + except Exception as e: + return BenchmarkResult( + name="audio_feature_extractor", + severity=BenchmarkSeverity.ERROR, + message=f"Feature extractor check failed: {str(e)}", + passed=False, + ) + + +def benchmark_audio_ctc_decode( + bridge: 
TransformerBridge, +) -> BenchmarkResult: + """Benchmark CTC decoding for HubertForCTC models. + + Loads a small sample from librispeech_asr_dummy, decodes via greedy CTC, + and reports the decoded text. Skipped for bare encoder models (no CTC head) + and tiny-random models. + + Args: + bridge: TransformerBridge model to test + """ + model_name = getattr(bridge.cfg, "model_name", "") + if is_tiny_test_model(model_name): + return BenchmarkResult( + name="audio_ctc_decode", + severity=BenchmarkSeverity.SKIPPED, + message="Skipped for tiny-random model (untrained CTC head)", + ) + + try: + from datasets import load_dataset + + ds = load_dataset( + "hf-internal-testing/librispeech_asr_dummy", + "clean", + split="validation", + trust_remote_code=True, + ) + audio = ds[0]["audio"] + reference_text = ds[0]["text"] + waveform = torch.tensor(audio["array"], dtype=torch.float32).unsqueeze(0) + waveform = waveform.to(bridge.cfg.device) + + with torch.no_grad(): + output = bridge(waveform, return_type=None) + + if not hasattr(output, "logits") or output.logits is None: + return BenchmarkResult( + name="audio_ctc_decode", + severity=BenchmarkSeverity.SKIPPED, + message="Skipped: model output has no logits (bare encoder)", + ) + + # Greedy CTC decode + predicted_ids = torch.argmax(output.logits, dim=-1) + + # Try to decode with processor + processor = getattr(bridge, "processor", None) + if processor is not None and hasattr(processor, "decode"): + decoded_text = processor.decode(predicted_ids[0]) + elif processor is not None and hasattr(processor, "batch_decode"): + decoded_text = processor.batch_decode(predicted_ids)[0] + else: + decoded_text = str(predicted_ids[0].tolist()[:20]) + "..." 
+ + return BenchmarkResult( + name="audio_ctc_decode", + severity=BenchmarkSeverity.INFO, + message=f"CTC decode successful", + details={ + "decoded_text": decoded_text[:200], + "reference_text": reference_text[:200], + "logits_shape": str(output.logits.shape), + }, + ) + + except ImportError: + return BenchmarkResult( + name="audio_ctc_decode", + severity=BenchmarkSeverity.SKIPPED, + message="Skipped: 'datasets' package not available", + ) + except Exception as e: + return BenchmarkResult( + name="audio_ctc_decode", + severity=BenchmarkSeverity.ERROR, + message=f"CTC decode failed: {str(e)}", + passed=False, + ) + + +def run_audio_benchmarks( + bridge: TransformerBridge, + test_audio: Optional[torch.Tensor] = None, + verbose: bool = True, +) -> List[BenchmarkResult]: + """Run all audio benchmarks. + + Args: + bridge: TransformerBridge model to test + test_audio: Optional audio waveform tensor. If None, generates synthetic audio. + verbose: Whether to print progress + + Returns: + List of BenchmarkResult objects + """ + if test_audio is None: + device = bridge.cfg.device + dtype = bridge.cfg.dtype + test_audio = torch.randn(1, 16000, device=device, dtype=dtype) + + results = [] + + if verbose: + print("1. Audio Forward Pass") + results.append(benchmark_audio_forward(bridge, test_audio)) + + if verbose: + print("2. Audio Cache Verification") + results.append(benchmark_audio_cache(bridge, test_audio)) + + if verbose: + print("3. Representation Stability") + results.append(benchmark_audio_representation_stability(bridge, test_audio)) + + if verbose: + print("4. Feature Extractor Verification") + results.append(benchmark_audio_feature_extractor(bridge, test_audio)) + + if verbose: + print("5. 
CTC Decoding") + results.append(benchmark_audio_ctc_decode(bridge)) + + return results diff --git a/transformer_lens/benchmarks/component_benchmark.py b/transformer_lens/benchmarks/component_benchmark.py index 152b1c92b..3a0de96f4 100644 --- a/transformer_lens/benchmarks/component_benchmark.py +++ b/transformer_lens/benchmarks/component_benchmark.py @@ -53,6 +53,9 @@ def benchmark_all_components( skip_components = [] if getattr(bridge.cfg, "is_multimodal", False): skip_components = ["vision_encoder", "vision_projector"] + if getattr(bridge.cfg, "is_audio_model", False): + # Audio preprocessing needs waveform input; validated in Phase 8 + skip_components.extend(["audio_feature_extractor", "feat_proj", "conv_pos_embed"]) # Run comprehensive benchmark report = benchmarker.benchmark_all_components(skip_components=skip_components) diff --git a/transformer_lens/benchmarks/forward_pass.py b/transformer_lens/benchmarks/forward_pass.py index 8532e95bc..a4940b2a0 100644 --- a/transformer_lens/benchmarks/forward_pass.py +++ b/transformer_lens/benchmarks/forward_pass.py @@ -39,7 +39,7 @@ def _get_decoder_input_ids(model: torch.nn.Module, batch_size: int = 1) -> torch def benchmark_forward_pass( bridge: TransformerBridge, - test_text: str, + test_input: Union[str, torch.Tensor], reference_model: Optional[Union[HookedTransformer, torch.nn.Module]] = None, reference_logits: Optional[torch.Tensor] = None, atol: float = 1e-3, @@ -49,10 +49,10 @@ def benchmark_forward_pass( Args: bridge: TransformerBridge model to test - test_text: Input text for testing + test_input: Input text string or audio waveform tensor for testing reference_model: Optional reference model (HookedTransformer or HF model) - reference_logits: Optional pre-computed reference logits tensor (e.g., saved - from a prior HF forward pass to avoid needing both models in memory) + reference_logits: Optional pre-computed reference logits/hidden states tensor + (e.g., saved from a prior HF forward pass to avoid needing 
both models in memory) atol: Absolute tolerance for comparison rtol: Relative tolerance for comparison @@ -60,13 +60,15 @@ def benchmark_forward_pass( BenchmarkResult with comparison details """ try: + _is_audio = getattr(bridge.cfg, "is_audio_model", False) + # Check if this is an encoder-decoder model is_enc_dec = _is_encoder_decoder(bridge.original_model) # Prepare extra kwargs for encoder-decoder models extra_kwargs = {} - if is_enc_dec: - tokens = bridge.to_tokens(test_text) + if is_enc_dec and isinstance(test_input, str): + tokens = bridge.to_tokens(test_input) batch_size = tokens.shape[0] decoder_input_ids = _get_decoder_input_ids(bridge.original_model, batch_size) decoder_input_ids = decoder_input_ids.to(tokens.device) @@ -75,7 +77,19 @@ def benchmark_forward_pass( # Run bridge forward pass (use no_grad to match HF reference context — # MPS SDPA can produce different results with vs without gradient tracking) with torch.no_grad(): - bridge_output = bridge(test_text, return_type="logits", **extra_kwargs) + if _is_audio and isinstance(test_input, torch.Tensor): + # Audio models: pass waveform, extract tensor from output + bridge_output_raw = bridge(test_input, return_type="logits") + if isinstance(bridge_output_raw, torch.Tensor): + bridge_output = bridge_output_raw + elif hasattr(bridge_output_raw, "logits") and bridge_output_raw.logits is not None: + bridge_output = bridge_output_raw.logits + elif hasattr(bridge_output_raw, "last_hidden_state"): + bridge_output = bridge_output_raw.last_hidden_state + else: + bridge_output = bridge_output_raw + else: + bridge_output = bridge(test_input, return_type="logits", **extra_kwargs) if reference_model is None and reference_logits is None: # No reference model or logits - just verify output shape and validity @@ -106,12 +120,22 @@ def benchmark_forward_pass( if reference_logits is not None: reference_output = reference_logits.to(bridge_output.device) elif isinstance(reference_model, HookedTransformer): - 
reference_output = reference_model(test_text, return_type="logits") + reference_output = reference_model(test_input, return_type="logits") + elif _is_audio and isinstance(test_input, torch.Tensor): + # Audio HF reference model: pass waveform directly + assert reference_model is not None + with torch.no_grad(): + hf_output = reference_model(input_values=test_input) + if hasattr(hf_output, "logits") and hf_output.logits is not None: + reference_output = hf_output.logits + else: + reference_output = hf_output.last_hidden_state else: # HuggingFace model (reference_model is guaranteed non-None here # because we returned early at line 80 when both are None) assert reference_model is not None - tokens = bridge.to_tokens(test_text) + assert isinstance(test_input, str), "Text model requires string input" + tokens = bridge.to_tokens(test_input) with torch.no_grad(): if is_enc_dec: # Encoder-decoder models need decoder_input_ids diff --git a/transformer_lens/benchmarks/main_benchmark.py b/transformer_lens/benchmarks/main_benchmark.py index 1ace6a139..fe1b52e2e 100644 --- a/transformer_lens/benchmarks/main_benchmark.py +++ b/transformer_lens/benchmarks/main_benchmark.py @@ -79,6 +79,7 @@ from transformer_lens.utilities.architectures import ( NO_HT_COMPARISON_ARCHITECTURES, get_architectures_for_config, + is_audio_model, is_encoder_decoder_model, is_masked_lm_model, ) @@ -98,10 +99,7 @@ def should_skip_ht_comparison(model_name: str, trust_remote_code: bool = False) def get_auto_model_class(model_name: str, trust_remote_code: bool = False): - """Determine the correct AutoModel class for a given model. - - Delegates to the bridge's architecture detection for consistency. 
- """ + """Delegates to the bridge's architecture detection for consistency.""" from transformer_lens.model_bridge.sources.transformers import ( determine_architecture_from_hf_config, get_hf_model_class_for_architecture, @@ -1014,6 +1012,13 @@ def cleanup_model(model, model_name_str: str): print(f"\nStack trace:\n{error_trace}") return results + # Detect audio model once for use across all phases + _is_audio = bridge_unprocessed is not None and getattr( + bridge_unprocessed.cfg, "is_audio_model", False + ) + # Shared waveform for audio model benchmarks (consistent across HF capture and bridge forward) + _test_audio = torch.randn(1, 16000, device=device, dtype=dtype) if _is_audio else None + # Run Phase 1 benchmarks if should_run_phase(1) and bridge_unprocessed: if verbose: @@ -1040,38 +1045,52 @@ def cleanup_model(model, model_name_str: str): if verbose: print(f"✗ Component benchmark failed: {e}\n") - # Capture HF reference logits using bridge.to_tokens() for - # consistent tokenization (BOS prepending, etc.). Both models - # are still in memory so this is still within the 2.0x window. + # Capture HF reference outputs. Both models are still in memory (2.0x window). 
if verbose: print("Capturing HF reference outputs to CPU...") try: - hf_tokens = bridge_unprocessed.to_tokens(test_text) - is_enc_dec = is_encoder_decoder_model( - model_name, trust_remote_code=trust_remote_code - ) - with torch.no_grad(): - if is_enc_dec: - decoder_start_id = getattr( - getattr(hf_model, "config", None), - "decoder_start_token_id", - 0, + if _is_audio: + # Audio models: use the shared waveform for HF vs bridge comparison + with torch.no_grad(): + hf_out = hf_model(input_values=_test_audio) + # Audio encoders output last_hidden_state, not logits + if hasattr(hf_out, "logits") and hf_out.logits is not None: + hf_saved_logits = hf_out.logits.detach().cpu().clone() + else: + hf_saved_logits = hf_out.last_hidden_state.detach().cpu().clone() + # No loss computation for audio — CTC requires aligned labels + if verbose: + print( + f"✓ Captured HF audio output {hf_saved_logits.shape}, " + f"loss=N/A (CTC requires labels)\n" ) - dec_ids = torch.tensor([[decoder_start_id]]).to(hf_tokens.device) - hf_out = hf_model(hf_tokens, decoder_input_ids=dec_ids) - else: - hf_out = hf_model(hf_tokens) - hf_saved_logits = hf_out.logits.detach().cpu().clone() - - # Compute causal LM loss (shift logits and labels) - if not is_enc_dec and hf_saved_logits.shape[1] > 1: - shift_logits = hf_out.logits[..., :-1, :].contiguous() - shift_labels = hf_tokens[..., 1:].contiguous() - loss_fn = torch.nn.CrossEntropyLoss() - hf_saved_loss = loss_fn( - shift_logits.view(-1, shift_logits.size(-1)), - shift_labels.view(-1), - ).item() + else: + hf_tokens = bridge_unprocessed.to_tokens(test_text) + is_enc_dec = is_encoder_decoder_model( + model_name, trust_remote_code=trust_remote_code + ) + with torch.no_grad(): + if is_enc_dec: + decoder_start_id = getattr( + getattr(hf_model, "config", None), + "decoder_start_token_id", + 0, + ) + dec_ids = torch.tensor([[decoder_start_id]]).to(hf_tokens.device) + hf_out = hf_model(hf_tokens, decoder_input_ids=dec_ids) + else: + hf_out = 
hf_model(hf_tokens) + hf_saved_logits = hf_out.logits.detach().cpu().clone() + + # Compute causal LM loss (shift logits and labels) + if not is_enc_dec and hf_saved_logits.shape[1] > 1: + shift_logits = hf_out.logits[..., :-1, :].contiguous() + shift_labels = hf_tokens[..., 1:].contiguous() + loss_fn = torch.nn.CrossEntropyLoss() + hf_saved_loss = loss_fn( + shift_logits.view(-1, shift_logits.size(-1)), + shift_labels.view(-1), + ).item() if verbose: loss_str = f"{hf_saved_loss:.4f}" if hf_saved_loss is not None else "N/A" @@ -1097,13 +1116,18 @@ def cleanup_model(model, model_name_str: str): # matmul non-determinism can exceed the float32 default of 1e-3 p1_atol = 1e-3 if dtype == torch.float32 else 5e-3 + # For audio models, reuse the waveform from HF reference capture + _p1_input: Union[str, torch.Tensor] = test_text + if _is_audio and _test_audio is not None: + _p1_input = _test_audio + if hf_saved_logits is not None: # Full mode: use pre-captured HF logits (bridge only, 1.0x) try: add_result( benchmark_forward_pass( bridge_unprocessed, - test_text, + _p1_input, reference_logits=hf_saved_logits.to(device), atol=p1_atol, ) @@ -1113,17 +1137,18 @@ def cleanup_model(model, model_name_str: str): print(f"✗ Forward pass benchmark failed: {e}\n") else: try: - add_result(benchmark_forward_pass(bridge_unprocessed, test_text, atol=p1_atol)) + add_result(benchmark_forward_pass(bridge_unprocessed, _p1_input, atol=p1_atol)) except Exception as e: if verbose: print(f"✗ Forward pass benchmark failed: {e}\n") # Capture Phase 1 reference for Phase 3 equivalence comparison. + # Skip for audio models (Phase 3 won't run — no HookedTransformer support). # When dtype==float32 (default) and the model natively uses reduced # precision, upcast for maximum accuracy. When the user explicitly # requested a non-float32 dtype, run the reference pass in that dtype # so the entire pipeline honours the requested precision. 
- if bridge_unprocessed is not None: + if bridge_unprocessed is not None and not _is_audio: try: original_dtype = bridge_unprocessed.cfg.dtype needs_upcast = dtype == torch.float32 and original_dtype not in ( @@ -1192,11 +1217,13 @@ def cleanup_model(model, model_name_str: str): print("Running Phase 2 benchmarks...\n") # Generation benchmarks (unprocessed only) - RUN FIRST - # Skip for encoder-decoder models (T5, etc.) which require different generation API - is_enc_dec = is_encoder_decoder_model(model_name) + # Skip for encoder-decoder and audio models (no text generation capability) + _skip_generation = is_encoder_decoder_model(model_name) or getattr( + bridge_unprocessed.cfg, "is_audio_model", False + ) if verbose: print("1. Generation Benchmarks (unprocessed)") - if is_enc_dec: + if _skip_generation: if verbose: print("⏭️ Skipped (encoder-decoder model - requires decoder_input_ids)\n") add_result( @@ -1342,6 +1369,7 @@ def cleanup_model(model, model_name_str: str): should_run_phase(4) and bridge_unprocessed is not None and not is_masked_lm_model(model_name, trust_remote_code=trust_remote_code) + and not is_audio_model(model_name, trust_remote_code=trust_remote_code) ): if verbose: print(f"\n{'='*80}") @@ -1419,6 +1447,57 @@ def cleanup_model(model, model_name_str: str): ) ) + # ======================================================================== + # Phase 8: Audio Tests (only for audio encoder models) + # Runs before Phase 3 so we can reuse bridge_unprocessed before cleanup. 
+ # ======================================================================== + if ( + bridge_unprocessed is not None + and getattr(bridge_unprocessed.cfg, "is_audio_model", False) + and should_run_phase(8) + ): + current_phase[0] = 8 + if verbose: + print("\n" + "=" * 80) + print("PHASE 8: AUDIO TESTS") + print("=" * 80) + print("Testing audio forward pass, caching, representation stability, and features.") + print("=" * 80 + "\n") + + try: + from transformer_lens.benchmarks.audio import run_audio_benchmarks + + test_audio = torch.randn(1, 16000, device=device, dtype=dtype) + audio_results = run_audio_benchmarks( + bridge_unprocessed, + test_audio=test_audio, + verbose=verbose, + ) + for result in audio_results: + result.phase = 8 + results.append(result) + if verbose: + print(result) + + if verbose: + print("\n" + "=" * 80) + print("PHASE 8 COMPLETE") + print("=" * 80) + + except Exception as e: + if verbose: + print(f"\n⚠ Audio tests failed: {e}\n") + results.append( + BenchmarkResult( + name="audio_suite", + passed=False, + severity=BenchmarkSeverity.ERROR, + message=f"Failed to run audio tests: {str(e)}", + details={"error": str(e)}, + phase=8, + ) + ) + # ======================================================================== # PHASE 3: Bridge (processed) + HookedTransformer (processed) # ======================================================================== diff --git a/transformer_lens/config/TransformerBridgeConfig.py b/transformer_lens/config/TransformerBridgeConfig.py index ca55067b5..fb5b887f6 100644 --- a/transformer_lens/config/TransformerBridgeConfig.py +++ b/transformer_lens/config/TransformerBridgeConfig.py @@ -86,6 +86,8 @@ def __init__( eps_attr: str = "eps", rmsnorm_uses_offset: bool = False, attn_implementation: Optional[str] = None, + # Audio model configuration + is_audio_model: bool = False, # Multimodal configuration is_multimodal: bool = False, vision_hidden_size: Optional[int] = None, @@ -174,6 +176,8 @@ def __init__( self.eps_attr = 
eps_attr self.rmsnorm_uses_offset = rmsnorm_uses_offset self.attn_implementation = attn_implementation + # Audio model configuration + self.is_audio_model = is_audio_model # Multimodal configuration self.is_multimodal = is_multimodal self.vision_hidden_size = vision_hidden_size diff --git a/transformer_lens/factories/architecture_adapter_factory.py b/transformer_lens/factories/architecture_adapter_factory.py index 0e21ab84d..37fd62bbd 100644 --- a/transformer_lens/factories/architecture_adapter_factory.py +++ b/transformer_lens/factories/architecture_adapter_factory.py @@ -20,6 +20,7 @@ GraniteArchitectureAdapter, GraniteMoeArchitectureAdapter, GraniteMoeHybridArchitectureAdapter, + HubertArchitectureAdapter, LlamaArchitectureAdapter, LlavaArchitectureAdapter, LlavaNextArchitectureAdapter, @@ -63,6 +64,8 @@ "GptOssForCausalLM": GPTOSSArchitectureAdapter, "GPT2LMHeadCustomModel": Gpt2LmHeadCustomArchitectureAdapter, "GPTJForCausalLM": GptjArchitectureAdapter, + "HubertForCTC": HubertArchitectureAdapter, + "HubertModel": HubertArchitectureAdapter, "LlamaForCausalLM": LlamaArchitectureAdapter, "LlavaForConditionalGeneration": LlavaArchitectureAdapter, "LlavaNextForConditionalGeneration": LlavaNextArchitectureAdapter, diff --git a/transformer_lens/model_bridge/bridge.py b/transformer_lens/model_bridge/bridge.py index 45e4ddce6..469cba39f 100644 --- a/transformer_lens/model_bridge/bridge.py +++ b/transformer_lens/model_bridge/bridge.py @@ -105,7 +105,7 @@ def __init__(self, model: nn.Module, adapter: ArchitectureAdapter, tokenizer: An self.adapter = adapter self.cfg = adapter.cfg self.tokenizer = tokenizer - if self.cfg.d_vocab == -1: + if self.cfg.d_vocab == -1 and self.tokenizer is not None: if hasattr(self.tokenizer, "get_vocab"): vocab = self.tokenizer.get_vocab() self.cfg.d_vocab = max(vocab.values()) + 1 @@ -1214,6 +1214,7 @@ def forward( start_at_layer: Optional[int] = None, stop_at_layer: Optional[int] = None, pixel_values: Optional[torch.Tensor] = None, + 
input_values: Optional[torch.Tensor] = None, **kwargs, ) -> Any: """Forward pass through the model. @@ -1230,6 +1231,9 @@ def forward( pixel_values: Optional image tensor for multimodal models (e.g., LLaVA, Gemma3). The tensor is passed directly to the underlying HuggingFace model. Only valid when cfg.is_multimodal is True. + input_values: Optional audio waveform tensor for audio models (e.g., HuBERT). + The tensor is passed directly to the underlying HuggingFace model. + Only valid when cfg.is_audio_model is True. **kwargs: Additional arguments passed to model Returns: @@ -1252,6 +1256,11 @@ def forward( try: if isinstance(input, (str, list)): + if getattr(self.cfg, "is_audio_model", False): + raise ValueError( + "Audio models require tensor input (raw waveform), not text. " + "Pass a torch.Tensor or use the input_values parameter." + ) input_ids = self.to_tokens( input, prepend_bos=prepend_bos, padding_side=padding_side ) @@ -1323,8 +1332,32 @@ def forward( ) kwargs["pixel_values"] = pixel_values + # Handle input_values for audio models + if input_values is not None: + if not getattr(self.cfg, "is_audio_model", False): + raise ValueError( + "input_values can only be passed to audio models " + "(cfg.is_audio_model must be True)" + ) + kwargs["input_values"] = input_values + + # Audio models take input_values (raw waveform), not input_ids original_tl_cache = past_kv_cache - output = self.original_model(input_ids, **kwargs) + if getattr(self.cfg, "is_audio_model", False): + # For audio models, input is the raw waveform tensor or + # input_values was passed as a keyword argument + if input_values is not None: + output = self.original_model(**kwargs) + elif isinstance(input, torch.Tensor): + kwargs["input_values"] = input + output = self.original_model(**kwargs) + else: + raise ValueError( + "Audio models require tensor input (raw waveform). " + "Pass a torch.Tensor or use input_values parameter." 
+ ) + else: + output = self.original_model(input_ids, **kwargs) if ( original_tl_cache is not None and hasattr(output, "past_key_values") @@ -1361,6 +1394,11 @@ def forward( if return_type == "logits": return logits elif return_type == "loss": + if getattr(self.cfg, "is_audio_model", False): + raise ValueError( + "Audio models do not support return_type='loss'. " + "CTC loss requires aligned frame-level labels." + ) # Always use self.loss_fn for consistency with HT's formula # (log_softmax + gather). HF's output.loss uses F.cross_entropy # which gives different results in bfloat16. @@ -1369,6 +1407,11 @@ def forward( ), f"Expected logits tensor, got {type(logits)}" return self.loss_fn(logits, input_ids, per_token=loss_per_token) elif return_type == "both": + if getattr(self.cfg, "is_audio_model", False): + raise ValueError( + "Audio models do not support return_type='both'. " + "CTC loss requires aligned frame-level labels." + ) assert isinstance( logits, torch.Tensor ), f"Expected logits tensor, got {type(logits)}" diff --git a/transformer_lens/model_bridge/generalized_components/__init__.py b/transformer_lens/model_bridge/generalized_components/__init__.py index 126746a71..ca38829c0 100644 --- a/transformer_lens/model_bridge/generalized_components/__init__.py +++ b/transformer_lens/model_bridge/generalized_components/__init__.py @@ -1,54 +1,77 @@ """Bridge components for transformer architectures.""" -from transformer_lens.model_bridge.generalized_components.attention import AttentionBridge +from transformer_lens.model_bridge.generalized_components.attention import ( + AttentionBridge, +) +from transformer_lens.model_bridge.generalized_components.audio_feature_extractor import ( + AudioFeatureExtractorBridge, +) from transformer_lens.model_bridge.generalized_components.block import BlockBridge -from transformer_lens.model_bridge.generalized_components.embedding import EmbeddingBridge -from transformer_lens.model_bridge.generalized_components.rotary_embedding 
import ( - RotaryEmbeddingBridge, +from transformer_lens.model_bridge.generalized_components.bloom_attention import ( + BloomAttentionBridge, ) -from transformer_lens.model_bridge.generalized_components.pos_embed import PosEmbedBridge -from transformer_lens.model_bridge.generalized_components.normalization import NormalizationBridge -from transformer_lens.model_bridge.generalized_components.rms_normalization import ( - RMSNormalizationBridge, +from transformer_lens.model_bridge.generalized_components.bloom_block import ( + BloomBlockBridge, +) +from transformer_lens.model_bridge.generalized_components.bloom_mlp import ( + BloomMLPBridge, +) +from transformer_lens.model_bridge.generalized_components.clip_vision_encoder import ( + CLIPVisionEncoderBridge, + CLIPVisionEncoderLayerBridge, ) -from transformer_lens.model_bridge.generalized_components.linear import LinearBridge from transformer_lens.model_bridge.generalized_components.conv1d import Conv1DBridge +from transformer_lens.model_bridge.generalized_components.conv_pos_embed import ( + ConvPosEmbedBridge, +) +from transformer_lens.model_bridge.generalized_components.embedding import ( + EmbeddingBridge, +) +from transformer_lens.model_bridge.generalized_components.gated_mlp import ( + GatedMLPBridge, +) +from transformer_lens.model_bridge.generalized_components.joint_gate_up_mlp import ( + JointGateUpMLPBridge, +) from transformer_lens.model_bridge.generalized_components.joint_qkv_attention import ( JointQKVAttentionBridge, ) from transformer_lens.model_bridge.generalized_components.joint_qkv_position_embeddings_attention import ( JointQKVPositionEmbeddingsAttentionBridge, ) -from transformer_lens.model_bridge.generalized_components.position_embeddings_attention import ( - PositionEmbeddingsAttentionBridge, -) +from transformer_lens.model_bridge.generalized_components.linear import LinearBridge from transformer_lens.model_bridge.generalized_components.mlp import MLPBridge -from 
transformer_lens.model_bridge.generalized_components.gated_mlp import GatedMLPBridge from transformer_lens.model_bridge.generalized_components.moe import MoEBridge -from transformer_lens.model_bridge.generalized_components.joint_gate_up_mlp import ( - JointGateUpMLPBridge, +from transformer_lens.model_bridge.generalized_components.normalization import ( + NormalizationBridge, ) -from transformer_lens.model_bridge.generalized_components.symbolic import SymbolicBridge -from transformer_lens.model_bridge.generalized_components.unembedding import UnembeddingBridge -from transformer_lens.model_bridge.generalized_components.t5_block import T5BlockBridge -from transformer_lens.model_bridge.generalized_components.bloom_block import BloomBlockBridge -from transformer_lens.model_bridge.generalized_components.bloom_attention import ( - BloomAttentionBridge, +from transformer_lens.model_bridge.generalized_components.pos_embed import ( + PosEmbedBridge, ) -from transformer_lens.model_bridge.generalized_components.bloom_mlp import BloomMLPBridge -from transformer_lens.model_bridge.generalized_components.clip_vision_encoder import ( - CLIPVisionEncoderBridge, - CLIPVisionEncoderLayerBridge, +from transformer_lens.model_bridge.generalized_components.position_embeddings_attention import ( + PositionEmbeddingsAttentionBridge, +) +from transformer_lens.model_bridge.generalized_components.rms_normalization import ( + RMSNormalizationBridge, +) +from transformer_lens.model_bridge.generalized_components.rotary_embedding import ( + RotaryEmbeddingBridge, ) from transformer_lens.model_bridge.generalized_components.siglip_vision_encoder import ( SiglipVisionEncoderBridge, SiglipVisionEncoderLayerBridge, ) +from transformer_lens.model_bridge.generalized_components.symbolic import SymbolicBridge +from transformer_lens.model_bridge.generalized_components.t5_block import T5BlockBridge +from transformer_lens.model_bridge.generalized_components.unembedding import ( + UnembeddingBridge, +) from 
class AudioFeatureExtractorBridge(GeneralizedComponent):
    """Bridge around the multi-layer 1D CNN that turns raw waveforms into features.

    ``hook_in`` receives the raw waveform before the wrapped extractor runs and
    ``hook_out`` receives the extracted features. ``hook_audio_features`` is an
    alias for ``hook_out``.
    """

    hook_aliases = {"hook_audio_features": "hook_out"}

    def __init__(
        self,
        name: str,
        config: Optional[Any] = None,
        submodules: Optional[Dict[str, GeneralizedComponent]] = None,
    ):
        """Create the bridge.

        Args:
            name: Name handed to the ``GeneralizedComponent`` base class.
            config: Optional configuration object, forwarded unchanged.
            submodules: Optional child components; a missing or empty value is
                replaced by a fresh empty dict.
        """
        child_components = submodules if submodules else {}
        super().__init__(name, config, submodules=child_components)

    def forward(
        self,
        input_values: torch.Tensor,
        **kwargs: Any,
    ) -> torch.Tensor:
        """Run the wrapped extractor with hooks on its input and output.

        input_values: [batch, num_samples] -> [batch, conv_dim, num_frames]

        Args:
            input_values: Waveform tensor passed through ``hook_in`` first.
            **kwargs: Extra keyword arguments forwarded to the wrapped component.

        Returns:
            The hooked output. When the wrapped component returns a tuple, only
            its first element goes through ``hook_out``; the rest is returned
            unchanged in the same positions.

        Raises:
            RuntimeError: If no original component has been attached yet.
        """
        if self.original_component is None:
            raise RuntimeError(
                f"Original component not set for {self.name}. "
                "Call set_original_component() first."
            )

        hooked_waveform = self.hook_in(input_values)
        extracted = self.original_component(hooked_waveform, **kwargs)

        if not isinstance(extracted, tuple):
            return self.hook_out(extracted)

        # Tuple result: hook the leading entry only, keep the remainder as-is.
        primary, trailing = extracted[0], extracted[1:]
        return (self.hook_out(primary),) + trailing


class ConvPosEmbedBridge(GeneralizedComponent):
    """Bridge around a grouped 1D convolution that yields relative positional information.

    In contrast to PosEmbedBridge (lookup table) and RotaryEmbeddingBridge
    (rotation matrices), the wrapped module is applied to the hidden states
    themselves. ``hook_pos_embed`` is an alias for ``hook_out``.
    """

    hook_aliases = {"hook_pos_embed": "hook_out"}

    def __init__(
        self,
        name: str,
        config: Optional[Any] = None,
        submodules: Optional[Dict[str, GeneralizedComponent]] = None,
    ):
        """Create the bridge.

        Args:
            name: Name handed to the ``GeneralizedComponent`` base class.
            config: Optional configuration object, forwarded unchanged.
            submodules: Optional child components; a missing or empty value is
                replaced by a fresh empty dict.
        """
        child_components = submodules if submodules else {}
        super().__init__(name, config, submodules=child_components)

    def forward(
        self,
        hidden_states: torch.Tensor,
        **kwargs: Any,
    ) -> torch.Tensor:
        """Run the wrapped positional convolution with hooks on its input and output.

        hidden_states: [batch, seq_len, hidden_size] -> [batch, seq_len, hidden_size]

        Args:
            hidden_states: Tensor passed through ``hook_in`` first.
            **kwargs: Extra keyword arguments forwarded to the wrapped component.

        Returns:
            The hooked output. When the wrapped component returns a tuple, only
            its first element goes through ``hook_out``; the rest is returned
            unchanged in the same positions.

        Raises:
            RuntimeError: If no original component has been attached yet.
        """
        if self.original_component is None:
            raise RuntimeError(
                f"Original component not set for {self.name}. "
                "Call set_original_component() first."
            )

        hooked_states = self.hook_in(hidden_states)
        embedded = self.original_component(hooked_states, **kwargs)

        if not isinstance(embedded, tuple):
            return self.hook_out(embedded)

        # Tuple result: hook the leading entry only, keep the remainder as-is.
        primary, trailing = embedded[0], embedded[1:]
        return (self.hook_out(primary),) + trailing
"GPT2LMHeadModel", + "hubert": "HubertModel", "llama": "LlamaForCausalLM", "mistral": "MistralForCausalLM", "mixtral": "MixtralForCausalLM", @@ -229,6 +239,7 @@ def get_hf_model_class_for_architecture(architecture: str): Uses centralized architecture sets from utilities.architectures. """ from transformer_lens.utilities.architectures import ( + AUDIO_ARCHITECTURES, MASKED_LM_ARCHITECTURES, MULTIMODAL_ARCHITECTURES, SEQ2SEQ_ARCHITECTURES, @@ -242,6 +253,14 @@ def get_hf_model_class_for_architecture(architecture: str): from transformers import AutoModelForImageTextToText return AutoModelForImageTextToText + elif architecture in AUDIO_ARCHITECTURES: + if "ForCTC" in architecture: + from transformers import AutoModelForCTC + + return AutoModelForCTC + from transformers import AutoModel + + return AutoModel else: return AutoModelForCausalLM @@ -377,7 +396,11 @@ def boot( tokenizer = tokenizer default_padding_side = getattr(adapter.cfg, "default_padding_side", None) use_fast = getattr(adapter.cfg, "use_fast", True) - if tokenizer is not None: + # Audio models use feature extractors, not text tokenizers + _is_audio = getattr(adapter.cfg, "is_audio_model", False) + if _is_audio and tokenizer is None: + tokenizer = None # Skip tokenizer loading for audio models + elif tokenizer is not None: tokenizer = setup_tokenizer(tokenizer, default_padding_side=default_padding_side) else: token_arg = get_hf_token() @@ -484,6 +507,21 @@ def boot( except Exception: pass # Processor not available; user can set bridge.processor manually + # Load feature extractor for audio models (needed for audio preprocessing) + if getattr(adapter.cfg, "is_audio_model", False): + try: + from transformers import AutoFeatureExtractor + + huggingface_token = os.environ.get("HF_TOKEN", "") + token_arg = huggingface_token if len(huggingface_token) > 0 else None + bridge.processor = AutoFeatureExtractor.from_pretrained( + model_name, + token=token_arg, + trust_remote_code=trust_remote_code, + ) + except 
Exception: + pass # Feature extractor not available; user can set bridge.processor manually + return bridge diff --git a/transformer_lens/model_bridge/supported_architectures/__init__.py b/transformer_lens/model_bridge/supported_architectures/__init__.py index a9dff24b5..ac1a334e2 100644 --- a/transformer_lens/model_bridge/supported_architectures/__init__.py +++ b/transformer_lens/model_bridge/supported_architectures/__init__.py @@ -45,6 +45,9 @@ from transformer_lens.model_bridge.supported_architectures.granite_moe_hybrid import ( GraniteMoeHybridArchitectureAdapter, ) +from transformer_lens.model_bridge.supported_architectures.hubert import ( + HubertArchitectureAdapter, +) from transformer_lens.model_bridge.supported_architectures.llama import ( LlamaArchitectureAdapter, ) @@ -136,6 +139,7 @@ "GPTOSSArchitectureAdapter", "Gpt2LmHeadCustomArchitectureAdapter", "GptjArchitectureAdapter", + "HubertArchitectureAdapter", "LlamaArchitectureAdapter", "LlavaArchitectureAdapter", "LlavaNextArchitectureAdapter", diff --git a/transformer_lens/model_bridge/supported_architectures/hubert.py b/transformer_lens/model_bridge/supported_architectures/hubert.py new file mode 100644 index 000000000..2f73b311a --- /dev/null +++ b/transformer_lens/model_bridge/supported_architectures/hubert.py @@ -0,0 +1,179 @@ +"""HuBERT architecture adapter. + +Supports HubertModel (bare encoder) and HubertForCTC (with CTC head). +Encoder blocks are structurally identical to BERT (post-LN by default, +pre-LN when do_stable_layer_norm=True). 
class HubertArchitectureAdapter(ArchitectureAdapter):
    """Architecture adapter for HuBERT audio models.

    Covers the bare encoder (HubertModel) and the CTC variant (HubertForCTC).
    HubertForCTC keeps the encoder under a ``hubert.`` prefix; ``prepare_model()``
    detects that and rebuilds the component paths with the prefix.
    """

    def __init__(self, cfg: Any) -> None:
        """Set HuBERT config flags, weight conversions and the default mapping.

        Args:
            cfg: Bridge configuration; ``n_heads`` is read from it and the
                HuBERT-specific flags below are written onto it.
        """
        super().__init__(cfg)

        # Fixed HuBERT traits recorded on the bridge config.
        fixed_flags = {
            "is_audio_model": True,
            "normalization_type": "LN",
            "positional_embedding_type": "conv",
            "final_rms": False,
            "gated_mlp": False,
            "attn_only": False,
        }
        for flag_name, flag_value in fixed_flags.items():
            setattr(self.cfg, flag_name, flag_value)

        # Pre-LN (True) vs post-LN (False); prepare_loading() refreshes this
        # from the HF config when one is supplied.
        stable_ln = getattr(self.cfg, "do_stable_layer_norm", False)
        self._do_stable_layer_norm = stable_ln
        self.supports_fold_ln = stable_ln

        n_heads = self.cfg.n_heads

        def _rearranged(pattern: str) -> ParamProcessingConversion:
            # Every entry gets its own conversion objects, split over n_heads.
            return ParamProcessingConversion(
                tensor_conversion=RearrangeTensorConversion(pattern, h=n_heads),
            )

        # Q/K/V/O rearrangement (same pattern as BERT): weights first, then
        # biases, then the output projection weight.
        conversions = {}
        for proj in ("q", "k", "v"):
            conversions["blocks.{i}.attn." + proj + ".weight"] = _rearranged(
                "(h d_head) d_model -> h d_model d_head"
            )
        for proj in ("q", "k", "v"):
            conversions["blocks.{i}.attn." + proj + ".bias"] = _rearranged(
                "(h d_head) -> h d_head"
            )
        conversions["blocks.{i}.attn.o.weight"] = _rearranged(
            "d_model (h d_head) -> h d_head d_model"
        )
        self.weight_processing_conversions = conversions

        # Un-prefixed mapping for the bare HubertModel; prepare_model() swaps
        # in the "hubert."-prefixed one for HubertForCTC.
        self.component_mapping = self._build_component_mapping(prefix="")

    def _build_component_mapping(self, prefix: str) -> dict:
        """Return the component mapping with every top-level path prefixed.

        Args:
            prefix: ``""`` for HubertModel, ``"hubert."`` for HubertForCTC.

        Returns:
            Dict from bridge component names to bridge instances.
        """

        def _layer_norm(path: str) -> NormalizationBridge:
            # All layer norms in this mapping share the same construction.
            return NormalizationBridge(
                name=path,
                config=self.cfg,
                use_native_layernorm_autograd=True,
            )

        feature_extractor = AudioFeatureExtractorBridge(
            name=f"{prefix}feature_extractor",
        )
        feature_projection = GeneralizedComponent(
            name=f"{prefix}feature_projection",
        )
        positional_conv = ConvPosEmbedBridge(
            name=f"{prefix}encoder.pos_conv_embed",
        )
        encoder_norm = _layer_norm(f"{prefix}encoder.layer_norm")

        pre_attn_norm = _layer_norm("layer_norm")
        post_attn_norm = _layer_norm("final_layer_norm")
        attention = AttentionBridge(
            name="attention",
            config=self.cfg,
            submodules={
                "q": LinearBridge(name="q_proj"),
                "k": LinearBridge(name="k_proj"),
                "v": LinearBridge(name="v_proj"),
                "o": LinearBridge(name="out_proj"),
            },
        )
        feed_forward = MLPBridge(
            name="feed_forward",
            config=self.cfg,
            submodules={
                "in": LinearBridge(name="intermediate_dense"),
                "out": LinearBridge(name="output_dense"),
            },
        )
        encoder_blocks = BlockBridge(
            name=f"{prefix}encoder.layers",
            # Point the MLP hooks at the actual linear-layer hooks (same as BERT).
            hook_alias_overrides={
                "hook_mlp_out": "mlp.out.hook_out",
                "hook_mlp_in": "mlp.in.hook_in",
            },
            submodules={
                "ln1": pre_attn_norm,
                "ln2": post_attn_norm,
                "attn": attention,
                "mlp": feed_forward,
            },
        )

        mapping: dict[str, Any] = {
            "audio_feature_extractor": feature_extractor,
            "feat_proj": feature_projection,
            "conv_pos_embed": positional_conv,
            "embed_ln": encoder_norm,
            "blocks": encoder_blocks,
        }
        return mapping

    def prepare_loading(self, model_name: str, model_kwargs: dict) -> None:
        """Copy HuBERT-specific HF config attributes onto the bridge config.

        Does nothing when ``model_kwargs`` carries no ``"config"`` entry.
        Otherwise ``do_stable_layer_norm`` (default ``False``) is propagated to
        the bridge config and to ``supports_fold_ln``, and the un-prefixed
        component mapping is rebuilt.

        Args:
            model_name: Name of the model being loaded (not used here).
            model_kwargs: Loading kwargs; only the ``"config"`` entry is read.
        """
        hf_config = model_kwargs.get("config")
        if hf_config is not None:
            # Pre-LN vs post-LN decides whether LN folding is marked as supported.
            stable_ln = getattr(hf_config, "do_stable_layer_norm", False)
            self.cfg.do_stable_layer_norm = stable_ln  # type: ignore[attr-defined]
            self._do_stable_layer_norm = stable_ln
            self.supports_fold_ln = stable_ln

            # hidden_act and layer_norm_eps are mapped globally in
            # map_default_transformer_lens_config(), so they are not handled here.
            self.component_mapping = self._build_component_mapping(prefix="")

    def prepare_model(self, hf_model: Any) -> None:
        """Switch to the ``hubert.``-prefixed mapping and add the CTC head if present.

        Args:
            hf_model: Loaded HF model; a ``hubert`` attribute marks HubertForCTC.
        """
        if not hasattr(hf_model, "hubert"):
            return
        self.component_mapping = self._build_component_mapping(prefix="hubert.")
        self.component_mapping["unembed"] = UnembeddingBridge(name="lm_head")
"total_unsupported_architectures": 258, + "total_unsupported_models": 1031, "gaps": [ { "architecture_id": "Qwen3MoeForCausalLM", - "total_models": 66, + "total_models": 68, "sample_models": [ "Qwen/Qwen3-30B-A3B", - "Qwen/Qwen3-30B-A3B-Thinking-2507", "Qwen/Qwen3-30B-A3B-Instruct-2507", + "Qwen/Qwen3-30B-A3B-Thinking-2507", "Qwen/Qwen3-Coder-30B-A3B-Instruct", "Qwen/Qwen3-235B-A22B", "trl-internal-testing/tiny-Qwen3MoeForCausalLM", "Qwen/Qwen3-235B-A22B-Instruct-2507", "Qwen/Qwen3-Coder-480B-A35B-Instruct", - "nvidia/Qwen3-30B-A3B-NVFP4", - "NVFP4/Qwen3-30B-A3B-Instruct-2507-FP4" + "NVFP4/Qwen3-30B-A3B-Instruct-2507-FP4", + "nvidia/Qwen3-30B-A3B-NVFP4" ] }, { "architecture_id": "DeepseekV3ForCausalLM", - "total_models": 51, + "total_models": 53, "sample_models": [ "deepseek-ai/DeepSeek-R1", - "deepseek-ai/DeepSeek-V3", "deepseek-ai/DeepSeek-R1-0528", + "deepseek-ai/DeepSeek-V3", "deepseek-ai/DeepSeek-V3-0324", "nvidia/DeepSeek-R1-0528-NVFP4-v2", "deepseek-ai/DeepSeek-V3.1", "ai-sage/GigaChat3-10B-A1.8B", "trl-internal-testing/tiny-DeepseekV3ForCausalLM", - "trl-internal-testing/tiny-DeepseekV3ForCausalLM-0528", - "nvidia/DeepSeek-V3-0324-NVFP4" + "nvidia/DeepSeek-V3-0324-NVFP4", + "moonshotai/Kimi-K2-Instruct" ] }, { "architecture_id": "Qwen3_5ForConditionalGeneration", - "total_models": 42, + "total_models": 46, "sample_models": [ "Jackrong/Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled", "osoleve/Qwen3.5-27B-Text-NVFP4-MTP", - "nightmedia/Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled-qx64-hi-mlx", "Tesslate/OmniCoder-9B", - "txn545/Qwen3.5-27B-NVFP4", + "nightmedia/Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled-qx64-hi-mlx", "Jackrong/Qwen3.5-9B-Claude-4.6-Opus-Reasoning-Distilled", - "EganAI/qwen3.5-9b-terminal-merge", + "txn545/Qwen3.5-27B-NVFP4", + "mconcat/Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled-NVFP4", "Jackrong/Qwen3.5-4B-Claude-4.6-Opus-Reasoning-Distilled", - "Jackrong/Qwen3.5-2B-Claude-4.6-Opus-Reasoning-Distilled", - 
"nightmedia/Qwen3.5-27B-Text" + "EganAI/qwen3.5-9b-terminal-merge", + "Jackrong/Qwen3.5-2B-Claude-4.6-Opus-Reasoning-Distilled" ] }, { "architecture_id": "Qwen3NextForCausalLM", - "total_models": 37, + "total_models": 35, "sample_models": [ "Qwen/Qwen3-Coder-Next", "Qwen/Qwen3-Next-80B-A3B-Instruct", @@ -75,14 +75,14 @@ }, { "architecture_id": "FalconForCausalLM", - "total_models": 31, + "total_models": 32, "sample_models": [ "tiiuae/falcon-7b", "tiiuae/falcon-7b-instruct", "tiiuae/falcon-40b-instruct", "tiiuae/falcon-40b", - "fxmarty/really-tiny-falcon-testing", "tiiuae/falcon-rw-1b", + "fxmarty/really-tiny-falcon-testing", "vilsonrodrigues/falcon-7b-instruct-sharded", "tiiuae/falcon-11B", "euclaise/falcon_1b_stage2", @@ -91,44 +91,28 @@ }, { "architecture_id": "Qwen3_5MoeForConditionalGeneration", - "total_models": 27, + "total_models": 28, "sample_models": [ "txn545/Qwen3.5-122B-A10B-NVFP4", "nvidia/Qwen3.5-397B-A17B-NVFP4", "txn545/Qwen3.5-35B-A3B-NVFP4", "RepublicOfKorokke/Qwen3.5-35B-A3B-mlx-lm-mxfp4", "nightmedia/Qwen3.5-35B-A3B-Text-qx64-hi-mlx", + "lukealonso/Qwen3.5-397B-A17B-NVFP4", "nightmedia/Qwen3.5-122B-A10B-Text-mxfp4-mlx", "olka-fi/Qwen3.5-122B-A10B-MXFP4", - "lukealonso/Qwen3.5-397B-A17B-NVFP4", "Jackrong/Qwen3.5-35B-A3B-Claude-4.6-Opus-Reasoning-Distilled", "NexVeridian/Qwen3.5-35B-A3B-3bit" ] }, - { - "architecture_id": "InternLM2ForCausalLM", - "total_models": 21, - "sample_models": [ - "internlm/internlm2-chat-7b", - "internlm/internlm2_5-7b-chat", - "internlm/internlm2-7b", - "internlm/internlm2-20b", - "internlm/internlm2-base-7b", - "internlm/internlm2-chat-20b", - "internlm/internlm2-base-20b", - "chujiezheng/internlm2-chat-7b-ExPO", - "chujiezheng/internlm2-chat-20b-ExPO", - "AI4Chem/ChemLLM-7B-Chat-1_5-DPO" - ] - }, { "architecture_id": "Lfm2ForCausalLM", - "total_models": 19, + "total_models": 21, "sample_models": [ "LiquidAI/LFM2-1.2B", "LiquidAI/LFM2.5-1.2B-Instruct", - "LiquidAI/LFM2-350M", "LiquidAI/LFM2.5-1.2B-Base", + 
"LiquidAI/LFM2-350M", "LiquidAI/LFM2.5-1.2B-Thinking", "LiquidAI/LFM2-2.6B", "LiquidAI/LFM2-2.6B-Exp", @@ -137,15 +121,31 @@ "LiquidAI/LFM2.5-1.2B-Thinking-ONNX" ] }, + { + "architecture_id": "InternLM2ForCausalLM", + "total_models": 19, + "sample_models": [ + "internlm/internlm2-chat-7b", + "internlm/internlm2_5-7b-chat", + "internlm/internlm2-7b", + "internlm/internlm2-20b", + "internlm/internlm2-base-7b", + "internlm/internlm2-chat-20b", + "internlm/internlm2-base-20b", + "chujiezheng/internlm2-chat-20b-ExPO", + "chujiezheng/internlm2-chat-7b-ExPO", + "AI4Chem/ChemLLM-7B-Chat-1_5-DPO" + ] + }, { "architecture_id": "Glm4MoeForCausalLM", - "total_models": 17, + "total_models": 18, "sample_models": [ "zai-org/GLM-4.5-Air", "zai-org/GLM-4.7", "trl-internal-testing/tiny-Glm4MoeForCausalLM", - "zai-org/GLM-4.6", "zai-org/GLM-4.5", + "zai-org/GLM-4.6", "Tengyunw/GLM-4.7-NVFP4", "Salyut1/GLM-4.7-NVFP4", "np-cr/testing-glm4-moe", @@ -158,15 +158,15 @@ "total_models": 17, "sample_models": [ "ai21labs/AI21-Jamba-Mini-1.5", - "ai21labs/AI21-Jamba2-3B", "ai21labs/Jamba-tiny-random", + "ai21labs/AI21-Jamba2-3B", "ai21labs/AI21-Jamba-Reasoning-3B", "ai21labs/AI21-Jamba-Large-1.5", "ai21labs/AI21-Jamba-Mini-1.6", "ai21labs/AI21-Jamba-Large-1.6", "microsoft/Dayhoff-170m-GR", "ai21labs/Jamba-v0.1", - "microsoft/Dayhoff-170m-UR90" + "microsoft/Dayhoff-170M-GRS-112000" ] }, { @@ -191,30 +191,30 @@ "sample_models": [ "tiiuae/Falcon-H1-Tiny-90M-Instruct", "tiiuae/Falcon-H1-0.5B-Base", - "tiiuae/Falcon-H1-7B-Instruct", "tiiuae/Falcon-H1R-7B", - "tiiuae/Falcon-H1-34B-Instruct", + "tiiuae/Falcon-H1-7B-Instruct", "tiiuae/Falcon-H1-34B-Base", + "tiiuae/Falcon-H1-34B-Instruct", "tiiuae/Falcon-H1-1.5B-Base", "tiiuae/Falcon-H1-7B-Base", "tiiuae/Falcon-H1-3B-Base", - "tiiuae/Falcon-H1-1.5B-Instruct" + "tiiuae/Falcon-H1-1.5B-Deep-Base" ] }, { - "architecture_id": "MiniMaxM2ForCausalLM", + "architecture_id": "NemotronHForCausalLM", "total_models": 15, "sample_models": [ - 
"MiniMaxAI/MiniMax-M2.5", - "MiniMaxAI/MiniMax-M2", - "cerebras/MiniMax-M2.1-REAP-139B-A10B", - "MiniMaxAI/MiniMax-M2.1", - "cerebras/MiniMax-M2.5-REAP-139B-A10B", - "PrimeIntellect/MiniMax-M2.5-bf16", - "cerebras/MiniMax-M2.5-REAP-172B-A10B", - "saricles/MiniMax-M2.5-REAP-172B-A10B-NVFP4-GB10", - "amd/MiniMax-M2.1-MXFP4", - "aspctu/MiniMax-M2.5" + "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", + "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4", + "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4", + "nvidia/NVIDIA-Nemotron-Nano-9B-v2-Japanese", + "nvidia/NVIDIA-Nemotron-Nano-9B-v2", + "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16", + "unsloth/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4", + "OpenResearcher/OpenResearcher-30B-A3B", + "nvidia/NVIDIA-Nemotron-Nano-9B-v2-NVFP4", + "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-Base-BF16" ] }, { @@ -228,25 +228,25 @@ "ibm-granite/granite-20b-code-base-8k", "ibm-granite/granite-20b-code-instruct-8k", "HuggingFaceH4/starchat-beta", - "bigcode/starcoderbase-3b", "HuggingFaceH4/starchat-alpha", - "openchat/opencoderplus" + "LoupGarou/WizardCoder-Guanaco-15B-V1.1", + "Danielbrdz/CodeBarcenas-1b" ] }, { - "architecture_id": "NemotronHForCausalLM", + "architecture_id": "MiniMaxM2ForCausalLM", "total_models": 14, "sample_models": [ - "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", - "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4", - "nvidia/NVIDIA-Nemotron-Nano-9B-v2-Japanese", - "nvidia/NVIDIA-Nemotron-Nano-9B-v2", - "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4", - "unsloth/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4", - "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16", - "OpenResearcher/OpenResearcher-30B-A3B", - "nvidia/NVIDIA-Nemotron-Nano-9B-v2-NVFP4", - "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-Base-BF16" + "MiniMaxAI/MiniMax-M2.5", + "cerebras/MiniMax-M2.1-REAP-139B-A10B", + "MiniMaxAI/MiniMax-M2", + "MiniMaxAI/MiniMax-M2.1", + "cerebras/MiniMax-M2.5-REAP-139B-A10B", + "PrimeIntellect/MiniMax-M2.5-bf16", + 
"cerebras/MiniMax-M2.5-REAP-172B-A10B", + "saricles/MiniMax-M2.5-REAP-172B-A10B-NVFP4-GB10", + "aspctu/MiniMax-M2.5", + "amd/MiniMax-M2.1-MXFP4" ] }, { @@ -260,11 +260,27 @@ "facebook/xglm-4.5B", "KoboldAI/fairseq-dense-125M", "KoboldAI/fairseq-dense-2.7B", - "KoboldAI/fairseq-dense-1.3B", "KoboldAI/fairseq-dense-355M", + "KoboldAI/fairseq-dense-1.3B", "KoboldAI/fairseq-dense-6.7B" ] }, + { + "architecture_id": "Glm4MoeLiteForCausalLM", + "total_models": 13, + "sample_models": [ + "zai-org/GLM-4.7-Flash", + "GadflyII/GLM-4.7-Flash-NVFP4", + "unsloth/GLM-4.7-Flash", + "GadflyII/GLM-4.7-Flash-MTP-NVFP4", + "Olafangensan/GLM-4.7-Flash-heretic", + "cerebras/GLM-4.7-Flash-REAP-23B-A3B", + "huihui-ai/Huihui-GLM-4.7-Flash-abliterated", + "TeichAI/GLM-4.7-Flash-Claude-Opus-4.5-High-Reasoning-Distill", + "Ex0bit/GLM-4.7-Flash-PRISM", + "MuXodious/GLM-4.7-Flash-absolute-heresy" + ] + }, { "architecture_id": "CodeGenForCausalLM", "total_models": 13, @@ -297,22 +313,6 @@ "RWKV/rwkv-raven-7b" ] }, - { - "architecture_id": "Glm4MoeLiteForCausalLM", - "total_models": 12, - "sample_models": [ - "zai-org/GLM-4.7-Flash", - "GadflyII/GLM-4.7-Flash-NVFP4", - "unsloth/GLM-4.7-Flash", - "GadflyII/GLM-4.7-Flash-MTP-NVFP4", - "Olafangensan/GLM-4.7-Flash-heretic", - "huihui-ai/Huihui-GLM-4.7-Flash-abliterated", - "cerebras/GLM-4.7-Flash-REAP-23B-A3B", - "TeichAI/GLM-4.7-Flash-Claude-Opus-4.5-High-Reasoning-Distill", - "Ex0bit/GLM-4.7-Flash-PRISM", - "MuXodious/GLM-4.7-Flash-absolute-heresy" - ] - }, { "architecture_id": "DeepseekV2ForCausalLM", "total_models": 11, @@ -320,13 +320,13 @@ "deepseek-ai/DeepSeek-V2-Lite-Chat", "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct", "deepseek-ai/DeepSeek-V2-Lite", - "deepseek-ai/DeepSeek-V2.5", "deepseek-ai/DeepSeek-V2-Chat", "deepseek-ai/DeepSeek-Coder-V2-Instruct-0724", "deepseek-ai/DeepSeek-V2", + "deepseek-ai/DeepSeek-V2.5", "deepseek-ai/DeepSeek-Coder-V2-Instruct", - "deepseek-ai/DeepSeek-Coder-V2-Lite-Base", - "deepseek-ai/DeepSeek-V2-Chat-0628" + 
"deepseek-ai/DeepSeek-V2-Chat-0628", + "deepseek-ai/DeepSeek-Coder-V2-Lite-Base" ] }, { @@ -350,13 +350,13 @@ "total_models": 10, "sample_models": [ "google/t5gemma-s-s-prefixlm", - "google/t5gemma-b-b-ul2", "google/t5gemma-9b-9b-ul2", + "google/t5gemma-b-b-ul2", "google/t5gemma-2b-2b-ul2", "google/t5gemma-b-b-prefixlm", "google/t5gemma-9b-9b-ul2-it", - "google/t5gemma-9b-2b-ul2-it", "google/t5gemma-2b-2b-prefixlm", + "google/t5gemma-9b-2b-ul2-it", "google/t5gemma-l-l-prefixlm", "harshaljanjani/tiny-t5gemma-test" ] @@ -379,7 +379,7 @@ }, { "architecture_id": "DeciLMForCausalLM", - "total_models": 10, + "total_models": 9, "sample_models": [ "nvidia/Llama-3_3-Nemotron-Super-49B-v1_5", "nvidia/Llama-3_3-Nemotron-Super-49B-v1", @@ -389,8 +389,22 @@ "NewstaR/Porpoise-6b-instruct", "Danielbrdz/Barcenas-6b", "nvidia/Llama-3_1-Nemotron-Ultra-253B-v1", - "nvidia/Llama-3_1-Nemotron-51B-Instruct", - "nvidia/Llama-3_3-Nemotron-Super-49B-GenRM" + "nvidia/Llama-3_1-Nemotron-51B-Instruct" + ] + }, + { + "architecture_id": "DFlashDraftModel", + "total_models": 9, + "sample_models": [ + "z-lab/Qwen3-4B-DFlash-b16", + "z-lab/Qwen3-8B-DFlash-b16", + "z-lab/Qwen3.5-9B-DFlash", + "z-lab/gpt-oss-20b-DFlash", + "z-lab/gpt-oss-120b-DFlash", + "z-lab/LLaMA3.1-8B-Instruct-DFlash-UltraChat", + "z-lab/Qwen3.5-35B-A3B-DFlash", + "z-lab/Qwen3-Coder-30B-A3B-DFlash", + "z-lab/Qwen3.5-4B-DFlash" ] }, { @@ -435,41 +449,27 @@ "dreuxx26/Multilingual-grammar-Corrector-using-mT5-small" ] }, - { - "architecture_id": "DFlashDraftModel", - "total_models": 8, - "sample_models": [ - "z-lab/Qwen3-4B-DFlash-b16", - "z-lab/Qwen3-8B-DFlash-b16", - "z-lab/Qwen3.5-9B-DFlash", - "z-lab/gpt-oss-20b-DFlash", - "z-lab/LLaMA3.1-8B-Instruct-DFlash-UltraChat", - "z-lab/gpt-oss-120b-DFlash", - "z-lab/Qwen3.5-35B-A3B-DFlash", - "z-lab/Qwen3-Coder-30B-A3B-DFlash" - ] - }, { "architecture_id": "Qwen3_5ForCausalLM", "total_models": 8, "sample_models": [ "lukey03/Qwen3.5-9B-abliterated", "osoleve/Qwen3.5-9B-Base-Text-NVFP4", 
- "Green-eyedDevil/Monika-9B", "Phonsiri/Qwen3.5-9B-Thai-Law-Base", + "Green-eyedDevil/Monika-9B", "eerwitt/qwen-h-neurons-honest", "rahul7star/albeit", - "nahidstaq/html-section-retriever", - "nbeerbower/Huihui-Qwen3.5-9B-abliterated-Grimoire-ORPO" + "nbeerbower/Huihui-Qwen3.5-9B-abliterated-Grimoire-ORPO", + "nahidstaq/html-section-retriever" ] }, { "architecture_id": "MPTForCausalLM", "total_models": 8, "sample_models": [ - "echarlaix/tiny-mpt-random-remote-code", "anas-awadalla/mpt-7b", "wtang06/mpt-125m-c4", + "echarlaix/tiny-mpt-random-remote-code", "lightblue/japanese-mpt-7b", "vinai/PhoGPT-4B", "Nethermind/Mpt-Instruct-DotNet-S", @@ -512,8 +512,21 @@ "optimum-internal-testing/tiny-random-SmolLM3ForCausalLM", "onnx-internal-testing/tiny-random-SmolLM3ForCausalLM", "HuggingFaceTB/SmolLM3-3B-ONNX", - "toroe/SmolLM-3B-Science-ES", - "N-Bot-Int/SmolSam3-MEMGRPO" + "N-Bot-Int/SmolSam3-MEMGRPO", + "toroe/SmolLM-3B-Science-ES" + ] + }, + { + "architecture_id": "ProGenForCausalLM", + "total_models": 7, + "sample_models": [ + "hugohrban/progen2-base", + "hugohrban/progen2-small", + "hugohrban/progen2-medium", + "hugohrban/progen2-oas", + "hugohrban/progen2-small-mix7", + "hugohrban/progen2-large", + "hugohrban/progen2-xlarge" ] }, { @@ -553,15 +566,27 @@ ] }, { - "architecture_id": "ProGenForCausalLM", + "architecture_id": "NemotronForCausalLM", "total_models": 6, "sample_models": [ - "hugohrban/progen2-small", - "hugohrban/progen2-base", - "hugohrban/progen2-medium", - "hugohrban/progen2-oas", - "hugohrban/progen2-xlarge", - "hugohrban/progen2-small-mix7" + "nvidia/Nemotron-Mini-4B-Instruct", + "nvidia/Minitron-8B-Base", + "badaoui/tiny-random-NemotronForCausalLM", + "nvidia/Minitron-4B-Base", + "thhaus/nemotron3-8b", + "dmvevents/Nemotron-Mini-4B-Instruct" + ] + }, + { + "architecture_id": "HyenaDNAForCausalLM", + "total_models": 6, + "sample_models": [ + "LongSafari/hyenadna-small-32k-seqlen-hf", + "LongSafari/hyenadna-tiny-1k-seqlen-hf", + 
"LongSafari/hyenadna-large-1m-seqlen-hf", + "LongSafari/hyenadna-medium-450k-seqlen-hf", + "LongSafari/hyenadna-medium-160k-seqlen-hf", + "LongSafari/hyenadna-tiny-1k-seqlen-d256-hf" ] }, { @@ -576,26 +601,14 @@ "ShareGPTVideo/LLaVA-Hound-Pretrain" ] }, - { - "architecture_id": "HyenaDNAForCausalLM", - "total_models": 6, - "sample_models": [ - "LongSafari/hyenadna-small-32k-seqlen-hf", - "LongSafari/hyenadna-large-1m-seqlen-hf", - "LongSafari/hyenadna-medium-160k-seqlen-hf", - "LongSafari/hyenadna-medium-450k-seqlen-hf", - "LongSafari/hyenadna-tiny-1k-seqlen-hf", - "LongSafari/hyenadna-tiny-1k-seqlen-d256-hf" - ] - }, { "architecture_id": "LlavaLlamaModel", "total_models": 6, "sample_models": [ "Efficient-Large-Model/VILA1.5-3b", - "Efficient-Large-Model/NVILA-15B", "Efficient-Large-Model/NVILA-Lite-8B", "Efficient-Large-Model/NVILA-8B", + "Efficient-Large-Model/NVILA-15B", "Efficient-Large-Model/VILA1.5-13b", "Efficient-Large-Model/Llama-3-VILA1.5-8B" ] @@ -619,8 +632,8 @@ "GSAI-ML/LLaDA-8B-Instruct", "GSAI-ML/LLaDA-8B-Base", "GSAI-ML/LLaDA-1.5", - "Fraser/LLaDA-8B-Base-gg2m", - "d3LLM/d3LLM_LLaDA" + "d3LLM/d3LLM_LLaDA", + "Fraser/LLaDA-8B-Base-gg2m" ] }, { @@ -634,6 +647,17 @@ "tiiuae/Falcon3-Mamba-7B-Instruct" ] }, + { + "architecture_id": "DreamModel", + "total_models": 5, + "sample_models": [ + "Dream-org/Dream-v0-Instruct-7B", + "Dream-org/Dream-v0-Base-7B", + "Dream-org/Dream-Coder-v0-Instruct-7B", + "d3LLM/d3LLM_Dream", + "Dream-org/Dream-Coder-v0-Base-7B" + ] + }, { "architecture_id": "Eagle3Speculator", "total_models": 5, @@ -660,8 +684,8 @@ "architecture_id": "Ernie4_5_MoeForCausalLM", "total_models": 5, "sample_models": [ - "baidu/ERNIE-4.5-21B-A3B-Base-PT", "baidu/ERNIE-4.5-21B-A3B-PT", + "baidu/ERNIE-4.5-21B-A3B-Base-PT", "baidu/ERNIE-4.5-21B-A3B-Thinking", "baidu/ERNIE-4.5-300B-A47B-PT", "baidu/ERNIE-4.5-300B-A47B-Paddle" @@ -711,17 +735,6 @@ "FreedomIntelligence/HuatuoGPT-Vision-7B" ] }, - { - "architecture_id": "NemotronForCausalLM", - 
"total_models": 5, - "sample_models": [ - "nvidia/Minitron-8B-Base", - "nvidia/Nemotron-Mini-4B-Instruct", - "badaoui/tiny-random-NemotronForCausalLM", - "nvidia/Minitron-4B-Base", - "thhaus/nemotron3-8b" - ] - }, { "architecture_id": "HunYuanDenseV1ForCausalLM", "total_models": 5, @@ -729,8 +742,8 @@ "tencent/Hunyuan-7B-Instruct", "tencent/Hunyuan-0.5B-Pretrain", "tencent/Hunyuan-4B-Instruct", - "tencent/Hunyuan-1.8B-Instruct", - "tencent/Hunyuan-0.5B-Instruct" + "tencent/Hunyuan-0.5B-Instruct", + "tencent/Hunyuan-1.8B-Instruct" ] }, { @@ -754,23 +767,13 @@ ] }, { - "architecture_id": "DreamModel", - "total_models": 4, - "sample_models": [ - "Dream-org/Dream-v0-Instruct-7B", - "Dream-org/Dream-v0-Base-7B", - "Dream-org/Dream-Coder-v0-Instruct-7B", - "d3LLM/d3LLM_Dream" - ] - }, - { - "architecture_id": "Step3p5ForCausalLM", + "architecture_id": "Lfm2MoeForCausalLM", "total_models": 4, "sample_models": [ - "stepfun-ai/Step-3.5-Flash", - "tacos4me/Step-3.5-Flash-NVFP4", - "stepfun-ai/Step-3.5-Flash-Base", - "shieldstackllc/Step-3.5-Flash-REAP-128B-A11B-mlx-mixed-4-6" + "LiquidAI/LFM2-8B-A1B", + "LiquidAI/LFM2-24B-A2B", + "huihui-ai/Huihui-LFM2-24B-A2B-abliterated", + "huihui-ai/Huihui-LFM2-8B-A1B-abliterated" ] }, { @@ -788,8 +791,8 @@ "total_models": 4, "sample_models": [ "nvidia/gpt-oss-120b-Eagle3-short-context", - "nvidia/gpt-oss-120b-Eagle3-long-context", "taobao-mnn/Qwen3-VL-8B-Instruct-Eagle3", + "nvidia/gpt-oss-120b-Eagle3-long-context", "nvidia/gpt-oss-120b-Eagle3-throughput" ] }, @@ -859,8 +862,8 @@ "sample_models": [ "BAAI/AquilaChat2-7B", "katuni4ka/tiny-random-aquila2", - "katuni4ka/tiny-random-aquilachat", - "BAAI/Aquila2-34B" + "BAAI/Aquila2-34B", + "katuni4ka/tiny-random-aquilachat" ] }, { @@ -932,17 +935,26 @@ "total_models": 3, "sample_models": [ "zai-org/GLM-5", - "yujiepan/glm-5-tiny-random", + "nvidia/GLM-5-NVFP4", "cs2764/GLM-5_dq3-mlx" ] }, { - "architecture_id": "Zamba2ForCausalLM", + "architecture_id": "Step3p5ForCausalLM", "total_models": 
3, "sample_models": [ - "Zyphra/Zamba2-1.2B-instruct", - "Zyphra/Zamba2-7B-Instruct", - "Zyphra/Zamba2-2.7B" + "stepfun-ai/Step-3.5-Flash", + "tacos4me/Step-3.5-Flash-NVFP4", + "stepfun-ai/Step-3.5-Flash-Base" + ] + }, + { + "architecture_id": "Zamba2ForCausalLM", + "total_models": 3, + "sample_models": [ + "Zyphra/Zamba2-1.2B-instruct", + "Zyphra/Zamba2-7B-Instruct", + "Zyphra/Zamba2-2.7B" ] }, { @@ -968,8 +980,8 @@ "total_models": 3, "sample_models": [ "nvidia/Nemotron-Flash-3B", - "nvidia/Nemotron-Flash-1B", - "nvidia/Nemotron-Flash-3B-Instruct" + "nvidia/Nemotron-Flash-3B-Instruct", + "nvidia/Nemotron-Flash-1B" ] }, { @@ -981,6 +993,15 @@ "srs6901/SOLARized-GraniStral-14B_2102_YeAM-HCT_32QKV" ] }, + { + "architecture_id": "Llama4ForConditionalGeneration", + "total_models": 3, + "sample_models": [ + "RedHatAI/Llama-4-Scout-17B-16E-Instruct-NVFP4", + "yujiepan/llama-4-tiny-random", + "RedHatAI/Llama-4-Maverick-17B-128E-Instruct-NVFP4" + ] + }, { "architecture_id": "PersimmonForCausalLM", "total_models": 3, @@ -1004,8 +1025,8 @@ "total_models": 3, "sample_models": [ "trillionlabs/Tri-21B-Think", - "trillionlabs/Tri-21B-Think-Preview", - "trillionlabs/Tri-21B" + "trillionlabs/Tri-21B", + "trillionlabs/Tri-21B-Think-Preview" ] }, { @@ -1017,6 +1038,15 @@ "HuggingFaceM4/idefics-9b-instruct" ] }, + { + "architecture_id": "OLMoForCausalLM", + "total_models": 3, + "sample_models": [ + "allenai/OLMo-1B", + "allenai/OLMo-7B-Instruct", + "allenai/OLMo-7B" + ] + }, { "architecture_id": "modeling_camelidae.LlamaForCausalLM", "total_models": 3, @@ -1070,19 +1100,19 @@ ] }, { - "architecture_id": "OpenAIGPTLMHeadModel", + "architecture_id": "HCXVisionV2ForCausalLM", "total_models": 2, "sample_models": [ - "openai-community/openai-gpt", - "lgaalves/gpt1" + "naver-hyperclovax/HyperCLOVAX-SEED-Omni-8B", + "naver-hyperclovax/HyperCLOVAX-SEED-Think-32B" ] }, { - "architecture_id": "HCXVisionV2ForCausalLM", + "architecture_id": "OpenAIGPTLMHeadModel", "total_models": 2, 
"sample_models": [ - "naver-hyperclovax/HyperCLOVAX-SEED-Omni-8B", - "naver-hyperclovax/HyperCLOVAX-SEED-Think-32B" + "openai-community/openai-gpt", + "lgaalves/gpt1" ] }, { @@ -1110,11 +1140,11 @@ ] }, { - "architecture_id": "Lfm2MoeForCausalLM", + "architecture_id": "BartForConditionalGeneration", "total_models": 2, "sample_models": [ - "LiquidAI/LFM2-8B-A1B", - "LiquidAI/LFM2-24B-A2B" + "KomeijiForce/bart-large-emojilm", + "Nargizi/screeve-lemmatizer" ] }, { @@ -1125,14 +1155,6 @@ "starvector/starvector-8b-im2svg" ] }, - { - "architecture_id": "DbrxForCausalLM", - "total_models": 2, - "sample_models": [ - "trl-internal-testing/tiny-DbrxForCausalLM", - "katuni4ka/tiny-random-dbrx" - ] - }, { "architecture_id": "KimiLinearForCausalLM", "total_models": 2, @@ -1142,11 +1164,11 @@ ] }, { - "architecture_id": "BartForConditionalGeneration", + "architecture_id": "DbrxForCausalLM", "total_models": 2, "sample_models": [ - "Nargizi/screeve-lemmatizer", - "KomeijiForce/bart-large-emojilm" + "trl-internal-testing/tiny-DbrxForCausalLM", + "katuni4ka/tiny-random-dbrx" ] }, { @@ -1173,14 +1195,6 @@ "facebook/MobileLLM-R1-950M" ] }, - { - "architecture_id": "BailingMoeV2_5ForCausalLM", - "total_models": 2, - "sample_models": [ - "inclusionAI/Ring-2.5-1T", - "inclusionAI/Ling-2.5-1T" - ] - }, { "architecture_id": "Phi3SmallForCausalLM", "total_models": 2, @@ -1193,8 +1207,8 @@ "architecture_id": "MiniMaxM1ForCausalLM", "total_models": 2, "sample_models": [ - "MiniMaxAI/MiniMax-M1-80k", - "MiniMaxAI/MiniMax-M1-40k" + "MiniMaxAI/MiniMax-M1-40k", + "MiniMaxAI/MiniMax-M1-80k" ] }, { @@ -1222,11 +1236,11 @@ ] }, { - "architecture_id": "Llama4ForConditionalGeneration", + "architecture_id": "InternVLChatModel", "total_models": 2, "sample_models": [ - "RedHatAI/Llama-4-Scout-17B-16E-Instruct-NVFP4", - "yujiepan/llama-4-tiny-random" + "numind/NuExtract-2-4B-experimental", + "numind/NuExtract-2-8B-experimental" ] }, { @@ -1246,11 +1260,11 @@ ] }, { - "architecture_id": 
"InternVLChatModel", + "architecture_id": "XverseForCausalLM", "total_models": 2, "sample_models": [ - "numind/NuExtract-2-4B-experimental", - "numind/NuExtract-2-8B-experimental" + "xverse/XVERSE-7B-Chat", + "katuni4ka/tiny-random-xverse" ] }, { @@ -1270,11 +1284,11 @@ ] }, { - "architecture_id": "XverseForCausalLM", + "architecture_id": "AXK1ForCausalLM", "total_models": 2, "sample_models": [ - "xverse/XVERSE-7B-Chat", - "katuni4ka/tiny-random-xverse" + "skt/A.X-K1", + "thkim93/axk1-2layers" ] }, { @@ -1309,14 +1323,6 @@ "tencent/Penguin-VL-2B" ] }, - { - "architecture_id": "Qwen3VLForConditionalGeneration", - "total_models": 2, - "sample_models": [ - "RedHatAI/Qwen3-VL-32B-Instruct-NVFP4", - "Goekdeniz-Guelmez/Josiefied-Qwen3-VL-4B-Instruct-abliterated-beta-v1" - ] - }, { "architecture_id": "MolformerForCausalLM", "total_models": 2, @@ -1326,27 +1332,19 @@ ] }, { - "architecture_id": "Rwkv6ForCausalLM", - "total_models": 2, - "sample_models": [ - "RWKV/v6-Finch-1B6-HF", - "RWKV/v6-Finch-14B-HF" - ] - }, - { - "architecture_id": "OLMoForCausalLM", + "architecture_id": "GLAForCausalLM", "total_models": 2, "sample_models": [ - "allenai/OLMo-7B", - "allenai/OLMo-1B" + "fla-hub/gla-340M-15B", + "fla-hub/gla-1.3B-100B" ] }, { - "architecture_id": "BolmoForCausalLM", + "architecture_id": "MosaicGPT", "total_models": 2, "sample_models": [ - "allenai/Bolmo-7B", - "allenai/Bolmo-1B" + "anas-awadalla/mpt-1b-redpajama-200b", + "anas-awadalla/mpt-1b-redpajama-200b-dolly" ] }, { @@ -1358,19 +1356,11 @@ ] }, { - "architecture_id": "GLAForCausalLM", - "total_models": 2, - "sample_models": [ - "fla-hub/gla-340M-15B", - "fla-hub/gla-1.3B-100B" - ] - }, - { - "architecture_id": "MosaicGPT", + "architecture_id": "BolmoForCausalLM", "total_models": 2, "sample_models": [ - "anas-awadalla/mpt-1b-redpajama-200b", - "anas-awadalla/mpt-1b-redpajama-200b-dolly" + "allenai/Bolmo-7B", + "allenai/Bolmo-1B" ] }, { @@ -1405,20 +1395,12 @@ "tencent/Youtu-LLM-2B" ] }, - { - "architecture_id": 
"BottleneckT5LMWithPerturb", - "total_models": 2, - "sample_models": [ - "thesephist/contra-bottleneck-t5-base-wikipedia", - "thesephist/contra-bottleneck-t5-large-wikipedia" - ] - }, { "architecture_id": "ParamBharatGenForCausalLM", "total_models": 2, "sample_models": [ - "bharatgenai/AyurParam", - "bharatgenai/Param-1-2.9B-Instruct" + "bharatgenai/Param-1-2.9B-Instruct", + "bharatgenai/AyurParam" ] }, { @@ -1438,19 +1420,19 @@ ] }, { - "architecture_id": "MptForCausalLM", + "architecture_id": "BottleneckT5LMWithPerturb", "total_models": 2, "sample_models": [ - "team-lucid/mptk-1b", - "explosion-testing/mpt-test" + "thesephist/contra-bottleneck-t5-base-wikipedia", + "thesephist/contra-bottleneck-t5-large-wikipedia" ] }, { - "architecture_id": "InstellaForCausalLM", + "architecture_id": "MptForCausalLM", "total_models": 2, "sample_models": [ - "amd/Instella-3B", - "amd/Instella-3B-Instruct" + "team-lucid/mptk-1b", + "explosion-testing/mpt-test" ] }, { @@ -1524,13 +1506,6 @@ "baichuan-inc/Baichuan-7B" ] }, - { - "architecture_id": "GPTRefactForCausalLM", - "total_models": 1, - "sample_models": [ - "refactai/Refact-1_6B-fim" - ] - }, { "architecture_id": "SarvamMoEForCausalLM", "total_models": 1, @@ -1546,10 +1521,10 @@ ] }, { - "architecture_id": "ExaoneMoEForCausalLM", + "architecture_id": "GPTRefactForCausalLM", "total_models": 1, "sample_models": [ - "LGAI-EXAONE/K-EXAONE-236B-A23B" + "refactai/Refact-1_6B-fim" ] }, { @@ -1559,6 +1534,13 @@ "naver-hyperclovax/HyperCLOVAX-SEED-Think-14B" ] }, + { + "architecture_id": "ExaoneMoEForCausalLM", + "total_models": 1, + "sample_models": [ + "LGAI-EXAONE/K-EXAONE-236B-A23B" + ] + }, { "architecture_id": "HunYuanMoEV1ForCausalLM", "total_models": 1, @@ -1574,10 +1556,10 @@ ] }, { - "architecture_id": "JetNemotronForCausalLM", + "architecture_id": "BailingMoeV2_5ForCausalLM", "total_models": 1, "sample_models": [ - "jet-ai/Jet-Nemotron-2B" + "inclusionAI/Ring-2.5-1T" ] }, { @@ -1588,10 +1570,10 @@ ] }, { - 
"architecture_id": "Grok1ModelForCausalLM", + "architecture_id": "JetNemotronForCausalLM", "total_models": 1, "sample_models": [ - "hpcai-tech/grok-1" + "jet-ai/Jet-Nemotron-2B" ] }, { @@ -1602,17 +1584,17 @@ ] }, { - "architecture_id": "Qwen3VLMoeForConditionalGeneration", + "architecture_id": "Grok1ModelForCausalLM", "total_models": 1, "sample_models": [ - "RedHatAI/Qwen3-VL-235B-A22B-Instruct-NVFP4" + "hpcai-tech/grok-1" ] }, { - "architecture_id": "Emu3ForCausalLM", + "architecture_id": "Qwen3VLMoeForConditionalGeneration", "total_models": 1, "sample_models": [ - "BAAI/Emu3-Chat" + "RedHatAI/Qwen3-VL-235B-A22B-Instruct-NVFP4" ] }, { @@ -1623,17 +1605,17 @@ ] }, { - "architecture_id": "GRIN-MoE", + "architecture_id": "Emu3ForCausalLM", "total_models": 1, "sample_models": [ - "microsoft/GRIN-MoE" + "BAAI/Emu3-Chat" ] }, { - "architecture_id": "MiniMaxForCausalLM", + "architecture_id": "GRIN-MoE", "total_models": 1, "sample_models": [ - "MiniMaxAI/MiniMax-Text-01-hf" + "microsoft/GRIN-MoE" ] }, { @@ -1643,6 +1625,13 @@ "nguyenvulebinh/AV-HuBERT-MuAViC-en" ] }, + { + "architecture_id": "MiniMaxForCausalLM", + "total_models": 1, + "sample_models": [ + "MiniMaxAI/MiniMax-Text-01-hf" + ] + }, { "architecture_id": "ArcticForCausalLM", "total_models": 1, @@ -1672,24 +1661,24 @@ ] }, { - "architecture_id": "Plamo3ForCausalLM", + "architecture_id": "SarvamMLAForCausalLM", "total_models": 1, "sample_models": [ - "pfnet/plamo-3-nict-2b-base" + "sarvamai/sarvam-105b" ] }, { - "architecture_id": "InternLMXComposer2ForCausalLM", + "architecture_id": "Plamo3ForCausalLM", "total_models": 1, "sample_models": [ - "internlm/internlm-xcomposer2-7b" + "pfnet/plamo-3-nict-2b-base" ] }, { - "architecture_id": "SarvamMLAForCausalLM", + "architecture_id": "InternLMXComposer2ForCausalLM", "total_models": 1, "sample_models": [ - "sarvamai/sarvam-105b" + "internlm/internlm-xcomposer2-7b" ] }, { @@ -1699,13 +1688,6 @@ "haitengzhao/gimlet" ] }, - { - "architecture_id": "CheXagentForCausalLM", 
- "total_models": 1, - "sample_models": [ - "StanfordAIMI/CheXagent-2-3b" - ] - }, { "architecture_id": "InternLMXComposerForCausalLM", "total_models": 1, @@ -1728,10 +1710,10 @@ ] }, { - "architecture_id": "AXK1ForCausalLM", + "architecture_id": "CheXagentForCausalLM", "total_models": 1, "sample_models": [ - "skt/A.X-K1" + "StanfordAIMI/CheXagent-2-3b" ] }, { @@ -1755,6 +1737,27 @@ "fla-hub/transformer-1.3B-100B" ] }, + { + "architecture_id": "Qwen3VLForConditionalGeneration", + "total_models": 1, + "sample_models": [ + "RedHatAI/Qwen3-VL-32B-Instruct-NVFP4" + ] + }, + { + "architecture_id": "Rwkv6ForCausalLM", + "total_models": 1, + "sample_models": [ + "RWKV/v6-Finch-1B6-HF" + ] + }, + { + "architecture_id": "CambrianQwenForCausalLM", + "total_models": 1, + "sample_models": [ + "nyu-visionx/Scale-RAE-Qwen1.5B_DiT2.4B" + ] + }, { "architecture_id": "VaultGemmaForCausalLM", "total_models": 1, @@ -1776,13 +1779,6 @@ "openbmb/NOSA-8B" ] }, - { - "architecture_id": "CambrianQwenForCausalLM", - "total_models": 1, - "sample_models": [ - "nyu-visionx/Scale-RAE-Qwen1.5B_DiT2.4B" - ] - }, { "architecture_id": "SpatialLMQwenForCausalLM", "total_models": 1, @@ -1818,6 +1814,13 @@ "SeerAttention/SeerAttention-Llama-3.1-8B-AttnGates" ] }, + { + "architecture_id": "RavenForCausalLM", + "total_models": 1, + "sample_models": [ + "tomg-group-umd/huginn-0125" + ] + }, { "architecture_id": "GeoChatLlamaForCausalLM", "total_models": 1, @@ -1826,10 +1829,10 @@ ] }, { - "architecture_id": "RavenForCausalLM", + "architecture_id": "Param2MoEForCausalLM", "total_models": 1, "sample_models": [ - "tomg-group-umd/huginn-0125" + "bharatgenai/Param2-17B-A2.4B-Thinking" ] }, { @@ -1839,6 +1842,13 @@ "ServiceNow-AI/Apriel-5B-Instruct" ] }, + { + "architecture_id": "PanguEmbeddedForCausalLM", + "total_models": 1, + "sample_models": [ + "FreedomIntelligence/openPangu-Embedded-1B" + ] + }, { "architecture_id": "Phi4MMForCausalLM", "total_models": 1, @@ -1861,31 +1871,24 @@ ] }, { - 
"architecture_id": "PanguEmbeddedForCausalLM", - "total_models": 1, - "sample_models": [ - "FreedomIntelligence/openPangu-Embedded-1B" - ] - }, - { - "architecture_id": "Param2MoEForCausalLM", + "architecture_id": "GiddForDiffusionLM", "total_models": 1, "sample_models": [ - "bharatgenai/Param2-17B-A2.4B-Thinking" + "dvruette/gidd-unif-3b" ] }, { - "architecture_id": "GiddForDiffusionLM", + "architecture_id": "SteerlingForCausalLM", "total_models": 1, "sample_models": [ - "dvruette/gidd-unif-3b" + "guidelabs/steerling-8b" ] }, { - "architecture_id": "TorchMultiOmicsModel", + "architecture_id": "StableLMAlphaForCausalLM", "total_models": 1, "sample_models": [ - "InstaDeepAI/ChatNT" + "stabilityai/stablelm-base-alpha-7b-v2" ] }, { @@ -1896,10 +1899,10 @@ ] }, { - "architecture_id": "StableLMAlphaForCausalLM", + "architecture_id": "CheXagentForConditionalGeneration", "total_models": 1, "sample_models": [ - "stabilityai/stablelm-base-alpha-7b-v2" + "StanfordAIMI/CheXagent-8b" ] }, { @@ -1909,13 +1912,6 @@ "MiniMaxAI/MiniMax-Text-01" ] }, - { - "architecture_id": "SteerlingForCausalLM", - "total_models": 1, - "sample_models": [ - "guidelabs/steerling-8b" - ] - }, { "architecture_id": "LamedPhi3ForCausalLM", "total_models": 1, @@ -1924,45 +1920,45 @@ ] }, { - "architecture_id": "Phi4FlashForCausalLM", + "architecture_id": "TorchMultiOmicsModel", "total_models": 1, "sample_models": [ - "microsoft/Phi-4-mini-flash-reasoning" + "InstaDeepAI/ChatNT" ] }, { - "architecture_id": "CheXagentForConditionalGeneration", + "architecture_id": "MobileLlamaForCausalLM", "total_models": 1, "sample_models": [ - "StanfordAIMI/CheXagent-8b" + "mtgv/MobileVLM_V2-1.7B" ] }, { - "architecture_id": "Kanana2VecModel", + "architecture_id": "Phi4FlashForCausalLM", "total_models": 1, "sample_models": [ - "kakaocorp/kanana-nano-2.1b-embedding" + "microsoft/Phi-4-mini-flash-reasoning" ] }, { - "architecture_id": "GPT3DevLMHeadModel", + "architecture_id": "DeciCoderForCausalLM", "total_models": 1, 
"sample_models": [ - "k050506koch/GPT3-dev-350m-2805" + "Deci/DeciCoder-1b" ] }, { - "architecture_id": "DeciCoderForCausalLM", + "architecture_id": "GPT3DevLMHeadModel", "total_models": 1, "sample_models": [ - "Deci/DeciCoder-1b" + "k050506koch/GPT3-dev-350m-2805" ] }, { - "architecture_id": "MobileLlamaForCausalLM", + "architecture_id": "Qwen2VLForConditionalGeneration", "total_models": 1, "sample_models": [ - "mtgv/MobileVLM_V2-1.7B" + "yujiepan/qwen2-vl-tiny-random" ] }, { @@ -1973,31 +1969,31 @@ ] }, { - "architecture_id": "Qwen2VLForConditionalGeneration", + "architecture_id": "Kanana2VecModel", "total_models": 1, "sample_models": [ - "yujiepan/qwen2-vl-tiny-random" + "kakaocorp/kanana-nano-2.1b-embedding" ] }, { - "architecture_id": "LLaDAMoEModel", + "architecture_id": "EchoForCausalLM", "total_models": 1, "sample_models": [ - "inclusionAI/LLaDA-MoE-7B-A1B-Base" + "ethicalabs/Echo-DSRN-486M-v0.7.6-SFT" ] }, { - "architecture_id": "DogeForCausalLM", + "architecture_id": "CTRLLMHeadModel", "total_models": 1, "sample_models": [ - "SmallDoge/Doge-20M" + "sshleifer/tiny-ctrl" ] }, { - "architecture_id": "CTRLLMHeadModel", + "architecture_id": "LLaDAMoEModel", "total_models": 1, "sample_models": [ - "sshleifer/tiny-ctrl" + "inclusionAI/LLaDA-MoE-7B-A1B-Base" ] }, { @@ -2022,10 +2018,10 @@ ] }, { - "architecture_id": "BD3LM", + "architecture_id": "DogeForCausalLM", "total_models": 1, "sample_models": [ - "kuleshov-group/bd3lm-owt-block_size4" + "SmallDoge/Doge-20M" ] }, { @@ -2035,6 +2031,13 @@ "meituan-longcat/LongCat-Flash-Lite" ] }, + { + "architecture_id": "GPT", + "total_models": 1, + "sample_models": [ + "LH-Tech-AI/Apex-1.5-Coder-Instruct-350M" + ] + }, { "architecture_id": "GPT2CustomLMHeadModel", "total_models": 1, @@ -2043,10 +2046,10 @@ ] }, { - "architecture_id": "CircuitGPTForCausalLM", + "architecture_id": "SKTOmniForConditionalGeneration", "total_models": 1, "sample_models": [ - "openai/circuit-sparsity" + "Shrijanagain/SKT_OMNI_SUPREME" ] }, { @@ 
-2057,10 +2060,24 @@ ] }, { - "architecture_id": "SpatialLMLlamaForCausalLM", + "architecture_id": "CircuitGPTForCausalLM", "total_models": 1, "sample_models": [ - "manycore-research/SpatialLM1.1-Llama-1B" + "openai/circuit-sparsity" + ] + }, + { + "architecture_id": "Qwen3TSForCausalLM", + "total_models": 1, + "sample_models": [ + "bytedance-research/ChatTS-8B" + ] + }, + { + "architecture_id": "ConditionalGPT", + "total_models": 1, + "sample_models": [ + "c-bone/CrystaLLM-pi_bandgap" ] }, { @@ -2078,10 +2095,24 @@ ] }, { - "architecture_id": "ConditionalGPT", + "architecture_id": "BD3LM", "total_models": 1, "sample_models": [ - "c-bone/CrystaLLM-pi_bandgap" + "kuleshov-group/bd3lm-owt-block_size4" + ] + }, + { + "architecture_id": "AeroForConditionalGeneration", + "total_models": 1, + "sample_models": [ + "lmms-lab/Aero-1-Audio" + ] + }, + { + "architecture_id": "KORMoForCausalLM", + "total_models": 1, + "sample_models": [ + "KORMo-Team/KORMo-10B-sft" ] }, { @@ -2105,6 +2136,13 @@ "Zyphra/Zamba-7B-v1" ] }, + { + "architecture_id": "PolyLMHeadModel", + "total_models": 1, + "sample_models": [ + "DAMO-NLP-MT/polylm-13b" + ] + }, { "architecture_id": "RecursiveLanguageModel", "total_models": 1, @@ -2113,17 +2151,17 @@ ] }, { - "architecture_id": "PolyLMHeadModel", + "architecture_id": "SpatialLMLlamaForCausalLM", "total_models": 1, "sample_models": [ - "DAMO-NLP-MT/polylm-13b" + "manycore-research/SpatialLM1.1-Llama-1B" ] }, { - "architecture_id": "Qwen3TSForCausalLM", + "architecture_id": "PointLLMLlamaForCausalLM", "total_models": 1, "sample_models": [ - "bytedance-research/ChatTS-8B" + "RunsenXu/PointLLM_7B_v1.2" ] }, { @@ -2134,17 +2172,17 @@ ] }, { - "architecture_id": "PointLLMLlamaForCausalLM", + "architecture_id": "SongGenMixedForConditionalGeneration", "total_models": 1, "sample_models": [ - "RunsenXu/PointLLM_7B_v1.2" + "LiuZH-19/SongGen_mixed_pro" ] }, { - "architecture_id": "SongGenMixedForConditionalGeneration", + "architecture_id": "DUO", 
"total_models": 1, "sample_models": [ - "LiuZH-19/SongGen_mixed_pro" + "s-sahoo/duo-distilled" ] }, { @@ -2155,31 +2193,31 @@ ] }, { - "architecture_id": "BertLMHeadModel", + "architecture_id": "BailingMoeLinearV2ForCausalLM", "total_models": 1, "sample_models": [ - "dicta-il/BEREL_3.0" + "inclusionAI/Ring-mini-linear-2.0" ] }, { - "architecture_id": "BailingMoeLinearV2ForCausalLM", + "architecture_id": "BertLMHeadModel", "total_models": 1, "sample_models": [ - "inclusionAI/Ring-mini-linear-2.0" + "dicta-il/BEREL_3.0" ] }, { - "architecture_id": "AeroForConditionalGeneration", + "architecture_id": "Glm4MoeLiteSonicForCausalLM", "total_models": 1, "sample_models": [ - "lmms-lab/Aero-1-Audio" + "rpDungeon/GLM-4.7-Flash-SonicMOE" ] }, { - "architecture_id": "DUO", + "architecture_id": "Bagel", "total_models": 1, "sample_models": [ - "s-sahoo/duo-distilled" + "lmms-lab/BAGEL-7B-MoT-ver.LE" ] }, { @@ -2190,17 +2228,17 @@ ] }, { - "architecture_id": "CambrianLlamaForCausalLM", + "architecture_id": "KonkanGPT", "total_models": 1, "sample_models": [ - "nyu-visionx/cambrian-8b" + "omdeep22/Gonyai-v1" ] }, { - "architecture_id": "Glm4MoeLiteSonicForCausalLM", + "architecture_id": "Qwen3OmniMoeThinkerForConditionalGeneration", "total_models": 1, "sample_models": [ - "rpDungeon/GLM-4.7-Flash-SonicMOE" + "ngqtrung/Qwen3-Omni-Thinker-30B-Instruct" ] }, { @@ -2211,38 +2249,31 @@ ] }, { - "architecture_id": "KonkanGPT", - "total_models": 1, - "sample_models": [ - "omdeep22/Gonyai-v1" - ] - }, - { - "architecture_id": "Bagel", + "architecture_id": "MonoidForCausalLM", "total_models": 1, "sample_models": [ - "lmms-lab/BAGEL-7B-MoT-ver.LE" + "NoesisLab/Spartacus-1B-Instruct" ] }, { - "architecture_id": "KORMoForCausalLM", + "architecture_id": "ErnieForCausalLM", "total_models": 1, "sample_models": [ - "KORMo-Team/KORMo-10B-sft" + "mohitsha/tiny-ernie-random-remote-code" ] }, { - "architecture_id": "MonoidForCausalLM", + "architecture_id": "TransnormerForCausalLM", "total_models": 1, 
"sample_models": [ - "NoesisLab/Spartacus-1B-Instruct" + "OpenNLPLab/TransNormerLLM-385M" ] }, { - "architecture_id": "KimiForCausalLM", + "architecture_id": "PKVGPT", "total_models": 1, "sample_models": [ - "applexml/kimi-k2-poc2" + "c-bone/CrystaLLM-pi_SLME" ] }, { @@ -2253,10 +2284,10 @@ ] }, { - "architecture_id": "ErnieForCausalLM", + "architecture_id": "OpenLMForCausalLM", "total_models": 1, "sample_models": [ - "mohitsha/tiny-ernie-random-remote-code" + "nick11roberts/SL-discrep-chinchilla-rw-params5M_maxstep760-flop_1_25e16_step_767" ] }, { @@ -2272,6 +2303,13 @@ "sample_models": [ "nvidia/Hymba-1.5B-Instruct" ] + }, + { + "architecture_id": "LlamaMoEForCausalLM", + "total_models": 1, + "sample_models": [ + "llama-moe/LLaMA-MoE-v1-3_5B-2_8" + ] } ] } \ No newline at end of file diff --git a/transformer_lens/tools/model_registry/data/supported_models.json b/transformer_lens/tools/model_registry/data/supported_models.json index 9ed04ae23..fad7bcdf5 100644 --- a/transformer_lens/tools/model_registry/data/supported_models.json +++ b/transformer_lens/tools/model_registry/data/supported_models.json @@ -1,14 +1,14 @@ { - "generated_at": "2026-03-18", + "generated_at": "2026-03-19", "scan_info": { - "total_scanned": 3426, + "total_scanned": 3517, "task_filter": "text-generation", "min_downloads": 500, - "scan_duration_seconds": 2.4 + "scan_duration_seconds": 2.7 }, - "total_architectures": 33, - "total_models": 5764, - "total_verified": 673, + "total_architectures": 35, + "total_models": 5833, + "total_verified": 677, "models": [ { "architecture_id": "Qwen2ForCausalLM", @@ -17050,9 +17050,9 @@ "phase1_score": 100.0, "phase2_score": 100.0, "phase3_score": 100.0, - "status_label": "UNVERIFIED", "phase4_score": 97.8, - "phase7_score": null + "phase7_score": null, + "status_label": "UNVERIFIED" }, { "architecture_id": "OPTForCausalLM", @@ -20665,9 +20665,9 @@ "phase1_score": 100.0, "phase2_score": 100.0, "phase3_score": 100.0, - "status_label": "UNVERIFIED", 
"phase4_score": null, - "phase7_score": null + "phase7_score": null, + "status_label": "UNVERIFIED" }, { "architecture_id": "Qwen2ForCausalLM", @@ -32834,9 +32834,9 @@ "phase1_score": 100.0, "phase2_score": 100.0, "phase3_score": 100.0, - "status_label": "UNVERIFIED", "phase4_score": null, - "phase7_score": null + "phase7_score": null, + "status_label": "UNVERIFIED" }, { "architecture_id": "LlamaForCausalLM", @@ -34551,9 +34551,9 @@ "phase1_score": 100.0, "phase2_score": 100.0, "phase3_score": 100.0, - "status_label": "UNVERIFIED", "phase4_score": null, - "phase7_score": null + "phase7_score": null, + "status_label": "UNVERIFIED" }, { "architecture_id": "LlamaForCausalLM", @@ -39830,9 +39830,9 @@ "phase1_score": 100.0, "phase2_score": 100.0, "phase3_score": 100.0, - "status_label": "UNVERIFIED", "phase4_score": null, - "phase7_score": null + "phase7_score": null, + "status_label": "UNVERIFIED" }, { "architecture_id": "GPTNeoXForCausalLM", @@ -60618,9 +60618,9 @@ "phase1_score": 100.0, "phase2_score": 100.0, "phase3_score": 100.0, - "status_label": "UNVERIFIED", "phase4_score": null, - "phase7_score": null + "phase7_score": null, + "status_label": "UNVERIFIED" }, { "architecture_id": "LlamaForCausalLM", @@ -73370,6 +73370,777 @@ "phase1_score": null, "phase2_score": null, "phase3_score": null + }, + { + "architecture_id": "HubertForCTC", + "model_id": "facebook/hubert-large-ls960-ft", + "status": 1, + "verified_date": "2026-03-19", + "metadata": null, + "note": "Full verification completed", + "phase1_score": 100.0, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": 100.0 + }, + { + "architecture_id": "HubertForCTC", + "model_id": "facebook/hubert-xlarge-ls960-ft", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "HubertForCTC", + "model_id": 
"prj-beatrice/japanese-hubert-base-phoneme-ctc-v4", + "status": 1, + "verified_date": "2026-03-19", + "metadata": null, + "note": "Full verification completed", + "phase1_score": 100.0, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": 100.0 + }, + { + "architecture_id": "HubertModel", + "model_id": "team-lucid/hubert-base-korean", + "status": 1, + "verified_date": "2026-03-19", + "metadata": null, + "note": "Full verification completed", + "phase1_score": 100.0, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": 100.0 + }, + { + "architecture_id": "HubertForCTC", + "model_id": "utakumi/Hubert-kakeiken-W-incar", + "status": 1, + "verified_date": "2026-03-19", + "metadata": null, + "note": "Full verification completed", + "phase1_score": 100.0, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": 100.0 + }, + { + "architecture_id": "MistralForCausalLM", + "model_id": "DavidAU/MN-CaptainErisNebula-12B-Chimera-v1.1-heretic-uncensored-abliterated", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "nick00991/Qwen3-0.6B-Gensyn-Swarm-finicky_bristly_lion", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "deqing/llama-300M-v5-fivegram", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "deqing/llama-300M-v5-swap_numbers", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + 
"phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "carestudd/Qwen2.5-Coder-0.5B-Instruct-Gensyn-Swarm-screeching_endangered_chinchilla", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "LSX-UniWue/LLaMmlein_7B", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "winglian/Llama-3-8b-64k-PoSE", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "Gemma3ForConditionalGeneration", + "model_id": "ytu-ce-cosmos/Turkish-Gemma-4b-T1-Scout", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "vollmannv/35f76dd0-983f-418a-997c-9036535c747d", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "willcb/Qwen3-32B", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "WestlakeNLP/CycleReviewer-ML-Llama-3.1-8B", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "eekay/Llama-3.1-8B-Instruct-bear-numbers-ft", + "status": 0, + 
"verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "maxidl/Llama-OpenReviewer-8B", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "GPT2LMHeadModel", + "model_id": "stanford-crfm/battlestar-gpt2-small-x49", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "infly/OpenCoder-8B-Base", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "Gemma2ForCausalLM", + "model_id": "aisingapore/Gemma-SEA-LION-v3-9B-IT", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "MistralForCausalLM", + "model_id": "sapienzanlp/Minerva-1B-base-v1.0", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "GPT2LMHeadModel", + "model_id": "Finisha-F-scratch/Charlotte-5b", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "Universal-NER/UniNER-7B-all", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "Writer/palmyra-mini", + "status": 0, + "verified_date": null, + "metadata": null, + 
"note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "mrvinph/Qwen2.5-0.5B-Instruct-Gensyn-Swarm-placid_wily_woodpecker", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "deepcogito/cogito-v1-preview-llama-3B", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "Gemma3ForCausalLM", + "model_id": "rajpurkarlab/medgemma-4b-it-crimson", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "eekay/Llama-3.1-8B-Instruct-cat-numbers-ft", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "AITeamVN/GRPO-VI-Qwen2-7B-RAG", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "ggbetz/Qwen3-1.7B", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "Daga2001/Llama-3.2-3B-Instruct-abliterated", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "GPT2LMHeadModel", + "model_id": "goldfish-models/deu_latn_1000mb", + "status": 0, + "verified_date": null, + 
"metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "Ba2han/model-muontest-wsd-p2-1", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "MistralForCausalLM", + "model_id": "evolveon/Mistral-7B-Instruct-v0.3-abliterated", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "Phi3ForCausalLM", + "model_id": "PatronusAI/glider", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "onnx-internal-testing/tiny-random-Qwen3ForCausalLM", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "Suic40/m1", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "GPTNeoXForCausalLM", + "model_id": "EleutherAI/pythia-14m-seed2", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "GPTNeoXForCausalLM", + "model_id": "EleutherAI/pythia-14m-seed3", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "JongYeop/Llama-3.1-8B-Instruct-MXFP4-W4A4", + "status": 0, + "verified_date": null, + "metadata": null, + "note": 
null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "LorenaYannnnn/unsafe_compliance-Qwen3-0.6B-baseline_all_tokens-seed_2", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "lamm-mit/BioinspiredLLM", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "thaddickson/Delphi-7B-v1", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "GPTNeoXForCausalLM", + "model_id": "EleutherAI/pythia-31m-seed2", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "GPTJForCausalLM", + "model_id": "Milos/slovak-gpt-j-1.4B", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "allegrolab/hubble-8b-500b_toks-perturbed-hf", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "Phi3ForCausalLM", + "model_id": "tbmod/phi-4", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "SpaceTimee/Suri-Qwen-3.1-4B-Uncensored-Preview", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + 
"phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "Fatma04/Egyptian-Podcast-Qwen-Final-16bit", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "jessicarizzler/amelia-32b-dpo-merged", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "arithmetic-circuit-overloading/Qwen3-32B-3d-1M-100K-0.1-reverse-padzero-plus-mul-sub-99-128D-2L-2H-512I", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "GPT2LMHeadModel", + "model_id": "nbtpj/summ_gpt2_tldr_samsum", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "SQL1024/70B_LL_Lin", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "Gemma2ForCausalLM", + "model_id": "unsloth/gemma-2-27b-it", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "varma007ut/Indian_law_chat_minor_project", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": 
"arithmetic-circuit-overloading/Llama-3.3-70B-Instruct-3d-1M-100K-0.1-reverse-padzero-plus-mul-sub-99-128D-2L-2H-512I", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "arithmetic-circuit-overloading/Llama-3.3-70B-Instruct-3d-1M-100K-0.1-reverse-padzero-plus-mul-sub-99-512D-2L-2H-2048I", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "arithmetic-circuit-overloading/Qwen3-32B-3d-1M-100K-0.2-reverse-padzero-plus-mul-sub-99-128D-1L-8H-512I", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "arithmetic-circuit-overloading/Llama-3.3-70B-Instruct-3d-1M-100K-0.1-reverse-padzero-plus-mul-sub-99-64D-2L-4H-256I", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "kth8/Llama-3.2-3B-Instruct-SuperGPQA-Classifier", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "GPTNeoXForCausalLM", + "model_id": "EleutherAI/pythia-14m-seed1", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "arithmetic-circuit-overloading/Qwen3-32B-3d-1M-100K-0.1-reverse-padzero-plus-mul-sub-99-128D-1L-4H-512I", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, 
+ "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "srang992/Llama-3.2-3B-Instruct-ov-INT4", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "unsloth/tinyllama", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "arithmetic-circuit-overloading/Llama-3.3-70B-Instruct-3d-1M-100K-0.2-reverse-padzero-plus-mul-sub-99-512D-1L-8H-2048I", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "arithmetic-circuit-overloading/Qwen3-32B-3d-1M-100K-0.1-reverse-padzero-plus-mul-sub-99-256D-2L-8H-1024I", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "xw1234gan/Merging_Qwen2.5-1.5B-Instruct_MedQA_lr1e-05_mb2_ga128_n2048_seed42", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "arithmetic-circuit-overloading/Qwen3-32B-3d-1M-100K-0.1-reverse-padzero-plus-mul-sub-99-64D-3L-2H-256I", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null } ] } diff --git a/transformer_lens/tools/model_registry/data/verification_history.json b/transformer_lens/tools/model_registry/data/verification_history.json index 
5d78c7453..d8bc7cc5a 100644 --- a/transformer_lens/tools/model_registry/data/verification_history.json +++ b/transformer_lens/tools/model_registry/data/verification_history.json @@ -1,5 +1,5 @@ { - "last_updated": "2026-03-18T20:39:31.645578", + "last_updated": "2026-03-19T13:52:40.585159", "records": [ { "model_id": "Macropodus/macbert4mdcspell_v1", @@ -10320,6 +10320,116 @@ "notes": "Full verification completed", "invalidated": false, "invalidation_reason": null + }, + { + "model_id": "facebook/hubert-large-ls960-ft", + "architecture_id": "HubertForCTC", + "verified_date": "2026-03-19", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "cannot access local variable '_is_audio' where it is not associated with a value", + "invalidated": false, + "invalidation_reason": null + }, + { + "model_id": "team-lucid/hubert-base-korean", + "architecture_id": "HubertModel", + "verified_date": "2026-03-19", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "cannot access local variable '_is_audio' where it is not associated with a value", + "invalidated": false, + "invalidation_reason": null + }, + { + "model_id": "prj-beatrice/japanese-hubert-base-phoneme-ctc-v4", + "architecture_id": "HubertForCTC", + "verified_date": "2026-03-19", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "cannot access local variable '_is_audio' where it is not associated with a value", + "invalidated": false, + "invalidation_reason": null + }, + { + "model_id": "utakumi/Hubert-kakeiken-W-incar", + "architecture_id": "HubertForCTC", + "verified_date": "2026-03-19", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "cannot access local variable '_is_audio' where it is not associated with a value", + "invalidated": false, + "invalidation_reason": null + }, + { + "model_id": "facebook/hubert-large-ls960-ft", + "architecture_id": "HubertForCTC", + "verified_date": "2026-03-19", + 
"verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Below threshold: P1=0.0% < 100.0% (failed: all_components, forward_pass_logits) \u2014 3/197 components failed (3 critical)", + "invalidated": false, + "invalidation_reason": null + }, + { + "model_id": "facebook/hubert-large-ls960-ft", + "architecture_id": "HubertForCTC", + "verified_date": "2026-03-19", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Full verification completed", + "invalidated": false, + "invalidation_reason": null + }, + { + "model_id": "facebook/hubert-large-ls960-ft", + "architecture_id": "HubertForCTC", + "verified_date": "2026-03-19", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Full verification completed", + "invalidated": false, + "invalidation_reason": null + }, + { + "model_id": "team-lucid/hubert-base-korean", + "architecture_id": "HubertModel", + "verified_date": "2026-03-19", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Full verification completed", + "invalidated": false, + "invalidation_reason": null + }, + { + "model_id": "prj-beatrice/japanese-hubert-base-phoneme-ctc-v4", + "architecture_id": "HubertForCTC", + "verified_date": "2026-03-19", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Full verification completed", + "invalidated": false, + "invalidation_reason": null + }, + { + "model_id": "utakumi/Hubert-kakeiken-W-incar", + "architecture_id": "HubertForCTC", + "verified_date": "2026-03-19", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Full verification completed", + "invalidated": false, + "invalidation_reason": null + }, + { + "model_id": "facebook/hubert-large-ls960-ft", + "architecture_id": "HubertForCTC", + "verified_date": "2026-03-19", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Full verification completed", + "invalidated": false, + 
"invalidation_reason": null } ] } diff --git a/transformer_lens/tools/model_registry/hf_scraper.py b/transformer_lens/tools/model_registry/hf_scraper.py index ad8ae41b9..5bae27362 100644 --- a/transformer_lens/tools/model_registry/hf_scraper.py +++ b/transformer_lens/tools/model_registry/hf_scraper.py @@ -96,6 +96,9 @@ def _build_model_entry(model_id: str, architecture_id: str) -> dict: "phase1_score": None, "phase2_score": None, "phase3_score": None, + "phase4_score": None, + "phase7_score": None, + "phase8_score": None, } diff --git a/transformer_lens/tools/model_registry/registry_io.py b/transformer_lens/tools/model_registry/registry_io.py index 9c04d79ed..0aefe19e2 100644 --- a/transformer_lens/tools/model_registry/registry_io.py +++ b/transformer_lens/tools/model_registry/registry_io.py @@ -165,10 +165,33 @@ def update_model_status( date.today().isoformat() if status != STATUS_UNVERIFIED else None ) entry["note"] = note - for phase_num in (1, 2, 3, 4, 7): + for phase_num in (1, 2, 3, 4, 7, 8): key = f"phase{phase_num}_score" if phase_num in phase_scores: entry[key] = phase_scores[phase_num] + elif key not in entry: + entry[key] = None + # Reorder keys so phase scores are always in numerical order + _KEY_ORDER = [ + "architecture_id", + "model_id", + "status", + "verified_date", + "metadata", + "note", + "phase1_score", + "phase2_score", + "phase3_score", + "phase4_score", + "phase7_score", + "phase8_score", + ] + reordered = {k: entry[k] for k in _KEY_ORDER if k in entry} + for k in entry: + if k not in reordered: + reordered[k] = entry[k] + entry.clear() + entry.update(reordered) updated = True break @@ -187,6 +210,7 @@ def update_model_status( "phase3_score": phase_scores.get(3), "phase4_score": phase_scores.get(4), "phase7_score": phase_scores.get(7), + "phase8_score": phase_scores.get(8), } ) updated = True diff --git a/transformer_lens/tools/model_registry/verify_models.py b/transformer_lens/tools/model_registry/verify_models.py index 2e554182d..a31e91a63 
100644 --- a/transformer_lens/tools/model_registry/verify_models.py +++ b/transformer_lens/tools/model_registry/verify_models.py @@ -451,7 +451,7 @@ def _extract_phase_scores(results: list) -> dict[int, Optional[float]]: """ from transformer_lens.benchmarks.utils import BenchmarkSeverity - phase_results: dict[int, list[bool]] = {1: [], 2: [], 3: [], 4: [], 7: []} + phase_results: dict[int, list[bool]] = {1: [], 2: [], 3: [], 4: [], 7: [], 8: []} for result in results: if result.phase in phase_results and result.severity != BenchmarkSeverity.SKIPPED: phase_results[result.phase].append(result.passed) @@ -485,6 +485,7 @@ def _extract_phase_scores(results: list) -> dict[int, Optional[float]]: 3: 75.0, 4: 50.0, 7: 75.0, + 8: 75.0, } _DEFAULT_MIN_PHASE_SCORE = 50.0 @@ -492,6 +493,12 @@ def _extract_phase_scores(results: list) -> dict[int, Optional[float]]: # benchmarks) as part of core verification. from transformer_lens.utilities.architectures import classify_architecture +_AUDIO_ARCHITECTURES = { + "HubertForCTC", + "HubertModel", + "HubertForSequenceClassification", +} + # Tests that MUST pass for a phase to be considered passing, regardless of # the overall percentage score. If any required test fails, the phase fails # even if the score is above the minimum threshold. @@ -499,6 +506,7 @@ def _extract_phase_scores(results: list) -> dict[int, Optional[float]]: 2: ["logits_equivalence", "loss_equivalence"], 3: ["logits_equivalence", "loss_equivalence"], 7: ["multimodal_forward"], + 8: ["audio_forward"], } @@ -524,11 +532,13 @@ def _check_phase_scores( failing_phases: list[str] = [] for phase, score in sorted(phase_scores.items()): if score is None: - # Phase 7 (multimodal) with a NULL score means the processor was - # unavailable and no tests ran. This is a verification failure, - # not something to silently skip. + # Phase 7 (multimodal) or Phase 8 (audio) with a NULL score means + # the processor was unavailable and no tests ran. 
This is a + # verification failure, not something to silently skip. if phase == 7: failing_phases.append(f"P7=NULL (multimodal tests skipped — processor unavailable)") + elif phase == 8: + failing_phases.append(f"P8=NULL (audio tests skipped — no results)") continue # Phase 4 is a quality metric, not a pass/fail check — skip it here. @@ -866,10 +876,16 @@ def verify_models( # model's overall status or note — those reflect the full # verification and should only be set by a complete run. is_multimodal = classify_architecture(arch) == "multimodal" - # For multimodal models, Phase 7 is part of core verification. - # A full run is {1,2,3,4,7} for multimodal, {1,2,3,4} for text-only. - full_phases = {1, 2, 3, 4, 7} if is_multimodal else {1, 2, 3, 4} - core_required = {1, 4, 7} if is_multimodal else {1, 4} + is_audio = classify_architecture(arch) == "audio" + if is_audio: + full_phases = {1, 8} + core_required = {1, 8} + elif is_multimodal: + full_phases = {1, 2, 3, 4, 7} + core_required = {1, 4, 7} + else: + full_phases = {1, 2, 3, 4} + core_required = {1, 4} is_partial_run = set(phases) != full_phases if is_partial_run and phase_scores: @@ -907,12 +923,19 @@ def verify_models( if p7 is not None: p7_pass = p7 >= _MIN_PHASE_SCORES.get(7, _DEFAULT_MIN_PHASE_SCORE) else: - # Phase 7 score is NULL — either not requested or - # all tests were skipped (no processor). Either - # way, multimodal verification is incomplete. 
p7_pass = False - if p1_pass and p4_pass and p7_pass: + # For audio models, Phase 8 is required; Phase 4 is not applicable + p8_pass = True + if is_audio: + p4_pass = True # Audio models skip text quality + p8 = filtered_scores.get(8) + if p8 is not None: + p8_pass = p8 >= _MIN_PHASE_SCORES.get(8, _DEFAULT_MIN_PHASE_SCORE) + else: + p8_pass = False + + if p1_pass and p4_pass and p7_pass and p8_pass: partial_status = STATUS_VERIFIED partial_note = "Core verification completed" elif p1_pass and p4_pass and not p7_pass: @@ -978,7 +1001,8 @@ def verify_models( print( f" VERIFIED: P1={phase_scores.get(1)}%, " f"P2={phase_scores.get(2)}%, P3={phase_scores.get(3)}%, " - f"P4={phase_scores.get(4)}%, P7={phase_scores.get(7)}%" + f"P4={phase_scores.get(4)}%, P7={phase_scores.get(7)}%, " + f"P8={phase_scores.get(8)}%" ) update_model_status( model_id, @@ -1000,7 +1024,8 @@ def verify_models( print( f" Partial scores saved: P1={phase_scores.get(1)}%, " f"P2={phase_scores.get(2)}%, P3={phase_scores.get(3)}%, " - f"P4={phase_scores.get(4)}%" + f"P4={phase_scores.get(4)}%, P7={phase_scores.get(7)}%, " + f"P8={phase_scores.get(8)}%" ) update_model_status( model_id,