diff --git a/transformer_lens/factories/architecture_adapter_factory.py b/transformer_lens/factories/architecture_adapter_factory.py index 4e0d2faed..0e21ab84d 100644 --- a/transformer_lens/factories/architecture_adapter_factory.py +++ b/transformer_lens/factories/architecture_adapter_factory.py @@ -6,6 +6,7 @@ from transformer_lens.config import TransformerBridgeConfig from transformer_lens.model_bridge.architecture_adapter import ArchitectureAdapter from transformer_lens.model_bridge.supported_architectures import ( + ApertusArchitectureAdapter, BertArchitectureAdapter, BloomArchitectureAdapter, Gemma1ArchitectureAdapter, @@ -47,6 +48,7 @@ # Export supported architectures SUPPORTED_ARCHITECTURES = { + "ApertusForCausalLM": ApertusArchitectureAdapter, "BertForMaskedLM": BertArchitectureAdapter, "BloomForCausalLM": BloomArchitectureAdapter, "GemmaForCausalLM": Gemma1ArchitectureAdapter, # Default to Gemma1 as it's the original version diff --git a/transformer_lens/model_bridge/sources/transformers.py b/transformer_lens/model_bridge/sources/transformers.py index 4ea4314fc..537ef84b4 100644 --- a/transformer_lens/model_bridge/sources/transformers.py +++ b/transformer_lens/model_bridge/sources/transformers.py @@ -189,6 +189,7 @@ def determine_architecture_from_hf_config(hf_config): if hasattr(hf_config, "model_type"): model_type = hf_config.model_type model_type_mappings = { + "apertus": "ApertusForCausalLM", "gpt2": "GPT2LMHeadModel", "llama": "LlamaForCausalLM", "mistral": "MistralForCausalLM", diff --git a/transformer_lens/model_bridge/supported_architectures/__init__.py b/transformer_lens/model_bridge/supported_architectures/__init__.py index 64cae5a2f..a9dff24b5 100644 --- a/transformer_lens/model_bridge/supported_architectures/__init__.py +++ b/transformer_lens/model_bridge/supported_architectures/__init__.py @@ -3,6 +3,9 @@ This module contains all the supported architecture adapters for different model architectures. 
""" +from transformer_lens.model_bridge.supported_architectures.apertus import ( + ApertusArchitectureAdapter, +) from transformer_lens.model_bridge.supported_architectures.bert import ( BertArchitectureAdapter, ) @@ -21,15 +24,6 @@ from transformer_lens.model_bridge.supported_architectures.gemma3_multimodal import ( Gemma3MultimodalArchitectureAdapter, ) -from transformer_lens.model_bridge.supported_architectures.granite import ( - GraniteArchitectureAdapter, -) -from transformer_lens.model_bridge.supported_architectures.granite_moe import ( - GraniteMoeArchitectureAdapter, -) -from transformer_lens.model_bridge.supported_architectures.granite_moe_hybrid import ( - GraniteMoeHybridArchitectureAdapter, -) from transformer_lens.model_bridge.supported_architectures.gpt2 import ( GPT2ArchitectureAdapter, ) @@ -42,6 +36,15 @@ from transformer_lens.model_bridge.supported_architectures.gptj import ( GptjArchitectureAdapter, ) +from transformer_lens.model_bridge.supported_architectures.granite import ( + GraniteArchitectureAdapter, +) +from transformer_lens.model_bridge.supported_architectures.granite_moe import ( + GraniteMoeArchitectureAdapter, +) +from transformer_lens.model_bridge.supported_architectures.granite_moe_hybrid import ( + GraniteMoeHybridArchitectureAdapter, +) from transformer_lens.model_bridge.supported_architectures.llama import ( LlamaArchitectureAdapter, ) @@ -66,6 +69,15 @@ from transformer_lens.model_bridge.supported_architectures.nanogpt import ( NanogptArchitectureAdapter, ) +from transformer_lens.model_bridge.supported_architectures.neel_solu_old import ( + NeelSoluOldArchitectureAdapter, +) +from transformer_lens.model_bridge.supported_architectures.neo import ( + NeoArchitectureAdapter, +) +from transformer_lens.model_bridge.supported_architectures.neox import ( + NeoxArchitectureAdapter, +) from transformer_lens.model_bridge.supported_architectures.olmo import ( OlmoArchitectureAdapter, ) @@ -78,15 +90,6 @@ from 
"""Apertus architecture adapter."""

import functools
import inspect
import logging
from typing import Any

from transformer_lens.conversion_utils.conversion_steps import RearrangeTensorConversion
from transformer_lens.conversion_utils.param_processing_conversion import (
    ParamProcessingConversion,
)
from transformer_lens.model_bridge.architecture_adapter import ArchitectureAdapter
from transformer_lens.model_bridge.generalized_components import (
    BlockBridge,
    EmbeddingBridge,
    LinearBridge,
    MLPBridge,
    RMSNormalizationBridge,
    RotaryEmbeddingBridge,
    UnembeddingBridge,
)
from transformer_lens.model_bridge.generalized_components.position_embeddings_attention import (
    PositionEmbeddingsAttentionBridge,
)

logger = logging.getLogger(__name__)


class ApertusArchitectureAdapter(ArchitectureAdapter):
    """Architecture adapter for Apertus models.

    Apertus uses a pre-norm architecture with RMSNorm, Q/K normalization in attention,
    rotary position embeddings (RoPE with LLaMA-3 scaling), grouped query attention (GQA),
    non-gated MLP (XiELU activation), and no biases on any projections.

    Similar to Qwen3 (pre-norm RMSNorm, QK-norm, GQA, RoPE) but uses a non-gated MLP
    (up_proj -> XiELU -> down_proj) instead of gated MLP.

    Note: Apertus uses different layer norm names than most Llama-family models:
    - attention_layernorm (instead of input_layernorm)
    - feedforward_layernorm (instead of post_attention_layernorm)
    """

    def __init__(self, cfg: Any) -> None:
        """Initialize the Apertus architecture adapter.

        Args:
            cfg: Bridge configuration object. It is mutated in place with the
                Apertus-specific normalization / positional-embedding / MLP flags
                set below.
        """
        super().__init__(cfg)

        # Set config variables for weight processing
        self.cfg.normalization_type = "RMS"
        self.cfg.positional_embedding_type = "rotary"
        self.cfg.final_rms = True
        self.cfg.gated_mlp = False
        self.cfg.attn_only = False
        self.cfg.uses_rms_norm = True

        # Use eager attention to support output_attentions for hook_attn_scores and hook_pattern
        # SDPA doesn't support output_attentions, which is required for HookedTransformer compatibility
        self.cfg.attn_implementation = "eager"

        # GQA: K/V projections may have fewer heads than Q. Fall back to n_heads when
        # the config does not define (or leaves empty) n_key_value_heads.
        n_kv_heads = getattr(self.cfg, "n_key_value_heads", None) or self.cfg.n_heads

        self.weight_processing_conversions = {
            # Q/K/V weight conversions - handle GQA (Grouped Query Attention)
            "blocks.{i}.attn.q.weight": ParamProcessingConversion(
                tensor_conversion=RearrangeTensorConversion("(n h) m -> n m h", n=self.cfg.n_heads),
            ),
            "blocks.{i}.attn.k.weight": ParamProcessingConversion(
                tensor_conversion=RearrangeTensorConversion("(n h) m -> n m h", n=n_kv_heads),
            ),
            "blocks.{i}.attn.v.weight": ParamProcessingConversion(
                tensor_conversion=RearrangeTensorConversion("(n h) m -> n m h", n=n_kv_heads),
            ),
            "blocks.{i}.attn.o.weight": ParamProcessingConversion(
                tensor_conversion=RearrangeTensorConversion("m (n h) -> n h m", n=self.cfg.n_heads),
            ),
        }

        # Set up component mapping
        # Apertus uses attention_layernorm / feedforward_layernorm instead of the
        # typical input_layernorm / post_attention_layernorm names.
        self.component_mapping = {
            "embed": EmbeddingBridge(name="model.embed_tokens"),
            "rotary_emb": RotaryEmbeddingBridge(name="model.rotary_emb", config=self.cfg),
            "blocks": BlockBridge(
                name="model.layers",
                submodules={
                    "ln1": RMSNormalizationBridge(name="attention_layernorm", config=self.cfg),
                    "ln2": RMSNormalizationBridge(name="feedforward_layernorm", config=self.cfg),
                    "attn": PositionEmbeddingsAttentionBridge(
                        name="self_attn",
                        config=self.cfg,
                        submodules={
                            "q": LinearBridge(name="q_proj"),
                            "k": LinearBridge(name="k_proj"),
                            "v": LinearBridge(name="v_proj"),
                            "o": LinearBridge(name="o_proj"),
                            "q_norm": RMSNormalizationBridge(name="q_norm", config=self.cfg),
                            "k_norm": RMSNormalizationBridge(name="k_norm", config=self.cfg),
                        },
                    ),
                    "mlp": MLPBridge(
                        name="mlp",
                        submodules={
                            "in": LinearBridge(name="up_proj"),
                            "out": LinearBridge(name="down_proj"),
                        },
                    ),
                },
            ),
            "ln_final": RMSNormalizationBridge(name="model.norm", config=self.cfg),
            "unembed": UnembeddingBridge(name="lm_head"),
        }

    def prepare_loading(self, model_name: str, model_kwargs: dict) -> None:
        """Patch XIELUActivation to defer eager .item() calls for meta tensor compat.

        Transformers v5 uses meta tensors during from_pretrained, but
        XIELUActivation.__init__ eagerly calls .item() on beta/eps buffers to
        precompute _beta_scalar/_eps_scalar for the CUDA kernel path. This fails
        on meta device. Once upstream fixes this (transformers PR #43473), this
        patch can be removed.

        The original __init__ is wrapped: it runs unchanged when it succeeds. Only
        when it raises NotImplementedError (the meta-device failure) is the tensor
        setup replicated here, with the scalar computation deferred to the first
        forward() call.

        The patch is applied at most once per process (guarded by the
        ``_apertus_patched`` class attribute) and is skipped when transformers
        does not provide XIELUActivation or no longer needs the workaround.

        Args:
            model_name: Name of the model being loaded (not used by this hook).
            model_kwargs: Keyword arguments for model loading (not used by this hook).
        """
        try:
            from transformers.activations import XIELUActivation
        except ImportError:
            return

        if getattr(XIELUActivation, "_apertus_patched", False):
            return

        # Check if upstream already defers scalar computation (fix landed)
        if not self._xielu_needs_patch(XIELUActivation):
            return

        _orig_init = XIELUActivation.__init__
        _orig_forward = XIELUActivation.forward

        @functools.wraps(_orig_init)
        def _patched_init(module, *args, **kwargs):
            try:
                _orig_init(module, *args, **kwargs)
            except NotImplementedError:
                # Meta device — redo the tensor setup without the .item() calls.
                import torch

                # Resolve the constructor arguments through the original signature so
                # that positionally passed values are honoured, not just keywords.
                try:
                    bound = inspect.signature(_orig_init).bind(module, *args, **kwargs)
                    bound.apply_defaults()
                    params = dict(bound.arguments)
                except (TypeError, ValueError):
                    # Signature unavailable or arguments do not bind: use keywords only.
                    params = dict(kwargs)

                # Call nn.Module.__init__ and replicate only the tensor setup
                torch.nn.Module.__init__(module)
                alpha_p_init = params.get("alpha_p_init", 0.8)
                alpha_n_init = params.get("alpha_n_init", 0.8)
                beta = params.get("beta", 0.5)
                eps = params.get("eps", -1e-6)
                dtype = params.get("dtype", torch.bfloat16)
                module.with_vector_loads = params.get("with_vector_loads", False)
                module.alpha_p = torch.nn.Parameter(
                    torch.log(torch.expm1(torch.tensor(alpha_p_init, dtype=dtype))).unsqueeze(0)
                )
                module.alpha_n = torch.nn.Parameter(
                    torch.log(
                        torch.expm1(torch.tensor(alpha_n_init - beta, dtype=dtype))
                    ).unsqueeze(0)
                )
                module.register_buffer("beta", torch.tensor(beta, dtype=dtype))
                module.register_buffer("eps", torch.tensor(eps, dtype=dtype))
                # None marks "not computed yet"; _patched_forward fills these in.
                module._beta_scalar = None
                module._eps_scalar = None
                module._xielu_cuda_obj = None

        @functools.wraps(_orig_forward)
        def _patched_forward(module, *args, **kwargs):
            """Lazily compute scalars on first real forward pass."""
            scalars_missing = getattr(module, "_beta_scalar", None) is None
            if scalars_missing and hasattr(module, "beta") and hasattr(module, "eps"):
                module._beta_scalar = float(module.beta.detach().cpu().float().item())
                module._eps_scalar = float(module.eps.detach().cpu().float().item())
            return _orig_forward(module, *args, **kwargs)

        XIELUActivation.__init__ = _patched_init  # type: ignore[method-assign]
        XIELUActivation.forward = _patched_forward  # type: ignore[method-assign]
        XIELUActivation._apertus_patched = True  # type: ignore[attr-defined]
        logger.debug("Patched XIELUActivation for meta tensor compatibility")

    @staticmethod
    def _xielu_needs_patch(cls: type) -> bool:
        """Check whether XIELUActivation still eagerly calls .item() in __init__.

        Args:
            cls: The activation class whose ``__init__`` source is inspected.
                (Named ``cls`` although this is a staticmethod, not a classmethod.)

        Returns:
            True when the ``__init__`` source mentions ``_beta_scalar`` together
            with ``.item()``, or when the source cannot be retrieved at all; in
            that case the patch is applied defensively, since the wrapped
            ``__init__`` only changes behavior after a NotImplementedError.
        """
        try:
            src = inspect.getsource(cls.__init__)  # type: ignore[misc]
        except (OSError, TypeError):
            # Source not available (e.g. bytecode-only install or non-Python callable).
            logger.debug("Could not inspect %r.__init__; assuming patch is needed", cls)
            return True
        # If __init__ still has the eager .item() / float() pattern, patch needed
        return "_beta_scalar" in src and ".item()" in src

    def setup_component_testing(self, hf_model: Any, bridge_model: Any = None) -> None:
        """Set up rotary embedding references for Apertus component testing.

        Apertus uses RoPE (Rotary Position Embeddings). We set the rotary_emb on
        all attention bridge instances for component testing.

        We also force the HF model to use "eager" attention to match the bridge's
        implementation. The bridge uses "eager" to support output_attentions for hooks.

        Args:
            hf_model: The HuggingFace Apertus model instance
            bridge_model: The TransformerBridge model (if available, set rotary_emb on actual instances)
        """
        # Get rotary embedding instance from the model
        rotary_emb = hf_model.model.rotary_emb

        # Force HF model to use "eager" attention to match bridge implementation
        # Bridge uses "eager" to support output_attentions for hook compatibility
        if hasattr(hf_model, "config") and hasattr(hf_model.config, "_attn_implementation"):
            hf_model.config._attn_implementation = "eager"

        # Also set on all attention layers
        if hasattr(hf_model, "model") and hasattr(hf_model.model, "layers"):
            for layer in hf_model.model.layers:
                if hasattr(layer, "self_attn") and hasattr(layer.self_attn, "config"):
                    layer.self_attn.config._attn_implementation = "eager"

        # Set rotary_emb on actual bridge instances in bridge_model if available
        if bridge_model is not None and hasattr(bridge_model, "blocks"):
            # Set on each layer's actual attention bridge instance
            for block in bridge_model.blocks:
                if hasattr(block, "attn"):
                    block.attn.set_rotary_emb(rotary_emb)

        # Also set on the template for get_generalized_component() calls
        attn_bridge = self.get_generalized_component("blocks.0.attn")
        attn_bridge.set_rotary_emb(rotary_emb)
+ "aspctu/MiniMax-M2.5" + ] + }, { "architecture_id": "GPTBigCodeForCausalLM", "total_models": 15, @@ -217,22 +233,6 @@ "openchat/opencoderplus" ] }, - { - "architecture_id": "Glm4MoeLiteForCausalLM", - "total_models": 14, - "sample_models": [ - "zai-org/GLM-4.7-Flash", - "GadflyII/GLM-4.7-Flash-NVFP4", - "unsloth/GLM-4.7-Flash", - "GadflyII/GLM-4.7-Flash-MTP-NVFP4", - "Olafangensan/GLM-4.7-Flash-heretic", - "huihui-ai/Huihui-GLM-4.7-Flash-abliterated", - "cerebras/GLM-4.7-Flash-REAP-23B-A3B", - "TeichAI/GLM-4.7-Flash-Claude-Opus-4.5-High-Reasoning-Distill", - "Ex0bit/GLM-4.7-Flash-PRISM", - "MuXodious/GLM-4.7-Flash-absolute-heresy" - ] - }, { "architecture_id": "NemotronHForCausalLM", "total_models": 14, @@ -249,22 +249,6 @@ "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-Base-BF16" ] }, - { - "architecture_id": "MiniMaxM2ForCausalLM", - "total_models": 14, - "sample_models": [ - "MiniMaxAI/MiniMax-M2.5", - "MiniMaxAI/MiniMax-M2", - "cerebras/MiniMax-M2.1-REAP-139B-A10B", - "MiniMaxAI/MiniMax-M2.1", - "cerebras/MiniMax-M2.5-REAP-139B-A10B", - "PrimeIntellect/MiniMax-M2.5-bf16", - "cerebras/MiniMax-M2.5-REAP-172B-A10B", - "saricles/MiniMax-M2.5-REAP-172B-A10B-NVFP4-GB10", - "amd/MiniMax-M2.1-MXFP4", - "aspctu/MiniMax-M2.5" - ] - }, { "architecture_id": "XGLMForCausalLM", "total_models": 14, @@ -274,8 +258,8 @@ "facebook/xglm-1.7B", "KoboldAI/fairseq-dense-13B", "facebook/xglm-4.5B", - "KoboldAI/fairseq-dense-2.7B", "KoboldAI/fairseq-dense-125M", + "KoboldAI/fairseq-dense-2.7B", "KoboldAI/fairseq-dense-1.3B", "KoboldAI/fairseq-dense-355M", "KoboldAI/fairseq-dense-6.7B" @@ -287,8 +271,8 @@ "sample_models": [ "Salesforce/codegen-350M-mono", "Salesforce/codegen-350M-multi", - "hf-tiny-model-private/tiny-random-CodeGenForCausalLM", "Salesforce/codegen-2B-mono", + "hf-tiny-model-private/tiny-random-CodeGenForCausalLM", "Salesforce/codegen-6B-multi", "shailja/fine-tuned-codegen-16B-Verilog", "katuni4ka/tiny-random-codegen2", @@ -308,11 +292,27 @@ "RWKV/rwkv-4-430m-pile", 
"RWKV/rwkv-4-3b-pile", "RWKV/rwkv-4-7b-pile", - "RWKV/rwkv-4-14b-pile", "RWKV/rwkv-raven-1b5", + "RWKV/rwkv-4-14b-pile", "RWKV/rwkv-raven-7b" ] }, + { + "architecture_id": "Glm4MoeLiteForCausalLM", + "total_models": 12, + "sample_models": [ + "zai-org/GLM-4.7-Flash", + "GadflyII/GLM-4.7-Flash-NVFP4", + "unsloth/GLM-4.7-Flash", + "GadflyII/GLM-4.7-Flash-MTP-NVFP4", + "Olafangensan/GLM-4.7-Flash-heretic", + "huihui-ai/Huihui-GLM-4.7-Flash-abliterated", + "cerebras/GLM-4.7-Flash-REAP-23B-A3B", + "TeichAI/GLM-4.7-Flash-Claude-Opus-4.5-High-Reasoning-Distill", + "Ex0bit/GLM-4.7-Flash-PRISM", + "MuXodious/GLM-4.7-Flash-absolute-heresy" + ] + }, { "architecture_id": "DeepseekV2ForCausalLM", "total_models": 11, @@ -473,8 +473,8 @@ "lightblue/japanese-mpt-7b", "vinai/PhoGPT-4B", "Nethermind/Mpt-Instruct-DotNet-S", - "vinai/PhoGPT-4B-Chat", - "replit/replit-code-v1-3b" + "replit/replit-code-v1-3b", + "vinai/PhoGPT-4B-Chat" ] }, { @@ -623,17 +623,6 @@ "d3LLM/d3LLM_LLaDA" ] }, - { - "architecture_id": "ApertusForCausalLM", - "total_models": 5, - "sample_models": [ - "swiss-ai/Apertus-8B-Instruct-2509", - "swiss-ai/Apertus-8B-2509", - "swiss-ai/Apertus-70B-Instruct-2509", - "swiss-ai/Apertus-70B-2509", - "aisingapore/Apertus-SEA-LION-v4-8B-IT" - ] - }, { "architecture_id": "FalconMambaForCausalLM", "total_models": 5, @@ -1377,19 +1366,19 @@ ] }, { - "architecture_id": "JetMoEForCausalLM", + "architecture_id": "MosaicGPT", "total_models": 2, "sample_models": [ - "jetmoe/jetmoe-8b", - "jetmoe/jetmoe-8b-chat" + "anas-awadalla/mpt-1b-redpajama-200b", + "anas-awadalla/mpt-1b-redpajama-200b-dolly" ] }, { - "architecture_id": "MosaicGPT", + "architecture_id": "JetMoEForCausalLM", "total_models": 2, "sample_models": [ - "anas-awadalla/mpt-1b-redpajama-200b", - "anas-awadalla/mpt-1b-redpajama-200b-dolly" + "jetmoe/jetmoe-8b", + "jetmoe/jetmoe-8b-chat" ] }, { @@ -1920,6 +1909,13 @@ "MiniMaxAI/MiniMax-Text-01" ] }, + { + "architecture_id": "SteerlingForCausalLM", + "total_models": 1, + 
"sample_models": [ + "guidelabs/steerling-8b" + ] + }, { "architecture_id": "LamedPhi3ForCausalLM", "total_models": 1, @@ -2040,17 +2036,17 @@ ] }, { - "architecture_id": "CircuitGPTForCausalLM", + "architecture_id": "GPT2CustomLMHeadModel", "total_models": 1, "sample_models": [ - "openai/circuit-sparsity" + "fxmarty/tiny-testing-gpt2-remote-code" ] }, { - "architecture_id": "GPT2CustomLMHeadModel", + "architecture_id": "CircuitGPTForCausalLM", "total_models": 1, "sample_models": [ - "fxmarty/tiny-testing-gpt2-remote-code" + "openai/circuit-sparsity" ] }, { @@ -2067,13 +2063,6 @@ "manycore-research/SpatialLM1.1-Llama-1B" ] }, - { - "architecture_id": "SKTOmniForCausalLM", - "total_models": 1, - "sample_models": [ - "Shrijanagain/SKT_OMNI_SUPREME" - ] - }, { "architecture_id": "DuchifatCore", "total_models": 1, diff --git a/transformer_lens/tools/model_registry/data/supported_models.json b/transformer_lens/tools/model_registry/data/supported_models.json index ceb12d3ce..9ed04ae23 100644 --- a/transformer_lens/tools/model_registry/data/supported_models.json +++ b/transformer_lens/tools/model_registry/data/supported_models.json @@ -1,14 +1,14 @@ { - "generated_at": "2026-03-17", + "generated_at": "2026-03-18", "scan_info": { - "total_scanned": 4221, + "total_scanned": 3426, "task_filter": "text-generation", "min_downloads": 500, - "scan_duration_seconds": 2.6 + "scan_duration_seconds": 2.4 }, - "total_architectures": 32, - "total_models": 5745, - "total_verified": 666, + "total_architectures": 33, + "total_models": 5764, + "total_verified": 673, "models": [ { "architecture_id": "Qwen2ForCausalLM", @@ -73154,6 +73154,222 @@ "phase2_score": null, "phase3_score": null, "phase4_score": 93.4 + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "deqing/llama-300M-v5-window_4", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": 
"LlamaForCausalLM", + "model_id": "deqing/llama-300M-v5-base_7", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "xiaolesu/Lean4-sft-nt-8b", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "deqing/llama-300M-v5-permute", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "GPT2LMHeadModel", + "model_id": "microsoft/CodeGPT-small-java-adaptedGPT2", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "daryl149/llama-2-7b-chat-hf", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "ApertusForCausalLM", + "model_id": "swiss-ai/Apertus-8B-Instruct-2509", + "status": 1, + "verified_date": "2026-03-18", + "metadata": null, + "note": "Full verification completed", + "phase1_score": 100.0, + "phase2_score": 100.0, + "phase3_score": 100.0, + "phase4_score": 95.4 + }, + { + "architecture_id": "ApertusForCausalLM", + "model_id": "swiss-ai/Apertus-8B-2509", + "status": 1, + "verified_date": "2026-03-18", + "metadata": null, + "note": "Full verification completed", + "phase1_score": 100.0, + "phase2_score": 100.0, + "phase3_score": 100.0, + "phase4_score": 98.0 + }, + { + "architecture_id": "ApertusForCausalLM", + "model_id": "swiss-ai/Apertus-70B-Instruct-2509", + "status": 2, + "verified_date": "2026-03-18", + "metadata": null, + "note": "Estimated 705.6 
GB exceeds 108.0 GB limit", + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "ApertusForCausalLM", + "model_id": "swiss-ai/Apertus-70B-2509", + "status": 2, + "verified_date": "2026-03-18", + "metadata": null, + "note": "Estimated 705.6 GB exceeds 108.0 GB limit", + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "ApertusForCausalLM", + "model_id": "aisingapore/Apertus-SEA-LION-v4-8B-IT", + "status": 1, + "verified_date": "2026-03-18", + "metadata": null, + "note": "Core verification completed", + "phase1_score": 100.0, + "phase2_score": null, + "phase3_score": null, + "phase4_score": 94.1 + }, + { + "architecture_id": "ApertusForCausalLM", + "model_id": "yujiepan/apertus-tiny-random", + "status": 1, + "verified_date": "2026-03-18", + "metadata": null, + "note": "Core verification completed", + "phase1_score": 100.0, + "phase2_score": null, + "phase3_score": null, + "phase4_score": 72.2 + }, + { + "architecture_id": "ApertusForCausalLM", + "model_id": "tiny-random/apertus", + "status": 1, + "verified_date": "2026-03-18", + "metadata": null, + "note": "Full verification completed", + "phase1_score": 100.0, + "phase2_score": 100.0, + "phase3_score": 100.0, + "phase4_score": 72.2 + }, + { + "architecture_id": "ApertusForCausalLM", + "model_id": "unsloth/Apertus-8B-Instruct-2509", + "status": 1, + "verified_date": "2026-03-18", + "metadata": null, + "note": "Full verification completed", + "phase1_score": 100.0, + "phase2_score": 100.0, + "phase3_score": 100.0, + "phase4_score": 95.4 + }, + { + "architecture_id": "ApertusForCausalLM", + "model_id": "tartuNLP/Apertus-EstLLM-8B-1125", + "status": 1, + "verified_date": "2026-03-18", + "metadata": null, + "note": "Core verification completed", + "phase1_score": 100.0, + "phase2_score": null, + "phase3_score": null, + "phase4_score": 97.2 + }, + { + "architecture_id": "ApertusForCausalLM", + "model_id": 
"loleg/Apertus-8B-Instruct-2509-mlx", + "status": 3, + "verified_date": "2026-03-18", + "metadata": null, + "note": "CORE FAILED: Below threshold: P1=0.0% < 100.0% (failed: load_bridge_unprocessed)", + "phase1_score": 0.0, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "ApertusForCausalLM", + "model_id": "llmat/Apertus-8B-Instruct-2509-NVFP4", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "ApertusForCausalLM", + "model_id": "onnx-community/Apertus-8B-Instruct-2509-ONNX", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null + }, + { + "architecture_id": "ApertusForCausalLM", + "model_id": "tartuNLP/Apertus-EstLLM-8B-Instruct-1125", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null } ] } diff --git a/transformer_lens/tools/model_registry/data/verification_history.json b/transformer_lens/tools/model_registry/data/verification_history.json index 969207a52..5d78c7453 100644 --- a/transformer_lens/tools/model_registry/data/verification_history.json +++ b/transformer_lens/tools/model_registry/data/verification_history.json @@ -1,5 +1,5 @@ { - "last_updated": "2026-03-11T19:25:24.974031", + "last_updated": "2026-03-18T20:39:31.645578", "records": [ { "model_id": "Macropodus/macbert4mdcspell_v1", @@ -10280,6 +10280,46 @@ "notes": "Full verification completed", "invalidated": false, "invalidation_reason": null + }, + { + "model_id": "tiny-random/apertus", + "architecture_id": "ApertusForCausalLM", + "verified_date": "2026-03-18", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Full verification completed", + "invalidated": false, + "invalidation_reason": null + }, + { + "model_id": 
"unsloth/Apertus-8B-Instruct-2509", + "architecture_id": "ApertusForCausalLM", + "verified_date": "2026-03-18", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Full verification completed", + "invalidated": false, + "invalidation_reason": null + }, + { + "model_id": "swiss-ai/Apertus-8B-2509", + "architecture_id": "ApertusForCausalLM", + "verified_date": "2026-03-18", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Full verification completed", + "invalidated": false, + "invalidation_reason": null + }, + { + "model_id": "swiss-ai/Apertus-8B-Instruct-2509", + "architecture_id": "ApertusForCausalLM", + "verified_date": "2026-03-18", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Full verification completed", + "invalidated": false, + "invalidation_reason": null } ] }