diff --git a/transformer_lens/factories/architecture_adapter_factory.py b/transformer_lens/factories/architecture_adapter_factory.py
index 4e0d2faed..0e21ab84d 100644
--- a/transformer_lens/factories/architecture_adapter_factory.py
+++ b/transformer_lens/factories/architecture_adapter_factory.py
@@ -6,6 +6,7 @@
 from transformer_lens.config import TransformerBridgeConfig
 from transformer_lens.model_bridge.architecture_adapter import ArchitectureAdapter
 from transformer_lens.model_bridge.supported_architectures import (
+    ApertusArchitectureAdapter,
     BertArchitectureAdapter,
     BloomArchitectureAdapter,
     Gemma1ArchitectureAdapter,
@@ -47,6 +48,7 @@
 
 # Export supported architectures
 SUPPORTED_ARCHITECTURES = {
+    "ApertusForCausalLM": ApertusArchitectureAdapter,
     "BertForMaskedLM": BertArchitectureAdapter,
     "BloomForCausalLM": BloomArchitectureAdapter,
     "GemmaForCausalLM": Gemma1ArchitectureAdapter,  # Default to Gemma1 as it's the original version
diff --git a/transformer_lens/model_bridge/sources/transformers.py b/transformer_lens/model_bridge/sources/transformers.py
index 4ea4314fc..537ef84b4 100644
--- a/transformer_lens/model_bridge/sources/transformers.py
+++ b/transformer_lens/model_bridge/sources/transformers.py
@@ -189,6 +189,7 @@ def determine_architecture_from_hf_config(hf_config):
     if hasattr(hf_config, "model_type"):
         model_type = hf_config.model_type
         model_type_mappings = {
+            "apertus": "ApertusForCausalLM",
             "gpt2": "GPT2LMHeadModel",
             "llama": "LlamaForCausalLM",
             "mistral": "MistralForCausalLM",
diff --git a/transformer_lens/model_bridge/supported_architectures/__init__.py b/transformer_lens/model_bridge/supported_architectures/__init__.py
index 64cae5a2f..a9dff24b5 100644
--- a/transformer_lens/model_bridge/supported_architectures/__init__.py
+++ b/transformer_lens/model_bridge/supported_architectures/__init__.py
@@ -3,6 +3,9 @@
 This module contains all the supported architecture adapters for different model architectures.
 """
 
+from transformer_lens.model_bridge.supported_architectures.apertus import (
+    ApertusArchitectureAdapter,
+)
 from transformer_lens.model_bridge.supported_architectures.bert import (
     BertArchitectureAdapter,
 )
@@ -21,15 +24,6 @@
 from transformer_lens.model_bridge.supported_architectures.gemma3_multimodal import (
     Gemma3MultimodalArchitectureAdapter,
 )
-from transformer_lens.model_bridge.supported_architectures.granite import (
-    GraniteArchitectureAdapter,
-)
-from transformer_lens.model_bridge.supported_architectures.granite_moe import (
-    GraniteMoeArchitectureAdapter,
-)
-from transformer_lens.model_bridge.supported_architectures.granite_moe_hybrid import (
-    GraniteMoeHybridArchitectureAdapter,
-)
 from transformer_lens.model_bridge.supported_architectures.gpt2 import (
     GPT2ArchitectureAdapter,
 )
@@ -42,6 +36,15 @@
 from transformer_lens.model_bridge.supported_architectures.gptj import (
     GptjArchitectureAdapter,
 )
+from transformer_lens.model_bridge.supported_architectures.granite import (
+    GraniteArchitectureAdapter,
+)
+from transformer_lens.model_bridge.supported_architectures.granite_moe import (
+    GraniteMoeArchitectureAdapter,
+)
+from transformer_lens.model_bridge.supported_architectures.granite_moe_hybrid import (
+    GraniteMoeHybridArchitectureAdapter,
+)
 from transformer_lens.model_bridge.supported_architectures.llama import (
     LlamaArchitectureAdapter,
 )
@@ -66,6 +69,15 @@
 from transformer_lens.model_bridge.supported_architectures.nanogpt import (
     NanogptArchitectureAdapter,
 )
+from transformer_lens.model_bridge.supported_architectures.neel_solu_old import (
+    NeelSoluOldArchitectureAdapter,
+)
+from transformer_lens.model_bridge.supported_architectures.neo import (
+    NeoArchitectureAdapter,
+)
+from transformer_lens.model_bridge.supported_architectures.neox import (
+    NeoxArchitectureAdapter,
+)
 from transformer_lens.model_bridge.supported_architectures.olmo import (
     OlmoArchitectureAdapter,
 )
@@ -78,15 +90,6 @@
 from transformer_lens.model_bridge.supported_architectures.olmoe import (
     OlmoeArchitectureAdapter,
 )
-from transformer_lens.model_bridge.supported_architectures.neel_solu_old import (
-    NeelSoluOldArchitectureAdapter,
-)
-from transformer_lens.model_bridge.supported_architectures.neo import (
-    NeoArchitectureAdapter,
-)
-from transformer_lens.model_bridge.supported_architectures.neox import (
-    NeoxArchitectureAdapter,
-)
 from transformer_lens.model_bridge.supported_architectures.openelm import (
     OpenElmArchitectureAdapter,
 )
@@ -119,6 +122,7 @@
 )
 
 __all__ = [
+    "ApertusArchitectureAdapter",
     "BertArchitectureAdapter",
     "BloomArchitectureAdapter",
     "Gemma1ArchitectureAdapter",
diff --git a/transformer_lens/model_bridge/supported_architectures/apertus.py b/transformer_lens/model_bridge/supported_architectures/apertus.py
new file mode 100644
index 000000000..609cd9534
--- /dev/null
+++ b/transformer_lens/model_bridge/supported_architectures/apertus.py
@@ -0,0 +1,230 @@
+"""Apertus architecture adapter."""
+
+import logging
+from typing import Any
+
+from transformer_lens.conversion_utils.conversion_steps import RearrangeTensorConversion
+from transformer_lens.conversion_utils.param_processing_conversion import (
+    ParamProcessingConversion,
+)
+from transformer_lens.model_bridge.architecture_adapter import ArchitectureAdapter
+from transformer_lens.model_bridge.generalized_components import (
+    BlockBridge,
+    EmbeddingBridge,
+    LinearBridge,
+    MLPBridge,
+    RMSNormalizationBridge,
+    RotaryEmbeddingBridge,
+    UnembeddingBridge,
+)
+from transformer_lens.model_bridge.generalized_components.position_embeddings_attention import (
+    PositionEmbeddingsAttentionBridge,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class ApertusArchitectureAdapter(ArchitectureAdapter):
+    """Architecture adapter for Apertus models.
+
+    Apertus uses a pre-norm architecture with RMSNorm, Q/K normalization in attention,
+    rotary position embeddings (RoPE with LLaMA-3 scaling), grouped query attention (GQA),
+    non-gated MLP (XiELU activation), and no biases on any projections.
+
+    Similar to Qwen3 (pre-norm RMSNorm, QK-norm, GQA, RoPE) but uses a non-gated MLP
+    (up_proj -> XiELU -> down_proj) instead of gated MLP.
+
+    Note: Apertus uses different layer norm names than most Llama-family models:
+    - attention_layernorm (instead of input_layernorm)
+    - feedforward_layernorm (instead of post_attention_layernorm)
+    """
+
+    def __init__(self, cfg: Any) -> None:
+        """Build the Apertus adapter: config flags, weight conversions and component map."""
+        super().__init__(cfg)
+
+        # Flags for weight processing: RMSNorm (final norm included), rotary positions,
+        # a non-gated MLP, and blocks that are not attention-only.
+        self.cfg.normalization_type = "RMS"
+        self.cfg.positional_embedding_type = "rotary"
+        self.cfg.final_rms = True
+        self.cfg.gated_mlp = False
+        self.cfg.attn_only = False
+        self.cfg.uses_rms_norm = True
+
+        # Eager attention is needed for output_attentions (hook_attn_scores / hook_pattern);
+        # SDPA doesn't support it, and HookedTransformer compatibility requires it.
+        self.cfg.attn_implementation = "eager"
+
+        # Per-head reshaping of the attention projections. For GQA, K and V use
+        # n_key_value_heads; fall back to n_heads when that is missing or falsy.
+        q_heads = self.cfg.n_heads
+        kv_heads = getattr(self.cfg, "n_key_value_heads", None) or q_heads
+
+        def split_heads(pattern: str, heads: int) -> ParamProcessingConversion:
+            return ParamProcessingConversion(
+                tensor_conversion=RearrangeTensorConversion(pattern, n=heads),
+            )
+
+        self.weight_processing_conversions = {
+            "blocks.{i}.attn.q.weight": split_heads("(n h) m -> n m h", q_heads),
+            "blocks.{i}.attn.k.weight": split_heads("(n h) m -> n m h", kv_heads),
+            "blocks.{i}.attn.v.weight": split_heads("(n h) m -> n m h", kv_heads),
+            "blocks.{i}.attn.o.weight": split_heads("m (n h) -> n h m", q_heads),
+        }
+
+        # Attention: q/k/v/o projections plus the q_norm / k_norm modules.
+        attn_bridge = PositionEmbeddingsAttentionBridge(
+            name="self_attn",
+            config=self.cfg,
+            submodules={
+                "q": LinearBridge(name="q_proj"),
+                "k": LinearBridge(name="k_proj"),
+                "v": LinearBridge(name="v_proj"),
+                "o": LinearBridge(name="o_proj"),
+                "q_norm": RMSNormalizationBridge(name="q_norm", config=self.cfg),
+                "k_norm": RMSNormalizationBridge(name="k_norm", config=self.cfg),
+            },
+        )
+
+        # MLP: only up_proj and down_proj are mapped (no gate projection).
+        mlp_bridge = MLPBridge(
+            name="mlp",
+            submodules={
+                "in": LinearBridge(name="up_proj"),
+                "out": LinearBridge(name="down_proj"),
+            },
+        )
+
+        # Apertus names its block norms attention_layernorm / feedforward_layernorm
+        # instead of the typical input_layernorm / post_attention_layernorm.
+        block_parts = {
+            "ln1": RMSNormalizationBridge(name="attention_layernorm", config=self.cfg),
+            "ln2": RMSNormalizationBridge(name="feedforward_layernorm", config=self.cfg),
+            "attn": attn_bridge,
+            "mlp": mlp_bridge,
+        }
+
+        self.component_mapping = {
+            "embed": EmbeddingBridge(name="model.embed_tokens"),
+            "rotary_emb": RotaryEmbeddingBridge(name="model.rotary_emb", config=self.cfg),
+            "blocks": BlockBridge(name="model.layers", submodules=block_parts),
+            "ln_final": RMSNormalizationBridge(name="model.norm", config=self.cfg),
+            "unembed": UnembeddingBridge(name="lm_head"),
+        }
+
+    def prepare_loading(self, model_name: str, model_kwargs: dict) -> None:
+        """Patch XIELUActivation to defer eager .item() calls for meta tensor compat.
+
+        Transformers v5 uses meta tensors during from_pretrained, but
+        XIELUActivation.__init__ eagerly calls .item() on beta/eps buffers to
+        precompute _beta_scalar/_eps_scalar for the CUDA kernel path. This fails
+        on meta device. Once upstream fixes this (transformers PR #43473), this
+        patch can be removed.
+
+        __init__ is wrapped (not reimplemented) so the scalars are computed lazily in
+        forward(); the class is patched once. model_name/model_kwargs are unused here.
+        """
+        try:
+            from transformers.activations import XIELUActivation
+        except ImportError:
+            return
+
+        if getattr(XIELUActivation, "_apertus_patched", False):
+            return
+        if not self._xielu_needs_patch(XIELUActivation):
+            return
+
+        _orig_init = XIELUActivation.__init__
+        _orig_forward = XIELUActivation.forward
+
+        def _patched_init(self, *args, **kwargs):
+            try:
+                _orig_init(self, *args, **kwargs)
+            except NotImplementedError:
+                # Meta device: redo only the tensor setup; bind by name so positional args count.
+                import inspect
+                import torch
+
+                torch.nn.Module.__init__(self)
+                given = inspect.signature(_orig_init).bind(self, *args, **kwargs).arguments
+                alpha_p_init = given.get("alpha_p_init", 0.8)
+                alpha_n_init = given.get("alpha_n_init", 0.8)
+                beta = given.get("beta", 0.5)
+                eps = given.get("eps", -1e-6)
+                dtype = given.get("dtype", torch.bfloat16)
+                self.with_vector_loads = given.get("with_vector_loads", False)
+                self.alpha_p = torch.nn.Parameter(
+                    torch.log(torch.expm1(torch.tensor(alpha_p_init, dtype=dtype))).unsqueeze(0)
+                )
+                self.alpha_n = torch.nn.Parameter(
+                    torch.log(
+                        torch.expm1(torch.tensor(alpha_n_init - beta, dtype=dtype))
+                    ).unsqueeze(0)
+                )
+                self.register_buffer("beta", torch.tensor(beta, dtype=dtype))
+                self.register_buffer("eps", torch.tensor(eps, dtype=dtype))
+                self._beta_scalar = None
+                self._eps_scalar = None
+                self._xielu_cuda_obj = None
+
+        def _patched_forward(self, x):
+            """Fill in the scalars on first use when __init__ left them as None."""
+            if getattr(self, "_beta_scalar", 0.0) is None:
+                self._beta_scalar = float(self.beta.detach().cpu().float().item())
+                self._eps_scalar = float(self.eps.detach().cpu().float().item())
+            return _orig_forward(self, x)
+
+        XIELUActivation.__init__ = _patched_init  # type: ignore[method-assign]
+        XIELUActivation.forward = _patched_forward  # type: ignore[method-assign]
+        XIELUActivation._apertus_patched = True  # type: ignore[attr-defined]
+        logger.debug("Patched XIELUActivation for meta tensor compatibility")
+
+    @staticmethod
+    def _xielu_needs_patch(cls: type) -> bool:
+        """Return True if cls.__init__ still calls .item() eagerly or has no readable source."""
+        import inspect
+
+        try:
+            src = inspect.getsource(cls.__init__)  # type: ignore[misc]
+        except (OSError, TypeError):
+            return True
+        return "_beta_scalar" in src and ".item()" in src
+
+    def setup_component_testing(self, hf_model: Any, bridge_model: Any = None) -> None:
+        """Prepare an HF model and its bridge for Apertus component testing.
+
+        Two things happen here. The HF model is forced onto "eager" attention (on its
+        top-level config and on every layer's self_attn config that exposes the
+        setting) so it matches the bridge, which uses "eager" to support
+        output_attentions for hooks. The model's rotary embedding module is then
+        handed to the attention bridges via set_rotary_emb().
+
+        Args:
+            hf_model: The HuggingFace Apertus model instance; must expose
+                model.rotary_emb
+            bridge_model: The TransformerBridge model (if available, rotary_emb is set
+                on the attention bridge of each of its blocks); the template from
+                get_generalized_component() is always updated
+        """
+        rope = hf_model.model.rotary_emb
+
+        # Switch the top-level HF config to eager attention when it carries the setting.
+        hf_config = getattr(hf_model, "config", None)
+        if hasattr(hf_config, "_attn_implementation"):
+            hf_config._attn_implementation = "eager"
+
+        # Same for every layer whose attention module has a config of its own.
+        hf_layers = getattr(getattr(hf_model, "model", None), "layers", ())
+        for hf_layer in hf_layers:
+            layer_attn = getattr(hf_layer, "self_attn", None)
+            if hasattr(layer_attn, "config"):
+                layer_attn.config._attn_implementation = "eager"
+
+        # Real bridge instances, if any (bridge_model may be None or have no blocks).
+        for bridge_block in getattr(bridge_model, "blocks", ()):
+            if hasattr(bridge_block, "attn"):
+                bridge_block.attn.set_rotary_emb(rope)
+
+        # Also the template used by get_generalized_component() calls.
+        self.get_generalized_component("blocks.0.attn").set_rotary_emb(rope)
diff --git a/transformer_lens/tools/model_registry/__init__.py b/transformer_lens/tools/model_registry/__init__.py
index bfe1d6be3..7ee6bfebe 100644
--- a/transformer_lens/tools/model_registry/__init__.py
+++ b/transformer_lens/tools/model_registry/__init__.py
@@ -42,6 +42,7 @@
 # Internal-only architectures (NanoGPT, MinGPT, NeelSoluOld, GPT2LMHeadCustomModel)
 # are excluded since they never appear on HuggingFace Hub.
 HF_SUPPORTED_ARCHITECTURES: set[str] = {
+    "ApertusForCausalLM",
     "BertForMaskedLM",
     "BloomForCausalLM",
     "GemmaForCausalLM",
diff --git a/transformer_lens/tools/model_registry/data/architecture_gaps.json b/transformer_lens/tools/model_registry/data/architecture_gaps.json
index 2525966e5..90ebe2314 100644
--- a/transformer_lens/tools/model_registry/data/architecture_gaps.json
+++ b/transformer_lens/tools/model_registry/data/architecture_gaps.json
@@ -1,13 +1,13 @@
 {
-  "generated_at": "2026-03-17",
+  "generated_at": "2026-03-18",
   "scan_info": {
-    "total_scanned": 4221,
+    "total_scanned": 3426,
     "task_filter": "text-generation",
     "min_downloads": 500,
-    "scan_duration_seconds": 2.6
+    "scan_duration_seconds": 2.4
   },
-  "total_unsupported_architectures": 254,
-  "total_unsupported_models": 1019,
+  "total_unsupported_architectures": 253,
+  "total_unsupported_models": 1013,
   "gaps": [
     {
       "architecture_id": "Qwen3MoeForCausalLM",
@@ -201,6 +201,22 @@
         "tiiuae/Falcon-H1-1.5B-Instruct"
       ]
     },
+    {
+      "architecture_id": "MiniMaxM2ForCausalLM",
+      "total_models": 15,
+      "sample_models": [
+        "MiniMaxAI/MiniMax-M2.5",
+        "MiniMaxAI/MiniMax-M2",
+        "cerebras/MiniMax-M2.1-REAP-139B-A10B",
+        "MiniMaxAI/MiniMax-M2.1",
+        "cerebras/MiniMax-M2.5-REAP-139B-A10B",
+        "PrimeIntellect/MiniMax-M2.5-bf16",
+        "cerebras/MiniMax-M2.5-REAP-172B-A10B",
+        "saricles/MiniMax-M2.5-REAP-172B-A10B-NVFP4-GB10",
+        "amd/MiniMax-M2.1-MXFP4",
+        "aspctu/MiniMax-M2.5"
+      ]
+    },
     {
       "architecture_id": "GPTBigCodeForCausalLM",
       "total_models": 15,
@@ -217,22 +233,6 @@
         "openchat/opencoderplus"
       ]
     },
-    {
-      "architecture_id": "Glm4MoeLiteForCausalLM",
-      "total_models": 14,
-      "sample_models": [
-        "zai-org/GLM-4.7-Flash",
-        "GadflyII/GLM-4.7-Flash-NVFP4",
-        "unsloth/GLM-4.7-Flash",
-        "GadflyII/GLM-4.7-Flash-MTP-NVFP4",
-        "Olafangensan/GLM-4.7-Flash-heretic",
-        "huihui-ai/Huihui-GLM-4.7-Flash-abliterated",
-        "cerebras/GLM-4.7-Flash-REAP-23B-A3B",
-        "TeichAI/GLM-4.7-Flash-Claude-Opus-4.5-High-Reasoning-Distill",
-        "Ex0bit/GLM-4.7-Flash-PRISM",
-        "MuXodious/GLM-4.7-Flash-absolute-heresy"
-      ]
-    },
     {
       "architecture_id": "NemotronHForCausalLM",
       "total_models": 14,
@@ -249,22 +249,6 @@
         "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-Base-BF16"
       ]
     },
-    {
-      "architecture_id": "MiniMaxM2ForCausalLM",
-      "total_models": 14,
-      "sample_models": [
-        "MiniMaxAI/MiniMax-M2.5",
-        "MiniMaxAI/MiniMax-M2",
-        "cerebras/MiniMax-M2.1-REAP-139B-A10B",
-        "MiniMaxAI/MiniMax-M2.1",
-        "cerebras/MiniMax-M2.5-REAP-139B-A10B",
-        "PrimeIntellect/MiniMax-M2.5-bf16",
-        "cerebras/MiniMax-M2.5-REAP-172B-A10B",
-        "saricles/MiniMax-M2.5-REAP-172B-A10B-NVFP4-GB10",
-        "amd/MiniMax-M2.1-MXFP4",
-        "aspctu/MiniMax-M2.5"
-      ]
-    },
     {
       "architecture_id": "XGLMForCausalLM",
       "total_models": 14,
@@ -274,8 +258,8 @@
         "facebook/xglm-1.7B",
         "KoboldAI/fairseq-dense-13B",
         "facebook/xglm-4.5B",
-        "KoboldAI/fairseq-dense-2.7B",
         "KoboldAI/fairseq-dense-125M",
+        "KoboldAI/fairseq-dense-2.7B",
         "KoboldAI/fairseq-dense-1.3B",
         "KoboldAI/fairseq-dense-355M",
         "KoboldAI/fairseq-dense-6.7B"
@@ -287,8 +271,8 @@
       "sample_models": [
         "Salesforce/codegen-350M-mono",
         "Salesforce/codegen-350M-multi",
-        "hf-tiny-model-private/tiny-random-CodeGenForCausalLM",
         "Salesforce/codegen-2B-mono",
+        "hf-tiny-model-private/tiny-random-CodeGenForCausalLM",
         "Salesforce/codegen-6B-multi",
         "shailja/fine-tuned-codegen-16B-Verilog",
         "katuni4ka/tiny-random-codegen2",
@@ -308,11 +292,27 @@
         "RWKV/rwkv-4-430m-pile",
         "RWKV/rwkv-4-3b-pile",
         "RWKV/rwkv-4-7b-pile",
-        "RWKV/rwkv-4-14b-pile",
         "RWKV/rwkv-raven-1b5",
+        "RWKV/rwkv-4-14b-pile",
         "RWKV/rwkv-raven-7b"
       ]
     },
+    {
+      "architecture_id": "Glm4MoeLiteForCausalLM",
+      "total_models": 12,
+      "sample_models": [
+        "zai-org/GLM-4.7-Flash",
+        "GadflyII/GLM-4.7-Flash-NVFP4",
+        "unsloth/GLM-4.7-Flash",
+        "GadflyII/GLM-4.7-Flash-MTP-NVFP4",
+        "Olafangensan/GLM-4.7-Flash-heretic",
+        "huihui-ai/Huihui-GLM-4.7-Flash-abliterated",
+        "cerebras/GLM-4.7-Flash-REAP-23B-A3B",
+        "TeichAI/GLM-4.7-Flash-Claude-Opus-4.5-High-Reasoning-Distill",
+        "Ex0bit/GLM-4.7-Flash-PRISM",
+        "MuXodious/GLM-4.7-Flash-absolute-heresy"
+      ]
+    },
     {
       "architecture_id": "DeepseekV2ForCausalLM",
       "total_models": 11,
@@ -473,8 +473,8 @@
         "lightblue/japanese-mpt-7b",
         "vinai/PhoGPT-4B",
         "Nethermind/Mpt-Instruct-DotNet-S",
-        "vinai/PhoGPT-4B-Chat",
-        "replit/replit-code-v1-3b"
+        "replit/replit-code-v1-3b",
+        "vinai/PhoGPT-4B-Chat"
       ]
     },
     {
@@ -623,17 +623,6 @@
         "d3LLM/d3LLM_LLaDA"
       ]
     },
-    {
-      "architecture_id": "ApertusForCausalLM",
-      "total_models": 5,
-      "sample_models": [
-        "swiss-ai/Apertus-8B-Instruct-2509",
-        "swiss-ai/Apertus-8B-2509",
-        "swiss-ai/Apertus-70B-Instruct-2509",
-        "swiss-ai/Apertus-70B-2509",
-        "aisingapore/Apertus-SEA-LION-v4-8B-IT"
-      ]
-    },
     {
       "architecture_id": "FalconMambaForCausalLM",
       "total_models": 5,
@@ -1377,19 +1366,19 @@
       ]
     },
     {
-      "architecture_id": "JetMoEForCausalLM",
+      "architecture_id": "MosaicGPT",
       "total_models": 2,
       "sample_models": [
-        "jetmoe/jetmoe-8b",
-        "jetmoe/jetmoe-8b-chat"
+        "anas-awadalla/mpt-1b-redpajama-200b",
+        "anas-awadalla/mpt-1b-redpajama-200b-dolly"
       ]
     },
     {
-      "architecture_id": "MosaicGPT",
+      "architecture_id": "JetMoEForCausalLM",
       "total_models": 2,
       "sample_models": [
-        "anas-awadalla/mpt-1b-redpajama-200b",
-        "anas-awadalla/mpt-1b-redpajama-200b-dolly"
+        "jetmoe/jetmoe-8b",
+        "jetmoe/jetmoe-8b-chat"
       ]
     },
     {
@@ -1920,6 +1909,13 @@
         "MiniMaxAI/MiniMax-Text-01"
       ]
     },
+    {
+      "architecture_id": "SteerlingForCausalLM",
+      "total_models": 1,
+      "sample_models": [
+        "guidelabs/steerling-8b"
+      ]
+    },
     {
       "architecture_id": "LamedPhi3ForCausalLM",
       "total_models": 1,
@@ -2040,17 +2036,17 @@
       ]
     },
     {
-      "architecture_id": "CircuitGPTForCausalLM",
+      "architecture_id": "GPT2CustomLMHeadModel",
       "total_models": 1,
       "sample_models": [
-        "openai/circuit-sparsity"
+        "fxmarty/tiny-testing-gpt2-remote-code"
       ]
     },
     {
-      "architecture_id": "GPT2CustomLMHeadModel",
+      "architecture_id": "CircuitGPTForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "fxmarty/tiny-testing-gpt2-remote-code"
+        "openai/circuit-sparsity"
       ]
     },
     {
@@ -2067,13 +2063,6 @@
         "manycore-research/SpatialLM1.1-Llama-1B"
       ]
     },
-    {
-      "architecture_id": "SKTOmniForCausalLM",
-      "total_models": 1,
-      "sample_models": [
-        "Shrijanagain/SKT_OMNI_SUPREME"
-      ]
-    },
     {
       "architecture_id": "DuchifatCore",
       "total_models": 1,
diff --git a/transformer_lens/tools/model_registry/data/supported_models.json b/transformer_lens/tools/model_registry/data/supported_models.json
index ceb12d3ce..9ed04ae23 100644
--- a/transformer_lens/tools/model_registry/data/supported_models.json
+++ b/transformer_lens/tools/model_registry/data/supported_models.json
@@ -1,14 +1,14 @@
 {
-  "generated_at": "2026-03-17",
+  "generated_at": "2026-03-18",
   "scan_info": {
-    "total_scanned": 4221,
+    "total_scanned": 3426,
     "task_filter": "text-generation",
     "min_downloads": 500,
-    "scan_duration_seconds": 2.6
+    "scan_duration_seconds": 2.4
   },
-  "total_architectures": 32,
-  "total_models": 5745,
-  "total_verified": 666,
+  "total_architectures": 33,
+  "total_models": 5764,
+  "total_verified": 673,
   "models": [
     {
       "architecture_id": "Qwen2ForCausalLM",
@@ -73154,6 +73154,222 @@
       "phase2_score": null,
       "phase3_score": null,
       "phase4_score": 93.4
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "deqing/llama-300M-v5-window_4",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "deqing/llama-300M-v5-base_7",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "xiaolesu/Lean4-sft-nt-8b",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "deqing/llama-300M-v5-permute",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "GPT2LMHeadModel",
+      "model_id": "microsoft/CodeGPT-small-java-adaptedGPT2",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "daryl149/llama-2-7b-chat-hf",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "ApertusForCausalLM",
+      "model_id": "swiss-ai/Apertus-8B-Instruct-2509",
+      "status": 1,
+      "verified_date": "2026-03-18",
+      "metadata": null,
+      "note": "Full verification completed",
+      "phase1_score": 100.0,
+      "phase2_score": 100.0,
+      "phase3_score": 100.0,
+      "phase4_score": 95.4
+    },
+    {
+      "architecture_id": "ApertusForCausalLM",
+      "model_id": "swiss-ai/Apertus-8B-2509",
+      "status": 1,
+      "verified_date": "2026-03-18",
+      "metadata": null,
+      "note": "Full verification completed",
+      "phase1_score": 100.0,
+      "phase2_score": 100.0,
+      "phase3_score": 100.0,
+      "phase4_score": 98.0
+    },
+    {
+      "architecture_id": "ApertusForCausalLM",
+      "model_id": "swiss-ai/Apertus-70B-Instruct-2509",
+      "status": 2,
+      "verified_date": "2026-03-18",
+      "metadata": null,
+      "note": "Estimated 705.6 GB exceeds 108.0 GB limit",
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "ApertusForCausalLM",
+      "model_id": "swiss-ai/Apertus-70B-2509",
+      "status": 2,
+      "verified_date": "2026-03-18",
+      "metadata": null,
+      "note": "Estimated 705.6 GB exceeds 108.0 GB limit",
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "ApertusForCausalLM",
+      "model_id": "aisingapore/Apertus-SEA-LION-v4-8B-IT",
+      "status": 1,
+      "verified_date": "2026-03-18",
+      "metadata": null,
+      "note": "Core verification completed",
+      "phase1_score": 100.0,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": 94.1
+    },
+    {
+      "architecture_id": "ApertusForCausalLM",
+      "model_id": "yujiepan/apertus-tiny-random",
+      "status": 1,
+      "verified_date": "2026-03-18",
+      "metadata": null,
+      "note": "Core verification completed",
+      "phase1_score": 100.0,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": 72.2
+    },
+    {
+      "architecture_id": "ApertusForCausalLM",
+      "model_id": "tiny-random/apertus",
+      "status": 1,
+      "verified_date": "2026-03-18",
+      "metadata": null,
+      "note": "Full verification completed",
+      "phase1_score": 100.0,
+      "phase2_score": 100.0,
+      "phase3_score": 100.0,
+      "phase4_score": 72.2
+    },
+    {
+      "architecture_id": "ApertusForCausalLM",
+      "model_id": "unsloth/Apertus-8B-Instruct-2509",
+      "status": 1,
+      "verified_date": "2026-03-18",
+      "metadata": null,
+      "note": "Full verification completed",
+      "phase1_score": 100.0,
+      "phase2_score": 100.0,
+      "phase3_score": 100.0,
+      "phase4_score": 95.4
+    },
+    {
+      "architecture_id": "ApertusForCausalLM",
+      "model_id": "tartuNLP/Apertus-EstLLM-8B-1125",
+      "status": 1,
+      "verified_date": "2026-03-18",
+      "metadata": null,
+      "note": "Core verification completed",
+      "phase1_score": 100.0,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": 97.2
+    },
+    {
+      "architecture_id": "ApertusForCausalLM",
+      "model_id": "loleg/Apertus-8B-Instruct-2509-mlx",
+      "status": 3,
+      "verified_date": "2026-03-18",
+      "metadata": null,
+      "note": "CORE FAILED: Below threshold: P1=0.0% < 100.0% (failed: load_bridge_unprocessed)",
+      "phase1_score": 0.0,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "ApertusForCausalLM",
+      "model_id": "llmat/Apertus-8B-Instruct-2509-NVFP4",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "ApertusForCausalLM",
+      "model_id": "onnx-community/Apertus-8B-Instruct-2509-ONNX",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
+    },
+    {
+      "architecture_id": "ApertusForCausalLM",
+      "model_id": "tartuNLP/Apertus-EstLLM-8B-Instruct-1125",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null
     }
   ]
 }
diff --git a/transformer_lens/tools/model_registry/data/verification_history.json b/transformer_lens/tools/model_registry/data/verification_history.json
index 969207a52..5d78c7453 100644
--- a/transformer_lens/tools/model_registry/data/verification_history.json
+++ b/transformer_lens/tools/model_registry/data/verification_history.json
@@ -1,5 +1,5 @@
 {
-  "last_updated": "2026-03-11T19:25:24.974031",
+  "last_updated": "2026-03-18T20:39:31.645578",
   "records": [
     {
       "model_id": "Macropodus/macbert4mdcspell_v1",
@@ -10280,6 +10280,46 @@
       "notes": "Full verification completed",
       "invalidated": false,
       "invalidation_reason": null
+    },
+    {
+      "model_id": "tiny-random/apertus",
+      "architecture_id": "ApertusForCausalLM",
+      "verified_date": "2026-03-18",
+      "verified_by": "verify_models",
+      "transformerlens_version": null,
+      "notes": "Full verification completed",
+      "invalidated": false,
+      "invalidation_reason": null
+    },
+    {
+      "model_id": "unsloth/Apertus-8B-Instruct-2509",
+      "architecture_id": "ApertusForCausalLM",
+      "verified_date": "2026-03-18",
+      "verified_by": "verify_models",
+      "transformerlens_version": null,
+      "notes": "Full verification completed",
+      "invalidated": false,
+      "invalidation_reason": null
+    },
+    {
+      "model_id": "swiss-ai/Apertus-8B-2509",
+      "architecture_id": "ApertusForCausalLM",
+      "verified_date": "2026-03-18",
+      "verified_by": "verify_models",
+      "transformerlens_version": null,
+      "notes": "Full verification completed",
+      "invalidated": false,
+      "invalidation_reason": null
+    },
+    {
+      "model_id": "swiss-ai/Apertus-8B-Instruct-2509",
+      "architecture_id": "ApertusForCausalLM",
+      "verified_date": "2026-03-18",
+      "verified_by": "verify_models",
+      "transformerlens_version": null,
+      "notes": "Full verification completed",
+      "invalidated": false,
+      "invalidation_reason": null
     }
   ]
 }