From 0c6a322f379d69fb222591aeff2ee8a2ef2438fe Mon Sep 17 00:00:00 2001
From: medhat <medhat.tarek1000@gmail.com>
Date: Tue, 3 Mar 2026 19:33:04 +0400
Subject: [PATCH] fix: Mixtral compatibility with transformers >=5.0 fused MoE
 API

In transformers 5.x, MixtralDecoderLayer renamed block_sparse_moe to
mlp, replaced the ModuleList of individual expert modules with a fused
MixtralExperts class (3D nn.Parameter tensors), and changed the gate
from nn.Linear to MixtralTopKRouter.

As a result, all Mixtral quantization failed with:
  'MixtralDecoderLayer' object has no attribute 'block_sparse_moe'

Fix:
- Add _has_legacy_moe() to detect old vs new API via hasattr
- get_extra_modules: returns block_sparse_moe (old) or mlp (new)
- get_moe_gate: returns the gate from the appropriate MoE container
- get_subsets_in_block: dispatches to _get_subsets_legacy (old per-expert
  w1/w2/w3 Linear modules) or _get_subsets_fused (new fused experts)
- For the fused API, attention layers are quantized per-subset and
  mlp.gate is listed in a subset of its own; the fused expert weights
  get no subset, and the MoE block is passed as extra_modules for
  activation-aware hooks

The legacy path is behaviorally unchanged for older transformers
versions; it is only refactored to use a local `moe` alias, with the
same layer names and subsets.

Made-with: Cursor
---
 llmc/models/mixtral.py | 72 ++++++++++++++++++++++++++++++++++++------
 1 file changed, 63 insertions(+), 9 deletions(-)

diff --git a/llmc/models/mixtral.py b/llmc/models/mixtral.py
index 837f68c26..0ca406710 100644
--- a/llmc/models/mixtral.py
+++ b/llmc/models/mixtral.py
@@ -3,6 +3,10 @@
 from .base_model import BaseModel
 
 
+def _has_legacy_moe(block):
+    return hasattr(block, 'block_sparse_moe')
+
+
 @MODEL_REGISTRY
 class Mixtral(BaseModel):
     def __init__(self, config, device_map=None, use_cache=False):
@@ -36,11 +40,27 @@ def get_layernorms_in_block(self, block):
         }
 
     def get_extra_modules(self, block):
+        if _has_legacy_moe(block):
+            return {
+                'block_sparse_moe': block.block_sparse_moe
+            }
         return {
-            'block_sparse_moe': block.block_sparse_moe
+            'mlp': block.mlp
         }
 
+    def get_moe_gate(self, block):
+        if _has_legacy_moe(block):
+            return block.block_sparse_moe.gate
+        return block.mlp.gate
+
     def get_subsets_in_block(self, block):
+        if _has_legacy_moe(block):
+            return self._get_subsets_legacy(block)
+        return self._get_subsets_fused(block)
+
+    def _get_subsets_legacy(self, block):
+        """transformers <5.0: block.block_sparse_moe with ModuleList experts."""
+        moe = block.block_sparse_moe
         return [
             {
                 'layers': {
@@ -62,25 +82,59 @@ def get_subsets_in_block(self, block):
             },
             {
                 'layers': {
-                    **{f'block_sparse_moe.experts.{i}.w1': block.block_sparse_moe.experts[i].w1 for i in range(len(block.block_sparse_moe.experts))}, # noqa
-                    **{f'block_sparse_moe.experts.{i}.w3': block.block_sparse_moe.experts[i].w3 for i in range(len(block.block_sparse_moe.experts))}, # noqa
-                    'block_sparse_moe.gate': block.block_sparse_moe.gate,
+                    **{f'block_sparse_moe.experts.{i}.w1': moe.experts[i].w1 for i in range(len(moe.experts))},  # noqa
+                    **{f'block_sparse_moe.experts.{i}.w3': moe.experts[i].w3 for i in range(len(moe.experts))},  # noqa
+                    'block_sparse_moe.gate': moe.gate,
                 },
                 'prev_op': [block.post_attention_layernorm],
                 'input': ['block_sparse_moe'],
-                'inspect': block.block_sparse_moe,
+                'inspect': moe,
                 'has_kwargs': False,
                 'is_mlp': True,
             },
             *[
                 {
-                    'layers': {f'block_sparse_moe.experts.{i}.w2': block.block_sparse_moe.experts[i].w2}, # noqa
-                    'prev_op': [block.block_sparse_moe.experts[i].w3],
+                    'layers': {f'block_sparse_moe.experts.{i}.w2': moe.experts[i].w2},
+                    'prev_op': [moe.experts[i].w3],
                     'input': [f'block_sparse_moe.experts.{i}.w2'],
-                    'inspect': block.block_sparse_moe.experts[i].w2,
+                    'inspect': moe.experts[i].w2,
                     'has_kwargs': False,
                     'is_mlp': True,
                 }
-                for i in range(len(block.block_sparse_moe.experts))
+                for i in range(len(moe.experts))
             ],
         ]
+
+    def _get_subsets_fused(self, block):
+        """transformers >=5.0: block.mlp with fused MixtralExperts."""
+        moe = block.mlp
+        return [
+            {
+                'layers': {
+                    'self_attn.q_proj': block.self_attn.q_proj,
+                    'self_attn.k_proj': block.self_attn.k_proj,
+                    'self_attn.v_proj': block.self_attn.v_proj,
+                },
+                'prev_op': [block.input_layernorm],
+                'input': ['self_attn.q_proj'],
+                'inspect': block.self_attn,
+                'has_kwargs': True,
+            },
+            {
+                'layers': {'self_attn.o_proj': block.self_attn.o_proj},
+                'prev_op': [block.self_attn.v_proj],
+                'input': ['self_attn.o_proj'],
+                'inspect': block.self_attn.o_proj,
+                'has_kwargs': False,
+            },
+            {
+                'layers': {
+                    'mlp.gate': moe.gate,
+                },
+                'prev_op': [block.post_attention_layernorm],
+                'input': ['mlp'],
+                'inspect': moe,
+                'has_kwargs': False,
+                'is_mlp': True,
+            },
+        ]