1 change: 1 addition & 0 deletions CHANGELOG.rst
@@ -8,6 +8,7 @@ NVIDIA Model Optimizer Changelog (Linux)

- Users no longer need to manually register MoE modules to ensure expert calibration coverage in the PTQ workflow.
- ``hf_ptq.py`` now saves the quantization summary and the MoE expert token count table to the export directory.
- Add a ``--moe_calib_experts_ratio`` flag to ``hf_ptq.py`` to specify the ratio of experts to calibrate during the forward pass, improving expert coverage during calibration. Defaults to 1/4 of all experts.
Copilot AI Feb 20, 2026

Changelog says the default is "1/4 of all the experts". In the library config the default is currently None (meaning behavior depends on whether the caller sets it, e.g. the CLI). Please clarify in the changelog entry whether the default applies only to hf_ptq.py or to the core API as well, to avoid misleading users.

- Add sparse attention optimization for transformer models (``modelopt.torch.sparsity.attention_sparsity``). This reduces computational cost by skipping attention computation. Supports calibration for threshold selection on HuggingFace models. See `examples/llm_sparsity/attention_sparsity/README.md <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/llm_sparsity/attention_sparsity>`_ for usage.

0.42 (2026-02-xx)
10 changes: 10 additions & 0 deletions examples/llm_ptq/example_utils.py
@@ -201,6 +201,7 @@ def build_quant_cfg(
model_type,
quant_cfg_choices,
kv_quant_cfg_choices,
moe_calib_experts_ratio,
Copilot AI Feb 20, 2026

build_quant_cfg gained the moe_calib_experts_ratio parameter, but not all call sites appear to have been updated (e.g. examples/llm_ptq/multinode_ptq.py still calls it with the old signature). This will raise TypeError at runtime; update all callers or provide a default for the new arg.

Suggested change
moe_calib_experts_ratio,
moe_calib_experts_ratio: float | None = None,

) -> dict[str, Any]:
quant_cfg = {}
assert qformat in quant_cfg_choices, (
@@ -232,6 +233,15 @@ def build_quant_cfg(
getattr(mtq, kv_quant_cfg_choices[kv_cache_qformat])["quant_cfg"],
)

if moe_calib_experts_ratio:
if isinstance(quant_cfg["algorithm"], str):
quant_cfg["algorithm"] = {
"method": quant_cfg["algorithm"],
"moe_calib_experts_ratio": moe_calib_experts_ratio,
Comment on lines +236 to +240
Copilot AI Feb 20, 2026

if moe_calib_experts_ratio: uses truthiness and will silently ignore 0.0 (and will treat negative values as enabled). Prefer if moe_calib_experts_ratio is not None: and validate 0 < ratio <= 1 so invalid values fail fast with a clear error.

}
else:
quant_cfg["algorithm"]["moe_calib_experts_ratio"] = moe_calib_experts_ratio
Comment on lines +236 to +243
Contributor

@coderabbitai coderabbitai bot Feb 20, 2026

⚠️ Potential issue | 🔴 Critical



Crash when algorithm is None.

The code will crash when moe_calib_experts_ratio is truthy (the CLI default is 0.25) and the quantization config has "algorithm": None (e.g., mxfp8, mxfp6, mxfp4, mxint8, w4a8_mxfp4_fp8). At line 243, the else branch attempts None["moe_calib_experts_ratio"] = ..., raising a TypeError: 'NoneType' object is not subscriptable.

Any user running with a None-algorithm format (e.g., --qformat mxfp8) using the CLI default will immediately hit this crash.

Proposed fix
    if moe_calib_experts_ratio:
+       if quant_cfg["algorithm"] is None:
+           quant_cfg["algorithm"] = {
+               "method": None,
+               "moe_calib_experts_ratio": moe_calib_experts_ratio,
+           }
-       if isinstance(quant_cfg["algorithm"], str):
+       elif isinstance(quant_cfg["algorithm"], str):
            quant_cfg["algorithm"] = {
                "method": quant_cfg["algorithm"],
                "moe_calib_experts_ratio": moe_calib_experts_ratio,
            }
        else:
            quant_cfg["algorithm"]["moe_calib_experts_ratio"] = moe_calib_experts_ratio

Alternatively, only inject the ratio when the model is actually an MoE model, or change the CLI default to None and only inject when explicitly provided.


Contributor

+1

Contributor

@realAsma, thanks for confirming! This should prevent crashes for users running MoE calibration with formats like mxfp8, mxfp6, mxfp4, mxint8, or w4a8_mxfp4_fp8 that have algorithm: None in their configs. 🐰


# Gemma 7B has accuracy regression using alpha 1. We set 0.5 instead.
if model_type == "gemma" and "int8_sq" in qformat:
quant_cfg["algorithm"] = {"method": "smoothquant", "alpha": 0.5}
10 changes: 10 additions & 0 deletions examples/llm_ptq/hf_ptq.py
@@ -906,6 +906,7 @@ def quantize_main(
model_type,
QUANT_CFG_CHOICES,
KV_QUANT_CFG_CHOICES,
args.moe_calib_experts_ratio,
)

# Exclude MTP layers from quantization if detected (e.g., GLM-4.7's layer 92)
@@ -1126,6 +1127,15 @@ def parse_args() -> argparse.Namespace:
"(sensitivity scores, costs, etc.). Only used when auto_quantize_bits is specified."
),
)
parser.add_argument(
Contributor

Should we add something similar to mcore PTQ as well? cc @realAsma @jenchen13 @ChenhanYu

"--moe_calib_experts_ratio",
type=float,
default=1.0 / 4,
Contributor

How was the 25% default selected?

help=(
"Percentage of experts to calibrate during forward pass. Only used for MOE models. "
"This is used to reduce the number of experts to calibrate during forward pass. "
),
)
Comment on lines +1130 to +1138
Contributor

⚠️ Potential issue | 🔴 Critical

Default 0.25 is unconditionally applied to all models, including non-MoE ones.

Since the default is 1.0 / 4 (always truthy), every invocation of hf_ptq.py will inject moe_calib_experts_ratio into the algorithm config—even for non-MoE models. Combined with the crash when algorithm is None (flagged in example_utils.py), this makes --qformat mxfp8 (and similar) unusable out of the box.

Consider defaulting to None so the ratio is only injected when the user explicitly requests it:

     parser.add_argument(
         "--moe_calib_experts_ratio",
         type=float,
-        default=1.0 / 4,
+        default=None,
         help=(
-            "Percentage of experts to calibrate during forward pass. Only used for MOE models. "
-            "This is used to reduce the number of experts to calibrate during forward pass. "
+            "Ratio of experts to calibrate during forward pass (0, 1]. Only used for MOE models. "
+            "Default behavior routes to all experts if not specified. "
+            "Example: 0.25 calibrates 25%% of experts. "
         ),
     )


return parser.parse_args()
Comment on lines +1135 to 1140
Copilot AI Feb 20, 2026

The flag help says "Percentage" but the code/examples treat this as a ratio in (0, 1]. Consider clarifying the help text and validating the allowed range in parse_args so invalid values are rejected early.

Suggested change
"Percentage of experts to calibrate during forward pass. Only used for MOE models. "
"This is used to reduce the number of experts to calibrate during forward pass. "
),
)
return parser.parse_args()
"Fraction of experts to calibrate during forward pass (ratio in (0.0, 1.0]). "
"Only used for MOE models; used to reduce the number of experts calibrated during the forward pass."
),
)
args = parser.parse_args()
if not (0.0 < args.moe_calib_experts_ratio <= 1.0):
parser.error("--moe_calib_experts_ratio must be in the range (0.0, 1.0].")
return args


2 changes: 1 addition & 1 deletion modelopt/torch/export/moe_utils.py
@@ -48,7 +48,7 @@ def save_expert_token_count_table(model: nn.Module, output_dir: str | Path | Non
"th, td { border: 1px solid #ccc; padding: 4px 8px; text-align: right; }",
"th { background: #f0f0f0; }",
"</style></head><body>",
"<h2>Expert Token Counts (per MoE layer)</h2>",
"<h2>Expert Calib Token Counts (per MoE layer)</h2>",
"<table><tr><th>Layer/Expert</th>",
]
html_parts.extend(f"<th>{i}</th>" for i in range(num_experts))
10 changes: 10 additions & 0 deletions modelopt/torch/quantization/config.py
@@ -1091,6 +1091,16 @@ class QuantizeAlgorithmConfig(ModeloptBaseConfig):
title="This field specifies the name of the calibration algorithm. If None, no calibration is performed.",
)

moe_calib_experts_ratio: float | None = ModeloptField(
Contributor

Yeah, this is a good idea to put here.

default=None,
title="% of experts to calibrate during forward pass.",
description=(
"If specified, we force forward tokens to % of experts during the calibration"
Comment on lines +1094 to +1098
Copilot AI Feb 20, 2026

This new config field doesn't enforce bounds. Since valid values are in (0, 1], add pydantic constraints (e.g., gt=0, le=1) so invalid configs are rejected during parsing rather than failing later at runtime.

" pass. This forward is for calibration purpose only and will not affect the"
" actual inference."
),
)
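The bounds suggested above can be enforced at parse time. A sketch using plain pydantic (ModeloptField wraps pydantic in the real config; the class name here is hypothetical):

```python
from typing import Annotated, Optional

from pydantic import BaseModel, Field, ValidationError

class MoeCalibSketch(BaseModel):
    # The constraint rides on the float member, so None stays allowed while
    # any provided value must satisfy 0 < ratio <= 1.
    moe_calib_experts_ratio: Optional[Annotated[float, Field(gt=0, le=1)]] = None

MoeCalibSketch()                                # ok: defaults to None
MoeCalibSketch(moe_calib_experts_ratio=0.25)    # ok: in range
try:
    MoeCalibSketch(moe_calib_experts_ratio=1.5)  # rejected during parsing
except ValidationError:
    pass
```

Rejecting invalid configs here avoids deferring the failure to downstream asserts in the forward path.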


class MaxCalibConfig(QuantizeAlgorithmConfig):
"""The config for max calibration algorithm.
6 changes: 6 additions & 0 deletions modelopt/torch/quantization/mode.py
@@ -225,6 +225,12 @@ def wrapped_calib_func(
# For backward compatibility
kwargs["algorithm"] = method

moe_calib_experts_ratio = kwargs.pop("moe_calib_experts_ratio", None)
if moe_calib_experts_ratio is not None:
Copilot AI Feb 20, 2026

moe_calib_experts_ratio is propagated into modules without validation. Since it directly impacts routing, validate 0 < ratio <= 1 here and raise a user-facing error early instead of deferring to downstream asserts in the forward path.

Suggested change
if moe_calib_experts_ratio is not None:
if moe_calib_experts_ratio is not None:
# Validate early to avoid downstream assertion failures in the forward path.
if not isinstance(moe_calib_experts_ratio, (int, float)):
raise ValueError(
f"Invalid moe_calib_experts_ratio {moe_calib_experts_ratio!r}: "
"expected a numeric value in the range (0, 1]."
)
if not (0 < moe_calib_experts_ratio <= 1):
raise ValueError(
f"Invalid moe_calib_experts_ratio {moe_calib_experts_ratio!r}: "
"expected 0 < ratio <= 1."
)

for module in model.modules():
if hasattr(module, "_moe_calib_experts_ratio"):
module._moe_calib_experts_ratio = moe_calib_experts_ratio

if func is not None:
# Call the function with forward_loop as a separate argument
func(model, forward_loop=forward_loop, **kwargs)
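The propagation step above can be illustrated with stand-in objects in place of torch modules (all names below are illustrative, not project code):

```python
class FakeMoeBlock:
    """Stands in for a quantized MoE module that exposes the calibration knob."""
    def __init__(self):
        self._moe_calib_experts_ratio = None

class FakeModel:
    def __init__(self, submodules):
        self._submodules = submodules
    def modules(self):
        return iter(self._submodules)

def propagate_moe_ratio(model, kwargs):
    # Pop the kwarg so downstream calib functions never see it, mirroring
    # kwargs.pop("moe_calib_experts_ratio", None) in wrapped_calib_func.
    ratio = kwargs.pop("moe_calib_experts_ratio", None)
    if ratio is not None:
        if not 0 < ratio <= 1:  # early validation, as the comment suggests
            raise ValueError(f"expected 0 < ratio <= 1, got {ratio}")
        for module in model.modules():
            if hasattr(module, "_moe_calib_experts_ratio"):
                module._moe_calib_experts_ratio = ratio

moe = FakeMoeBlock()
model = FakeModel([moe, object()])  # the second module lacks the attribute
propagate_moe_ratio(model, {"moe_calib_experts_ratio": 0.25})
```

Only modules that define the attribute are touched, so unrelated submodules pass through unchanged.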
36 changes: 25 additions & 11 deletions modelopt/torch/quantization/plugins/huggingface.py
@@ -458,8 +458,11 @@ def _setup(self):
elif hasattr(self, "experts") and hasattr(self.experts, "num_experts"):
num_experts = self.experts.num_experts

self.expert_token_count = torch.zeros(num_experts, dtype=torch.long, device="cpu")
self.expert_token_count = torch.zeros(
num_experts, dtype=torch.long, device=next(self.parameters()).device
Comment on lines +461 to +462
Copilot AI Feb 20, 2026

expert_token_count is created as a plain tensor attribute. Because it is not registered as a buffer, it will not follow .to(...) / .cuda() device moves and may be dropped from state_dict, which can lead to device-mismatch errors in _gate_forward_hook after moving the model. Register it as a (non-persistent) buffer instead of a bare attribute (and initialize it on a known device, e.g., self.gate.weight.device / a parameter device).

Suggested change
self.expert_token_count = torch.zeros(
num_experts, dtype=torch.long, device=next(self.parameters()).device
self.register_buffer(
"expert_token_count",
torch.zeros(num_experts, dtype=torch.long, device=next(self.parameters()).device),
persistent=False,

)
self._count_expert_tokens = False
self._moe_calib_experts_ratio = None

if num_experts == 0:
warnings.warn(
@@ -483,36 +486,47 @@ def _gate_forward_hook(self, module, input, output):
logits = output if not isinstance(output, tuple) else output[0]
top_k = self.gate.top_k if hasattr(self.gate, "top_k") else self.top_k
_, indices = torch.topk(logits.float(), top_k, dim=-1)
counts = torch.bincount(
indices.reshape(-1).cpu(), minlength=len(self.expert_token_count)
)
self.expert_token_count += counts
counts = torch.bincount(indices.reshape(-1), minlength=self.expert_token_count.shape[0])
self.expert_token_count += counts.to(self.expert_token_count.device)

def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
is_calib = any(getattr(m, "_if_calib", False) for m in self.experts.modules())
if is_calib:
self._count_expert_tokens = is_calib
if is_calib and self._moe_calib_experts_ratio:
self._count_expert_tokens = True
assert 0 < self._moe_calib_experts_ratio <= 1, (
"moe_calib_experts_ratio must be between 0 and 1"
)
Comment on lines +495 to +499
Copilot AI Feb 20, 2026

Input validation uses assert and relies on truthiness (if is_calib and self._moe_calib_experts_ratio:). assert is stripped with python -O, and a ratio of 0.0 would be silently treated as "unset". Prefer an explicit if ratio is not None: check and raise ValueError (or similar) when the value is out of range.

# If any of the experts are in calibration mode, we will forward all tokens to all experts
# This is used only for calibration, we need to re-calculate the actual outputs again using
# the original top_k
Comment on lines 500 to 502
Copilot AI Feb 20, 2026

The comment says calibration forwards tokens to all experts, but this code now forwards to a configurable fraction via _moe_calib_experts_ratio. Please update the comment to match the new behavior to avoid confusion.

if TRANSFORMERS_VERSION_GE_5_0:
assert hasattr(self, "gate") and hasattr(self.gate, "top_k")
original_top_k = self.gate.top_k
self.gate.top_k = self.gate.num_experts
self.gate.top_k = max(
original_top_k, round(self.gate.num_experts * self._moe_calib_experts_ratio)
)
Comment on lines 505 to +508
Copilot AI Feb 20, 2026

round(self.gate.num_experts * ratio) can under-select experts because Python uses banker's rounding (e.g., 2.5 -> 2). To ensure you calibrate at least the requested fraction, use math.ceil(...) (still clamped by max(original_top_k, ...)).

super().forward(hidden_states)
self.gate.top_k = original_top_k
else:
# Path for transformers < 5.0
original_top_k = self.top_k
if hasattr(self, "num_experts"):
self.top_k = self.num_experts
self.top_k = max(
original_top_k, round(self.num_experts * self._moe_calib_experts_ratio)
)
Comment on lines 513 to +517
Copilot AI Feb 20, 2026

Same rounding issue here: round(self.num_experts * ratio) can round down at .5 boundaries. Prefer ceil(...) (and then max(original_top_k, ...)) so the requested coverage fraction is not accidentally reduced.

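The rounding pitfall flagged in these two comments is easy to reproduce in plain Python: round() uses banker's rounding on .5 ties, while math.ceil never under-selects (a standalone illustration, not project code; the original top_k of 2 is hypothetical):

```python
import math

num_experts = 10
ratio = 0.25                                 # requests 2.5 experts
assert round(num_experts * ratio) == 2       # banker's rounding: 2.5 -> 2
assert math.ceil(num_experts * ratio) == 3   # ceil guarantees the coverage
# Clamped as in the diff, with a hypothetical original top_k of 2:
top_k = max(2, math.ceil(num_experts * ratio))
assert top_k == 3
```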
elif hasattr(self, "experts"):
self.top_k = self.experts.num_experts
self.top_k = max(
original_top_k,
round(self.experts.num_experts * self._moe_calib_experts_ratio),
)
else:
raise ValueError(f"Could not find num_experts in module {self}")
super().forward(hidden_states)
self.top_k = original_top_k
# Enable counting only for the real-routing forward during calibration
self._count_expert_tokens = is_calib
self._count_expert_tokens = False
else:
self._count_expert_tokens = True
Copilot AI Feb 20, 2026

The else branch enables _count_expert_tokens unconditionally, so counting runs on every forward (including inference) and expert_token_count will accumulate outside calibration, adding overhead and producing misleading "calib" tables. _count_expert_tokens should stay False when is_calib is False and only be enabled for the intended calibration-routing forward(s).

Suggested change
self._count_expert_tokens = True
self._count_expert_tokens = is_calib

output = super().forward(hidden_states)
self._count_expert_tokens = False
return output
Comment on lines 492 to 532
Contributor

⚠️ Potential issue | 🟠 Major



Clarify whether all-experts calibration should be the default during quantization.

The class docstring promises "During calibration, we forward all tokens to all experts so that all experts see sufficient tokens to calibrate" (line 445), but this behavior only activates when _moe_calib_experts_ratio is explicitly set in the quantization config. Since it defaults to None, users relying on the documented behavior will not get the expanded-expert forward pass.

Additionally, the else block at lines 529-530 enables token counting for both inference (is_calib=False) and calibration with unset ratio (is_calib=True, ratio=None), creating unnecessary overhead during inference when tokens should not be counted.

Either set a default ratio (e.g., 1.0 for all experts) when entering calibration mode, or update the docstring to clarify that expanded-expert forwarding requires explicit configuration.
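One way to reconcile the docstring with the config, per the first option above, is to treat an unset ratio as 1.0 when entering calibration. A sketch (the helper name and the fallback are hypothetical, not the library's API):

```python
import math

def effective_calib_top_k(original_top_k, num_experts, ratio=None):
    """Compute the top_k used for the calibration forward pass."""
    if ratio is None:
        ratio = 1.0  # docstring behavior: forward tokens to all experts
    if not 0 < ratio <= 1:
        raise ValueError(f"expected 0 < ratio <= 1, got {ratio}")
    # ceil avoids under-selecting at .5 ties; clamp to the routing top_k
    return max(original_top_k, math.ceil(num_experts * ratio))
```

With no ratio set, calibration routes to all experts as documented; an explicit ratio reduces coverage but never below the model's own top_k.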

