From 28d5686e40784462bf00567feccb89a7fcfacc63 Mon Sep 17 00:00:00 2001 From: Jennifer Chen Date: Wed, 18 Feb 2026 20:06:24 +0000 Subject: [PATCH 1/8] sync moe input quantizer only Signed-off-by: Jennifer Chen --- modelopt/torch/quantization/model_calib.py | 11 ++- .../torch/quantization/plugins/megatron.py | 31 ++++--- .../quantization/plugins/test_megatron.py | 82 ++++++++++++++++++- 3 files changed, 106 insertions(+), 18 deletions(-) diff --git a/modelopt/torch/quantization/model_calib.py b/modelopt/torch/quantization/model_calib.py index 6455447ac..081aa6f1c 100644 --- a/modelopt/torch/quantization/model_calib.py +++ b/modelopt/torch/quantization/model_calib.py @@ -95,13 +95,20 @@ def _check_moe_calibration_complete(quantizer, parallel_state): @torch.no_grad() -def max_calibrate(model: nn.Module, forward_loop: ForwardLoop | None = None, distributed_sync=True): +def max_calibrate( + model: nn.Module, + forward_loop: ForwardLoop | None = None, + distributed_sync=True, + shared_moe_weight_scale=True, +): """Calibrate the model using max. Args: model: Model to be calibrated. forward_loop: A callable which takes the model as argument and forwards calibration data through the model. + distributed_sync: Whether to sync amax across distributed processes. + shared_moe_weight_scale: Whether to share the weight scale across local experts. See :class:`MaxCalibConfig ` for details on the remaining arguments. @@ -116,7 +123,7 @@ def max_calibrate(model: nn.Module, forward_loop: ForwardLoop | None = None, dis # Sync amax across local experts within each rank (for SequentialMLP) for name, module in model.named_modules(): if hasattr(module, "layer_sync_moe_local_experts_amax"): - module.layer_sync_moe_local_experts_amax() + module.layer_sync_moe_local_experts_amax(shared_moe_weight_scale) if not distributed_sync: return diff --git a/modelopt/torch/quantization/plugins/megatron.py b/modelopt/torch/quantization/plugins/megatron.py index 6e92fce90..adfb94edd 100644 --- a/modelopt/torch/quantization/plugins/megatron.py +++ b/modelopt/torch/quantization/plugins/megatron.py @@ -574,11 +574,11 @@ def _setup(self): expert.linear_fc1.parallel_state = self.parallel_state expert.linear_fc2.parallel_state = self.parallel_state - def layer_sync_moe_local_experts_amax(self): - """Sync amax across local experts in a SequentialMLP. + def layer_sync_moe_local_experts_amax(self, shared_moe_weight_scale=True): + """Sync input quantizer amax across local experts in a SequentialMLP, and optionally weight scale. - Synchronize the amax values across local experts in a lyaer such that all local experts will - share the same amax. This function operates on a single rank and does not require distributed sync. + Ensures all experts have the same input quantizer amax.This function operates + on a single rank and does not require distributed sync. Distributed amax sync across EP and ETP (for RowParallel) happens in model_calib.max_calibrate(). This function should be called before the distributed sync to ensure the amax values @@ -586,22 +586,27 @@ def layer_sync_moe_local_experts_amax(self): Note: Because there are logic which calls collective communication based on whether amax is not None, - We need to garuantee that all experts must have amax. Otherwise, there will be deadlock - when synchroizing over EP since some ranks may have amax None and not calling the collective + We need to guarantee that all experts must have amax. 
Otherwise, there will be deadlock + when synchronizing over EP since some ranks may have amax None and not calling the collective communication. + + Args: + shared_moe_weight_scale: Whether to share the weight scale across local experts. """ # Collect amax from all local experts amax_dict = {} for expert in self.local_experts: for name, module in expert.named_modules(): if isinstance(module, TensorQuantizer) and module.amax is not None: - stored_amax = amax_dict.get(name) - amax_tensor = module.amax.detach().clone() - amax_dict[name] = ( - amax_tensor - if stored_amax is None - else torch.maximum(stored_amax, amax_tensor) - ) + if shared_moe_weight_scale or ("weight_quantizer" not in name): + # Sync both quantizers or only sync input quantizer + stored_amax = amax_dict.get(name) + amax_tensor = module.amax.detach().clone() + amax_dict[name] = ( + amax_tensor + if stored_amax is None + else torch.maximum(stored_amax, amax_tensor) + ) # Apply synchronized amax values back to all local experts for expert in self.local_experts: diff --git a/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py b/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py index b107eca71..cbe736dc6 100644 --- a/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py +++ b/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py @@ -735,6 +735,85 @@ def test_te_grouped_vs_sequential_quantize(need_4_gpus): ) +@pytest.mark.parametrize("ep_size", [1, 2]) +@pytest.mark.parametrize("moe_grouped_gemm", [True, False]) +@pytest.mark.parametrize("shared_moe_weight_scale", [True, False]) +def test_layer_sync_moe_local_experts_amax(ep_size, moe_grouped_gemm, shared_moe_weight_scale): + """Test expert model parallel synchronization.""" + size = torch.cuda.device_count() + if size < ep_size: + pytest.skip(f"Requires at least {ep_size} GPUs for expert model parallel test") + + spawn_multiprocess_job( + size=size, + job=partial( + _test_layer_sync_moe_local_experts_amax, + ep_size, + moe_grouped_gemm, + shared_moe_weight_scale, + ), + backend="nccl", + ) + + +def _test_layer_sync_moe_local_experts_amax( + ep_size, moe_grouped_gemm, shared_moe_weight_scale, rank, size +): + initialize_for_megatron( + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + expert_model_parallel_size=ep_size, + expert_tensor_parallel_size=1, + seed=SEED, + ) + model = _gpt_model_provider( + tp_size=1, + ep_size=ep_size, + etp_size=1, + hidden_size=256, + moe_grouped_gemm=moe_grouped_gemm, + use_te=moe_grouped_gemm, + num_moe_experts=8, + transformer_impl="modelopt", + ) + model = mtq.quantize(model, mtq.FP8_DEFAULT_CFG, get_forward(model)) + + # Sync amax across local experts in each layer + for layer in model.decoder.layers: + layer.mlp.experts.layer_sync_moe_local_experts_amax(shared_moe_weight_scale) + + for layer in model.decoder.layers: + fc1_amax = None + fc2_amax = None + for expert in layer.mlp.experts.local_experts: + assert expert.linear_fc1.input_quantizer.amax is not None + assert expert.linear_fc2.input_quantizer.amax is not None + if fc1_amax is None: + fc1_amax = expert.linear_fc1.input_quantizer.amax + else: + assert torch.allclose(fc1_amax, expert.linear_fc1.input_quantizer.amax) + if fc2_amax is None: + fc2_amax = expert.linear_fc2.input_quantizer.amax + else: + assert torch.allclose(fc2_amax, expert.linear_fc2.input_quantizer.amax) + + if shared_moe_weight_scale: + for layer in model.decoder.layers: + fc1_amax = None + fc2_amax = None + for expert in layer.mlp.experts.local_experts: + assert 
expert.linear_fc1.weight_quantizer.amax is not None + assert expert.linear_fc2.weight_quantizer.amax is not None + if fc1_amax is None: + fc1_amax = expert.linear_fc1.weight_quantizer.amax + else: + assert torch.allclose(fc1_amax, expert.linear_fc1.weight_quantizer.amax) + if fc2_amax is None: + fc2_amax = expert.linear_fc2.weight_quantizer.amax + else: + assert torch.allclose(fc2_amax, expert.linear_fc2.weight_quantizer.amax) + + def _test_expert_model_parallel_amax_sync( tp_size, ep_size, etp_size, moe_grouped_gemm, config, rank, size ): @@ -815,9 +894,6 @@ def test_expert_parallel_sync(config, ep_size, etp_size, moe_grouped_gemm): if size < ep_size * etp_size: pytest.skip(f"Requires at least {ep_size * etp_size} GPUs for expert model parallel test") - if moe_grouped_gemm: - pytest.skip("TEGroupedMLP is not enabled in Megatron-LM currently") - spawn_multiprocess_job( size=size, job=partial( From f74cd24eac35c2a1dea17f58fef386297c93717d Mon Sep 17 00:00:00 2001 From: Jennifer Chen Date: Wed, 18 Feb 2026 20:16:35 +0000 Subject: [PATCH 2/8] update configs with shared_moe_weight_scale Signed-off-by: Jennifer Chen --- modelopt/torch/quantization/config.py | 14 ++++++++++---- .../torch/quantization/plugins/test_megatron.py | 5 +---- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py index 291acba03..171f6181f 100644 --- a/modelopt/torch/quantization/config.py +++ b/modelopt/torch/quantization/config.py @@ -214,7 +214,7 @@ **_default_disabled_quantizer_cfg, **_mamba_moe_disabled_quantizer_cfg, }, - "algorithm": "max", + "algorithm": {"method": "max", "shared_moe_weight_scale": False}, } MAMBA_MOE_FP8_CONSERVATIVE_CFG = { @@ -226,7 +226,7 @@ "*mixer.in_proj*": {"enable": False}, # Skip mamba linear "*mixer.out_proj*": {"enable": False}, # Skip mamba linear }, - "algorithm": "max", + "algorithm": {"method": "max", "shared_moe_weight_scale": False}, } FP8_PER_CHANNEL_PER_TOKEN_CFG = { @@ -437,7 +437,7 @@ **_default_disabled_quantizer_cfg, **_mamba_moe_disabled_quantizer_cfg, }, - "algorithm": "max", + "algorithm": {"method": "max", "shared_moe_weight_scale": False}, } MAMBA_MOE_NVFP4_CONSERVATIVE_CFG = { "quant_cfg": { @@ -458,7 +458,7 @@ "*mixer.in_proj*": {"enable": False}, # Skip mamba linear "*mixer.out_proj*": {"enable": False}, # Skip mamba linear }, - "algorithm": "max", + "algorithm": {"method": "max", "shared_moe_weight_scale": False}, } @@ -1087,6 +1087,12 @@ class MaxCalibConfig(QuantizeAlgorithmConfig): description="If True, the amax will be synced across the distributed processes.", ) + shared_moe_weight_scale: bool | None = ModeloptField( + default=True, + title="Whether to share the weight scale across local experts.", + description="If True, the weight scale will be shared across local experts.", + ) + class MseCalibConfig(QuantizeAlgorithmConfig): """Configuration for per-tensor MSE calibration. 
diff --git a/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py b/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py index cbe736dc6..49cbfba48 100644 --- a/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py +++ b/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py @@ -473,10 +473,7 @@ def test_homogeneous_sharded_state_dict(tmp_path, config, compress, meta_device, @pytest.mark.parametrize( "config", - [ - NVFP4_GEMM_KV_CFG, - FP8_GEMM_KV_CFG, - ], + [NVFP4_GEMM_KV_CFG, FP8_GEMM_KV_CFG, mtq.MAMBA_MOE_NVFP4_CONSERVATIVE_CFG], ) def test_homogeneous_sharded_state_dict_hybrid(tmp_path, config): """Test sharded state dict for hybrid Mamba MOE models.""" From 002e6c6f134cf6d99b60913f698a6c1459161de1 Mon Sep 17 00:00:00 2001 From: Jennifer Chen Date: Wed, 18 Feb 2026 20:32:47 +0000 Subject: [PATCH 3/8] check when weight quantizers not same Signed-off-by: Jennifer Chen --- .../quantization/plugins/test_megatron.py | 36 +++++++++++-------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py b/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py index 49cbfba48..8ee0d40a6 100644 --- a/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py +++ b/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py @@ -794,21 +794,27 @@ def _test_layer_sync_moe_local_experts_amax( else: assert torch.allclose(fc2_amax, expert.linear_fc2.input_quantizer.amax) - if shared_moe_weight_scale: - for layer in model.decoder.layers: - fc1_amax = None - fc2_amax = None - for expert in layer.mlp.experts.local_experts: - assert expert.linear_fc1.weight_quantizer.amax is not None - assert expert.linear_fc2.weight_quantizer.amax is not None - if fc1_amax is None: - fc1_amax = expert.linear_fc1.weight_quantizer.amax - else: - assert torch.allclose(fc1_amax, expert.linear_fc1.weight_quantizer.amax) - if fc2_amax is None: - fc2_amax = expert.linear_fc2.weight_quantizer.amax - else: - assert torch.allclose(fc2_amax, expert.linear_fc2.weight_quantizer.amax) + for layer in model.decoder.layers: + fc1_amax = None + fc2_amax = None + for expert in layer.mlp.experts.local_experts: + assert expert.linear_fc1.weight_quantizer.amax is not None + assert expert.linear_fc2.weight_quantizer.amax is not None + if fc1_amax is None: + fc1_amax = expert.linear_fc1.weight_quantizer.amax + elif shared_moe_weight_scale: + assert torch.allclose(fc1_amax, expert.linear_fc1.weight_quantizer.amax) + else: + assert not torch.allclose(fc1_amax, expert.linear_fc1.weight_quantizer.amax) + fc1_amax = expert.linear_fc1.weight_quantizer.amax # update most recent amax + + if fc2_amax is None: + fc2_amax = expert.linear_fc2.weight_quantizer.amax + elif shared_moe_weight_scale: + assert torch.allclose(fc2_amax, expert.linear_fc2.weight_quantizer.amax) + else: + assert not torch.allclose(fc2_amax, expert.linear_fc2.weight_quantizer.amax) + fc2_amax = expert.linear_fc2.weight_quantizer.amax def _test_expert_model_parallel_amax_sync( From 023b0a310a62069a0effd6363565b8d83a6e8f3b Mon Sep 17 00:00:00 2001 From: Jennifer Chen Date: Wed, 18 Feb 2026 21:22:28 +0000 Subject: [PATCH 4/8] fix test Signed-off-by: Jennifer Chen --- .../torch/quantization/plugins/test_megatron.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py b/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py index 8ee0d40a6..bad16a08f 100644 --- 
a/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py +++ b/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py @@ -773,9 +773,13 @@ def _test_layer_sync_moe_local_experts_amax( num_moe_experts=8, transformer_impl="modelopt", ) - model = mtq.quantize(model, mtq.FP8_DEFAULT_CFG, get_forward(model)) + quant_cfg = mtq.FP8_DEFAULT_CFG + if not shared_moe_weight_scale: + quant_cfg = copy.deepcopy(quant_cfg) + quant_cfg["algorithm"] = {"method": "max", "shared_moe_weight_scale": False} + model = mtq.quantize(model, quant_cfg, get_forward(model)) - # Sync amax across local experts in each layer + # does layer_sync_moe_local_experts_amax happens in mtq.quantize if EP=1? for layer in model.decoder.layers: layer.mlp.experts.layer_sync_moe_local_experts_amax(shared_moe_weight_scale) @@ -812,9 +816,7 @@ def _test_layer_sync_moe_local_experts_amax( fc2_amax = expert.linear_fc2.weight_quantizer.amax elif shared_moe_weight_scale: assert torch.allclose(fc2_amax, expert.linear_fc2.weight_quantizer.amax) - else: - assert not torch.allclose(fc2_amax, expert.linear_fc2.weight_quantizer.amax) - fc2_amax = expert.linear_fc2.weight_quantizer.amax + # FC2 amaxes are the same since the input to the layer is all the same def _test_expert_model_parallel_amax_sync( From 2a33e193936cb16d3335f882b9c379cffd319f0a Mon Sep 17 00:00:00 2001 From: Jennifer Chen Date: Thu, 19 Feb 2026 21:37:38 +0000 Subject: [PATCH 5/8] remove shared_moe flag Signed-off-by: Jennifer Chen --- modelopt/torch/quantization/config.py | 4 --- .../torch/quantization/plugins/megatron.py | 23 +++++++---------- .../quantization/plugins/test_megatron.py | 25 ++++++------------- 3 files changed, 16 insertions(+), 36 deletions(-) diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py index 171f6181f..74547a68d 100644 --- a/modelopt/torch/quantization/config.py +++ b/modelopt/torch/quantization/config.py @@ -214,7 +214,6 @@ **_default_disabled_quantizer_cfg, **_mamba_moe_disabled_quantizer_cfg, }, - "algorithm": {"method": "max", "shared_moe_weight_scale": False}, } MAMBA_MOE_FP8_CONSERVATIVE_CFG = { @@ -226,7 +225,6 @@ "*mixer.in_proj*": {"enable": False}, # Skip mamba linear "*mixer.out_proj*": {"enable": False}, # Skip mamba linear }, - "algorithm": {"method": "max", "shared_moe_weight_scale": False}, } FP8_PER_CHANNEL_PER_TOKEN_CFG = { @@ -437,7 +435,6 @@ **_default_disabled_quantizer_cfg, **_mamba_moe_disabled_quantizer_cfg, }, - "algorithm": {"method": "max", "shared_moe_weight_scale": False}, } MAMBA_MOE_NVFP4_CONSERVATIVE_CFG = { "quant_cfg": { @@ -458,7 +455,6 @@ "*mixer.in_proj*": {"enable": False}, # Skip mamba linear "*mixer.out_proj*": {"enable": False}, # Skip mamba linear }, - "algorithm": {"method": "max", "shared_moe_weight_scale": False}, } diff --git a/modelopt/torch/quantization/plugins/megatron.py b/modelopt/torch/quantization/plugins/megatron.py index adfb94edd..69ada80e4 100644 --- a/modelopt/torch/quantization/plugins/megatron.py +++ b/modelopt/torch/quantization/plugins/megatron.py @@ -574,8 +574,8 @@ def _setup(self): expert.linear_fc1.parallel_state = self.parallel_state expert.linear_fc2.parallel_state = self.parallel_state - def layer_sync_moe_local_experts_amax(self, shared_moe_weight_scale=True): - """Sync input quantizer amax across local experts in a SequentialMLP, and optionally weight scale. + def layer_sync_moe_local_experts_amax(self): + """Sync input quantizer amax across local experts in a SequentialMLP. 
Ensures all experts have the same input quantizer amax.This function operates on a single rank and does not require distributed sync. @@ -589,24 +589,19 @@ def layer_sync_moe_local_experts_amax(self, shared_moe_weight_scale=True): We need to guarantee that all experts must have amax. Otherwise, there will be deadlock when synchronizing over EP since some ranks may have amax None and not calling the collective communication. - - Args: - shared_moe_weight_scale: Whether to share the weight scale across local experts. """ # Collect amax from all local experts amax_dict = {} for expert in self.local_experts: for name, module in expert.named_modules(): if isinstance(module, TensorQuantizer) and module.amax is not None: - if shared_moe_weight_scale or ("weight_quantizer" not in name): - # Sync both quantizers or only sync input quantizer - stored_amax = amax_dict.get(name) - amax_tensor = module.amax.detach().clone() - amax_dict[name] = ( - amax_tensor - if stored_amax is None - else torch.maximum(stored_amax, amax_tensor) - ) + stored_amax = amax_dict.get(name) + amax_tensor = module.amax.detach().clone() + amax_dict[name] = ( + amax_tensor + if stored_amax is None + else torch.maximum(stored_amax, amax_tensor) + ) # Apply synchronized amax values back to all local experts for expert in self.local_experts: diff --git a/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py b/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py index bad16a08f..dadd324d1 100644 --- a/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py +++ b/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py @@ -734,8 +734,7 @@ def test_te_grouped_vs_sequential_quantize(need_4_gpus): @pytest.mark.parametrize("ep_size", [1, 2]) @pytest.mark.parametrize("moe_grouped_gemm", [True, False]) -@pytest.mark.parametrize("shared_moe_weight_scale", [True, False]) -def test_layer_sync_moe_local_experts_amax(ep_size, moe_grouped_gemm, shared_moe_weight_scale): +def test_layer_sync_moe_local_experts_amax(ep_size, moe_grouped_gemm): """Test expert model parallel synchronization.""" size = torch.cuda.device_count() if size < ep_size: @@ -747,15 +746,12 @@ def test_layer_sync_moe_local_experts_amax(ep_size, moe_grouped_gemm, shared_moe _test_layer_sync_moe_local_experts_amax, ep_size, moe_grouped_gemm, - shared_moe_weight_scale, ), backend="nccl", ) -def _test_layer_sync_moe_local_experts_amax( - ep_size, moe_grouped_gemm, shared_moe_weight_scale, rank, size -): +def _test_layer_sync_moe_local_experts_amax(ep_size, moe_grouped_gemm, rank, size): initialize_for_megatron( tensor_model_parallel_size=1, pipeline_model_parallel_size=1, @@ -774,16 +770,14 @@ def _test_layer_sync_moe_local_experts_amax( transformer_impl="modelopt", ) quant_cfg = mtq.FP8_DEFAULT_CFG - if not shared_moe_weight_scale: - quant_cfg = copy.deepcopy(quant_cfg) - quant_cfg["algorithm"] = {"method": "max", "shared_moe_weight_scale": False} model = mtq.quantize(model, quant_cfg, get_forward(model)) # does layer_sync_moe_local_experts_amax happens in mtq.quantize if EP=1? 
for layer in model.decoder.layers: - layer.mlp.experts.layer_sync_moe_local_experts_amax(shared_moe_weight_scale) + layer.mlp.experts.layer_sync_moe_local_experts_amax() for layer in model.decoder.layers: + # Check input quantizer amax is synced across local experts fc1_amax = None fc2_amax = None for expert in layer.mlp.experts.local_experts: @@ -798,7 +792,7 @@ def _test_layer_sync_moe_local_experts_amax( else: assert torch.allclose(fc2_amax, expert.linear_fc2.input_quantizer.amax) - for layer in model.decoder.layers: + # Check weight quantizer amax is different across local experts fc1_amax = None fc2_amax = None for expert in layer.mlp.experts.local_experts: @@ -806,17 +800,12 @@ def _test_layer_sync_moe_local_experts_amax( assert expert.linear_fc2.weight_quantizer.amax is not None if fc1_amax is None: fc1_amax = expert.linear_fc1.weight_quantizer.amax - elif shared_moe_weight_scale: - assert torch.allclose(fc1_amax, expert.linear_fc1.weight_quantizer.amax) else: assert not torch.allclose(fc1_amax, expert.linear_fc1.weight_quantizer.amax) - fc1_amax = expert.linear_fc1.weight_quantizer.amax # update most recent amax - if fc2_amax is None: fc2_amax = expert.linear_fc2.weight_quantizer.amax - elif shared_moe_weight_scale: - assert torch.allclose(fc2_amax, expert.linear_fc2.weight_quantizer.amax) - # FC2 amaxes are the same since the input to the layer is all the same + else: + assert not torch.allclose(fc2_amax, expert.linear_fc2.weight_quantizer.amax) def _test_expert_model_parallel_amax_sync( From 5edd5ac9b3077f98338cfd686d24a97f11423b1c Mon Sep 17 00:00:00 2001 From: Jennifer Chen Date: Thu, 19 Feb 2026 21:48:56 +0000 Subject: [PATCH 6/8] fix bug Signed-off-by: Jennifer Chen --- modelopt/torch/quantization/config.py | 10 ++++------ modelopt/torch/quantization/model_calib.py | 4 +--- modelopt/torch/quantization/plugins/megatron.py | 6 +++++- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py index 74547a68d..291acba03 100644 --- a/modelopt/torch/quantization/config.py +++ b/modelopt/torch/quantization/config.py @@ -214,6 +214,7 @@ **_default_disabled_quantizer_cfg, **_mamba_moe_disabled_quantizer_cfg, }, + "algorithm": "max", } MAMBA_MOE_FP8_CONSERVATIVE_CFG = { @@ -225,6 +226,7 @@ "*mixer.in_proj*": {"enable": False}, # Skip mamba linear "*mixer.out_proj*": {"enable": False}, # Skip mamba linear }, + "algorithm": "max", } FP8_PER_CHANNEL_PER_TOKEN_CFG = { @@ -435,6 +437,7 @@ **_default_disabled_quantizer_cfg, **_mamba_moe_disabled_quantizer_cfg, }, + "algorithm": "max", } MAMBA_MOE_NVFP4_CONSERVATIVE_CFG = { "quant_cfg": { @@ -455,6 +458,7 @@ "*mixer.in_proj*": {"enable": False}, # Skip mamba linear "*mixer.out_proj*": {"enable": False}, # Skip mamba linear }, + "algorithm": "max", } @@ -1083,12 +1087,6 @@ class MaxCalibConfig(QuantizeAlgorithmConfig): description="If True, the amax will be synced across the distributed processes.", ) - shared_moe_weight_scale: bool | None = ModeloptField( - default=True, - title="Whether to share the weight scale across local experts.", - description="If True, the weight scale will be shared across local experts.", - ) - class MseCalibConfig(QuantizeAlgorithmConfig): """Configuration for per-tensor MSE calibration. 
diff --git a/modelopt/torch/quantization/model_calib.py b/modelopt/torch/quantization/model_calib.py index 081aa6f1c..a33e18f02 100644 --- a/modelopt/torch/quantization/model_calib.py +++ b/modelopt/torch/quantization/model_calib.py @@ -99,7 +99,6 @@ def max_calibrate( model: nn.Module, forward_loop: ForwardLoop | None = None, distributed_sync=True, - shared_moe_weight_scale=True, ): """Calibrate the model using max. @@ -108,7 +107,6 @@ def max_calibrate( forward_loop: A callable which takes the model as argument and forwards calibration data through the model. distributed_sync: Whether to sync amax across distributed processes. - shared_moe_weight_scale: Whether to share the weight scale across local experts. See :class:`MaxCalibConfig ` for details on the remaining arguments. @@ -123,7 +121,7 @@ def max_calibrate( # Sync amax across local experts within each rank (for SequentialMLP) for name, module in model.named_modules(): if hasattr(module, "layer_sync_moe_local_experts_amax"): - module.layer_sync_moe_local_experts_amax(shared_moe_weight_scale) + module.layer_sync_moe_local_experts_amax() if not distributed_sync: return diff --git a/modelopt/torch/quantization/plugins/megatron.py b/modelopt/torch/quantization/plugins/megatron.py index 69ada80e4..e84735ae9 100644 --- a/modelopt/torch/quantization/plugins/megatron.py +++ b/modelopt/torch/quantization/plugins/megatron.py @@ -594,7 +594,11 @@ def layer_sync_moe_local_experts_amax(self): amax_dict = {} for expert in self.local_experts: for name, module in expert.named_modules(): - if isinstance(module, TensorQuantizer) and module.amax is not None: + if ( + isinstance(module, TensorQuantizer) + and module.amax is not None + and "input_quantizer" in name + ): stored_amax = amax_dict.get(name) amax_tensor = module.amax.detach().clone() amax_dict[name] = ( From aaf7c81cd04c5057a5e0f035f98547722cb5f2e9 Mon Sep 17 00:00:00 2001 From: Jennifer Chen Date: Thu, 19 Feb 2026 22:12:29 +0000 Subject: [PATCH 7/8] remove comment Signed-off-by: Jennifer Chen --- tests/gpu_megatron/torch/quantization/plugins/test_megatron.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py b/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py index dadd324d1..e1866cba3 100644 --- a/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py +++ b/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py @@ -772,7 +772,6 @@ def _test_layer_sync_moe_local_experts_amax(ep_size, moe_grouped_gemm, rank, siz quant_cfg = mtq.FP8_DEFAULT_CFG model = mtq.quantize(model, quant_cfg, get_forward(model)) - # does layer_sync_moe_local_experts_amax happens in mtq.quantize if EP=1? for layer in model.decoder.layers: layer.mlp.experts.layer_sync_moe_local_experts_amax() From 41f38d6cc45cde02150c23b25c354af1f03eefc8 Mon Sep 17 00:00:00 2001 From: Jennifer Chen Date: Fri, 20 Feb 2026 16:50:45 +0000 Subject: [PATCH 8/8] update docstring Signed-off-by: Jennifer Chen --- modelopt/torch/quantization/model_calib.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modelopt/torch/quantization/model_calib.py b/modelopt/torch/quantization/model_calib.py index a33e18f02..dcd9007a2 100644 --- a/modelopt/torch/quantization/model_calib.py +++ b/modelopt/torch/quantization/model_calib.py @@ -106,7 +106,7 @@ def max_calibrate( model: Model to be calibrated. forward_loop: A callable which takes the model as argument and forwards calibration data through the model. 
- distributed_sync: Whether to sync amax across distributed processes. + distributed_sync: Whether to sync input_quantizer amax across distributed processes. See :class:`MaxCalibConfig ` for details on the remaining arguments. @@ -118,7 +118,7 @@ def max_calibrate( forward_loop(model) finish_stats_collection(model) - # Sync amax across local experts within each rank (for SequentialMLP) + # Sync input_quantizer amax across local experts within each rank (for SequentialMLP) for name, module in model.named_modules(): if hasattr(module, "layer_sync_moe_local_experts_amax"): module.layer_sync_moe_local_experts_amax()
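The end state of this series: every local expert in a SequentialMLP layer shares one input-quantizer amax (the element-wise max over the layer's local experts), weight-quantizer amax stays per-expert, and the per-rank sync runs inside max_calibrate() before the distributed EP/ETP sync so no rank enters the collective with a missing amax. Below is a minimal standalone sketch of that max-reduction idea; the toy classes and the helper name sync_local_expert_input_amax are illustrative stand-ins, not the ModelOpt API.

# Illustrative sketch only (not the ModelOpt API): the same max-reduction used by
# layer_sync_moe_local_experts_amax, applied to toy "experts" that each carry an
# input-quantizer amax. Weight-quantizer amax is intentionally left per-expert.
import torch
import torch.nn as nn


class ToyQuantizer(nn.Module):
    def __init__(self, amax: float):
        super().__init__()
        self.register_buffer("amax", torch.tensor(amax))


class ToyExpert(nn.Module):
    def __init__(self, in_amax: float, w_amax: float):
        super().__init__()
        self.input_quantizer = ToyQuantizer(in_amax)
        self.weight_quantizer = ToyQuantizer(w_amax)


def sync_local_expert_input_amax(experts: list[ToyExpert]) -> None:
    """Give every expert the element-wise max of all input-quantizer amax values."""
    # Reduce: running max across experts (mirrors the amax_dict collection pass).
    synced = None
    for expert in experts:
        amax = expert.input_quantizer.amax.detach().clone()
        synced = amax if synced is None else torch.maximum(synced, amax)
    # Broadcast back: every expert ends up with the same, non-None input amax,
    # which is what keeps ranks consistent before the distributed EP sync.
    for expert in experts:
        expert.input_quantizer.amax.copy_(synced)


experts = [ToyExpert(1.0, 0.5), ToyExpert(3.0, 0.7), ToyExpert(2.0, 0.9)]
sync_local_expert_input_amax(experts)
assert all(e.input_quantizer.amax.item() == 3.0 for e in experts)   # shared input scale
assert len({e.weight_quantizer.amax.item() for e in experts}) == 3  # per-expert weight scales

Because the sync is unconditional in max_calibrate(), callers quantizing with plain "max"-calibrated configs such as MAMBA_MOE_NVFP4_CONSERVATIVE_CFG get this behavior without any extra flag.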