Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ Changelog
- Add ``--cast_mxfp4_to_nvfp4`` flag to ``examples/llm_ptq/hf_ptq.py`` for closed-form, bit-exact MXFP4 → NVFP4 weight conversion. Supports the GPT-OSS family (``openai/gpt-oss-20b``, ``openai/gpt-oss-120b``). See `examples/llm_ptq/README.md <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/llm_ptq#mxfp4--nvfp4-cast-for-gpt-oss>`__ for usage.
- DeepSeek PTQ (``examples/deepseek/ptq.py``) now defaults to native top-k calibration with post-hoc per-layer peer-max sync of expert ``input_quantizer.amax``; the all-experts path is preserved behind ``--calib_all_experts``.

**Bug Fixes**

- Enforce NVFP4 block_size in {16, 32} during QuantizerAttributeConfig validation. Illegal block sizes (e.g. 64, 128) previously passed silently, corrupting scale tensors at Blackwell MMA export time after wasting GPU hours on calibration. The check now fires at Pydantic config construction time.

0.44 (2026-05-18)
^^^^^^^^^^^^^^^^^

Expand Down
7 changes: 7 additions & 0 deletions modelopt/torch/quantization/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -455,6 +455,13 @@ def validate_block_sizes(cls, v, info: ValidationInfo):
assert _k in ["type", "scale_bits", "scale_block_sizes"]
else:
assert isinstance(_k, int) and (_v is None or isinstance(_v, int))
# NVFP4 (Blackwell MMA): block_size must be 16 or 32
if info.data.get("num_bits") == (2, 1) and v.get("scale_bits") == (4, 3):
for _k, _v in v.items():
if isinstance(_k, int) and _v is not None:
assert _v in (16, 32), (
f"NVFP4 block_size must be 16 or 32 (Blackwell MMA tile), got {_v}"
)
Comment thread
makroumi marked this conversation as resolved.
return v

@field_validator("bias")
Expand Down
85 changes: 85 additions & 0 deletions tests/unit/torch/quantization/test_config_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -525,3 +525,88 @@ def test_validate_quant_cfg_entries_accepts_valid_cfg(self):
algorithm="max",
)
assert len(cfg.quant_cfg) == 2


class TestNVFP4BlockSizeValidation:
    """NVFP4 block_size must be constrained to {16, 32} per Blackwell MMA tile geometry.

    Each test builds a minimal single-entry QuantizeConfig; the block-size
    constraint is enforced during QuantizerAttributeConfig validation, so an
    illegal size surfaces as a pydantic ValidationError at construction time.
    """

    @staticmethod
    def _build_cfg(num_bits, block_sizes, algorithm="max"):
        """Construct a one-quantizer QuantizeConfig for ``*weight_quantizer``.

        Args:
            num_bits: ``num_bits`` entry of the quantizer cfg (int or tuple).
            block_sizes: ``block_sizes`` dict passed through unchanged.
            algorithm: calibration algorithm; defaults to ``"max"`` as used
                throughout this test module.

        Returns:
            The validated QuantizeConfig instance.
        """
        return QuantizeConfig(
            quant_cfg=[
                {
                    "quantizer_name": "*weight_quantizer",
                    "cfg": {"num_bits": num_bits, "block_sizes": block_sizes},
                },
            ],
            algorithm=algorithm,
        )

    def test_nvfp4_block_16_accepted(self):
        """block_size=16 is the canonical NVFP4 tile — must pass."""
        cfg = self._build_cfg(
            (2, 1), {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}
        )
        assert cfg.quant_cfg[0]["cfg"]["block_sizes"][-1] == 16

    def test_nvfp4_block_32_accepted(self):
        """block_size=32 is the alternative Blackwell MMA tile — must pass."""
        cfg = self._build_cfg(
            (2, 1), {-1: 32, "type": "dynamic", "scale_bits": (4, 3)}
        )
        assert cfg.quant_cfg[0]["cfg"]["block_sizes"][-1] == 32

    @pytest.mark.parametrize("bad_size", [8, 64, 128, 4, 256])
    def test_nvfp4_illegal_block_size_rejected(self, bad_size):
        """block_size ∉ {16, 32} must fail at config construction time."""
        with pytest.raises(ValidationError, match="NVFP4 block_size must be 16 or 32"):
            self._build_cfg((2, 1), {-1: bad_size, "scale_bits": (4, 3)})

    def test_non_nvfp4_block_size_unaffected(self):
        """INT4 block_size=128 must still pass — constraint is NVFP4-only."""
        cfg = self._build_cfg(4, {-1: 128, "type": "static"})
        assert cfg.quant_cfg[0]["cfg"]["block_sizes"][-1] == 128

    def test_nvfp4_without_scale_bits_unaffected(self):
        """num_bits=(2,1) without scale_bits=(4,3) is not NVFP4 — no constraint."""
        # algorithm=None preserved from the original test: also exercises the
        # no-algorithm construction path alongside the block-size check.
        cfg = self._build_cfg(
            (2, 1),
            {-1: 32, "type": "dynamic", "scale_bits": (8, 0)},
            algorithm=None,
        )
        assert cfg.quant_cfg[0]["cfg"]["block_sizes"][-1] == 32