From e21a9baab1fb6e36c0e4c680d0c8dcbff83030f8 Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Wed, 13 May 2026 18:00:49 -0700 Subject: [PATCH 01/23] feat(opt): make load_config return validated schema instances load_config now returns a validated instance of the effective schema when one is known. The schema_type argument takes priority; otherwise the file's `# modelopt-schema:` comment is used. Without either, the raw resolved dict/list is returned unchanged. Imported snippets are still strictly required to declare modelopt-schema. Simplify modelopt.recipe.loader to consume the returned ModelOptPTQRecipe / QuantizeConfig instance directly, and update the recipe-loader tests for the new behavior (schema-comment returns an instance; missing `quantize` uses the schema default; unknown recipe_type surfaces via Pydantic's validation message; quant_cfg entries are normalized at load time). Signed-off-by: Shengliang Xu --- modelopt/recipe/loader.py | 35 ++++++++--------------- modelopt/torch/opt/config_loader.py | 44 ++++++++++++++++++++--------- tests/unit/recipe/test_loader.py | 36 +++++++++++++++-------- 3 files changed, 66 insertions(+), 49 deletions(-) diff --git a/modelopt/recipe/loader.py b/modelopt/recipe/loader.py index 9c3c40856d..8608e3fcbb 100644 --- a/modelopt/recipe/loader.py +++ b/modelopt/recipe/loader.py @@ -89,29 +89,15 @@ def _load_recipe_from_file(recipe_file: Path | Traversable) -> ModelOptRecipeBas The file must contain a ``metadata`` section with at least ``recipe_type``, plus a ``quant_cfg`` mapping and an optional ``algorithm`` for PTQ recipes. """ - data = load_config(recipe_file, schema_type=ModelOptPTQRecipe) - if not isinstance(data, dict): + recipe = load_config(recipe_file, schema_type=ModelOptPTQRecipe) + if not isinstance(recipe, ModelOptPTQRecipe): raise ValueError( - f"Recipe file {recipe_file} must be a YAML mapping, got {type(data).__name__}." + f"Recipe file {recipe_file} must produce a {ModelOptPTQRecipe.__name__}, " + f"got {type(recipe).__name__}." ) - - metadata = data.get("metadata", {}) - if not isinstance(metadata, dict): - raise ValueError( - f"Recipe file {recipe_file} field 'metadata' must be a mapping, " - f"got {type(metadata).__name__}." - ) - recipe_type = metadata.get("recipe_type") - if recipe_type is None: - raise ValueError(f"Recipe file {recipe_file} must contain a 'metadata.recipe_type' field.") - + recipe_type = recipe.recipe_type if recipe_type == RecipeType.PTQ: - if "quantize" not in data: - raise ValueError(f"PTQ recipe file {recipe_file} must contain 'quantize'.") - return ModelOptPTQRecipe( - metadata=metadata, - quantize=data["quantize"], - ) + return recipe raise ValueError(f"Unsupported recipe type: {recipe_type!r}") @@ -149,13 +135,14 @@ def _load_recipe_from_dir(recipe_dir: Path | Traversable) -> ModelOptRecipeBase: if recipe_type == RecipeType.PTQ: quantize_file = _find_recipe_section_file(recipe_dir, "quantize") - quantize_data = load_config(quantize_file, schema_type=QuantizeConfig) - if not isinstance(quantize_data, dict): + quantize_cfg = load_config(quantize_file, schema_type=QuantizeConfig) + if not isinstance(quantize_cfg, QuantizeConfig): raise ValueError( - f"{quantize_file} must be a YAML mapping, got {type(quantize_data).__name__}." + f"{quantize_file} must produce a {QuantizeConfig.__name__}, " + f"got {type(quantize_cfg).__name__}." 
) return ModelOptPTQRecipe( metadata=metadata, - quantize=quantize_data, + quantize=quantize_cfg, ) raise ValueError(f"Unsupported recipe type: {recipe_type!r}") diff --git a/modelopt/torch/opt/config_loader.py b/modelopt/torch/opt/config_loader.py index 43231c9099..5dbf0ad5bf 100644 --- a/modelopt/torch/opt/config_loader.py +++ b/modelopt/torch/opt/config_loader.py @@ -596,25 +596,43 @@ def load_config( config_path: str | Path | Traversable, *, schema_type: Any | None = None, -) -> dict[str, Any] | list[Any]: +) -> Any: """Load a YAML config and resolve all ``$import`` references. This is the primary config loading entry point. It loads the YAML file, - resolves any ``imports`` / ``$import`` directives, and returns the final - config dict or list. - - ``schema_type`` supplies a typing context for import resolution when the - file itself has no ``modelopt-schema`` comment. It is intentionally not a - request to validate the top-level file. Top-level files are validated only - when they declare ``modelopt-schema``; imported snippets are stricter and - must always declare ``modelopt-schema``. + resolves any ``imports`` / ``$import`` directives, and returns either a + validated instance of the schema (when one is known) or the raw resolved + payload. + + The effective schema is selected as follows: + + 1. If ``schema_type`` is provided, it is used. + 2. Otherwise, the schema declared by the file's ``# modelopt-schema:`` + comment (if any) is used. + + When an effective schema is selected, the resolved payload is validated + and returned as an instance of that schema — e.g., a Pydantic model + instance for ``BaseModel`` schemas, or a validated dict / list for + ``TypedDict`` / ``list[TypedDict]`` schemas. If neither source supplies a + schema, the raw resolved dict or list is returned unchanged. + + Imported snippets are stricter and must always declare ``modelopt-schema``; + they are validated during import resolution regardless of the top-level + selection above. 
""" raw = _load_raw_config_with_schema(config_path) data = raw.data declared_schema_type = _schema_type(raw.schema) if raw.schema else None - resolver_schema_type = declared_schema_type or schema_type + effective_schema_type = schema_type if schema_type is not None else declared_schema_type if isinstance(data, (_ListSnippet, dict)): - data = _resolve_imports(data, schema_type=resolver_schema_type) - _validate_modelopt_schema(raw.schema, data, raw.path, schema_type=declared_schema_type) - return data + data = _resolve_imports(data, schema_type=effective_schema_type) + if effective_schema_type is None: + return data + try: + return TypeAdapter(effective_schema_type).validate_python(data) + except Exception as exc: + raise ValueError( + f"Config file {raw.path} does not match modelopt-schema " + f"{_schema_label(effective_schema_type, raw.schema)!r}: {exc}" + ) from exc diff --git a/tests/unit/recipe/test_loader.py b/tests/unit/recipe/test_loader.py index 738dfc268c..bfbce21ed5 100644 --- a/tests/unit/recipe/test_loader.py +++ b/tests/unit/recipe/test_loader.py @@ -170,19 +170,22 @@ def test_load_recipe_missing_recipe_type_raises(tmp_path): load_recipe(bad) -def test_load_recipe_missing_quantize_raises(tmp_path): - """load_recipe raises ValueError when quantize is absent for a PTQ recipe.""" - bad = tmp_path / "bad.yml" - bad.write_text(CFG_RECIPE_MISSING_quantize) - with pytest.raises(ValueError, match="quantize"): - load_recipe(bad) +def test_load_recipe_missing_quantize_uses_default(tmp_path): + """``quantize`` is optional in a PTQ recipe; absence yields an empty default config.""" + from modelopt.torch.quantization.config import QuantizeConfig + + good = tmp_path / "good.yml" + good.write_text(CFG_RECIPE_MISSING_quantize) + recipe = load_recipe(good) + assert isinstance(recipe.quantize, QuantizeConfig) def test_load_recipe_unsupported_type_raises(tmp_path): """load_recipe raises ValueError for an unknown recipe_type.""" bad = tmp_path / "bad.yml" bad.write_text(CFG_RECIPE_UNSUPPORTED_TYPE) - with pytest.raises(ValueError, match="Unsupported recipe type"): + # Schema-driven validation reports the failure via the TypedDict's enum check. + with pytest.raises(ValueError, match="recipe_type"): load_recipe(bad) @@ -916,8 +919,13 @@ def test_import_mixed_tree(tmp_path): data = load_config(config_file) # Dict import inside list entry assert data["quant_cfg"][0]["cfg"] == {"num_bits": (4, 3)} - # List splice - assert data["quant_cfg"][1] == {"quantizer_name": "*lm_head*", "enable": False} + # List splice — entries are normalized by QuantizeConfig.quant_cfg's validator, + # which fills in defaults for missing ``enable`` / ``cfg`` keys. 
+ assert data["quant_cfg"][1] == { + "quantizer_name": "*lm_head*", + "enable": False, + "cfg": None, + } # --------------------------------------------------------------------------- @@ -1089,8 +1097,10 @@ def test_builtin_config_snippets_with_modelopt_schema(config_path): assert data -def test_modelopt_schema_comment_validates_without_changing_payload(tmp_path): - """modelopt-schema validates the resolved payload but load_config still returns a plain dict.""" +def test_modelopt_schema_comment_returns_instance(tmp_path): + """A ``modelopt-schema`` comment makes load_config return an instance of that schema.""" + from modelopt.torch.quantization.config import QuantizerAttributeConfig + config_file = tmp_path / "fp8.yaml" config_file.write_text( "# modelopt-schema: modelopt.torch.quantization.config.QuantizerAttributeConfig\n" @@ -1098,7 +1108,9 @@ def test_modelopt_schema_comment_validates_without_changing_payload(tmp_path): "axis:\n" ) data = load_config(config_file) - assert data == {"num_bits": (4, 3), "axis": None} + assert isinstance(data, QuantizerAttributeConfig) + assert data.num_bits == (4, 3) + assert data.axis is None def test_modelopt_schema_comment_validation_error(tmp_path): From 198a305b45dabc7d8202e7817004f8d77caade36 Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Thu, 14 May 2026 10:46:03 -0700 Subject: [PATCH 02/23] feat(quant): make QuantizerCfgEntry a ModeloptBaseConfig pydantic type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert QuantizerCfgEntry from a TypedDict to a ModeloptBaseConfig subclass so entries are validated at construction (cfg/enable shape rules now run via a model_validator on the schema itself) and load_config returns proper pydantic instances for snippets and lists schematized against QuantizerCfgEntry / QuantizerCfgListConfig. normalize_quant_cfg_list now passes already-validated QuantizerCfgEntry instances through unchanged, so the constants loaded from YAML (e.g. _base_disable_all, _default_disabled_quantizer_cfg) can be spread into preset configs and re-validated by QuantizeConfig without round-tripping through dicts. Consumers needed no functional changes because ModeloptBaseConfig already implements __getitem__, get, __contains__, items, keys, and update — covering every dict-shaped access site in modelopt/, examples/, and tests/. Type plumbing: - Introduce RawQuantizeQuantCfgType (a covariant Sequence) for input positions that get normalized — keeps set_quantizer_by_cfg and set_quantizer_by_cfg_context callable with both raw dict literals and pre-validated entries without invariance errors. - algorithms.AutoQuantizeSearcher now constructs a QuantizerCfgEntry instance instead of appending a bare dict to QuantizeConfig.quant_cfg. Tests: - test_config_validation: relaxed two match=non-empty dict assertions where pydantic's field-type check now fires before the model validator. - test_loader: dict-equality assertions on schema-loaded entries now use model_dump(); two YAML fixtures with bare quantizer_name entries (now rejected by the model validator at load time) had enable: false added. 
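A sketch of the construction-time behavior this buys (illustrative only,
not an exhaustive contract of the new entry type):

    from modelopt.torch.quantization.config import QuantizerCfgEntry

    entry = QuantizerCfgEntry(quantizer_name="*lm_head*", enable=False)
    entry["enable"]                        # dict-shaped access still works: False
    QuantizerCfgEntry(quantizer_name="*")  # raises: entry carries no instruction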
Signed-off-by: Shengliang Xu --- .../llm_export_utils/quantization_utils.py | 4 +- modelopt/torch/quantization/algorithms.py | 6 +- modelopt/torch/quantization/config.py | 169 ++++++++++-------- modelopt/torch/quantization/conversion.py | 6 +- tests/unit/recipe/test_loader.py | 39 +++- .../quantization/test_config_validation.py | 18 +- 6 files changed, 145 insertions(+), 97 deletions(-) diff --git a/modelopt/onnx/llm_export_utils/quantization_utils.py b/modelopt/onnx/llm_export_utils/quantization_utils.py index 54ca93d538..d6c8c4c1e9 100644 --- a/modelopt/onnx/llm_export_utils/quantization_utils.py +++ b/modelopt/onnx/llm_export_utils/quantization_utils.py @@ -69,9 +69,7 @@ def get_quant_config(precision, lm_head_precision="fp16"): else: raise ValueError(f"Unsupported precision: {precision}") - quant_cfg_list: list = [ - e for e in quant_cfg["quant_cfg"] if isinstance(e, dict) and "quantizer_name" in e - ] + quant_cfg_list: list = [e for e in quant_cfg["quant_cfg"] if "quantizer_name" in e] if lm_head_precision == "fp8": quant_cfg_list.append( diff --git a/modelopt/torch/quantization/algorithms.py b/modelopt/torch/quantization/algorithms.py index 992717983d..e4e633e36a 100644 --- a/modelopt/torch/quantization/algorithms.py +++ b/modelopt/torch/quantization/algorithms.py @@ -40,7 +40,7 @@ from . import config as mtq_config from . import model_calib -from .config import QuantizeConfig, QuantizerAttributeConfig +from .config import QuantizeConfig, QuantizerAttributeConfig, QuantizerCfgEntry from .conversion import set_quantizer_by_cfg from .nn import QuantLinearConvBase, QuantModule, SequentialQuantizer, TensorQuantizer from .utils import is_quantized_linear @@ -129,7 +129,9 @@ def __init__(self, quant_cfg: str | dict[str, Any] | None = None, name: str | No # Disable KV Cache quantization # Currently KV Cache quantization is enabled for some quantization formats and disabled for others # This breaks the monotonicity of the quantization formats in terms of weight compression Vs accuracy - self.config.quant_cfg.append({"quantizer_name": "*output_quantizer", "enable": False}) + self.config.quant_cfg.append( + QuantizerCfgEntry(quantizer_name="*output_quantizer", enable=False) + ) self.compression = estimate_quant_compression(self.config) diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py index dfed54cc99..6e0a54dd4b 100644 --- a/modelopt/torch/quantization/config.py +++ b/modelopt/torch/quantization/config.py @@ -152,23 +152,70 @@ import copy import warnings -from typing import Any, Literal, cast +from collections.abc import Sequence +from typing import Any, Literal from pydantic import AliasChoices, ValidationInfo, field_validator, model_validator -from typing_extensions import Required, TypedDict from modelopt.torch.opt.config import ModeloptBaseConfig, ModeloptField from modelopt.torch.opt.config_loader import load_config from modelopt.torch.utils.network import ConstructorLike -class QuantizerCfgEntry(TypedDict, total=False): +class QuantizerCfgEntry(ModeloptBaseConfig): """A single entry in a ``quant_cfg`` list.""" - quantizer_name: Required[str] # matched against quantizer module names - parent_class: str | None # optional; filters by pytorch module class name (e.g. 
"nn.Linear") - cfg: dict[str, Any] | list[dict[str, Any]] | None # quantizer attribute config(s) - enable: bool | None # toggles matched quantizers on/off; independent of cfg + quantizer_name: str = ModeloptField( + default=..., + title="Quantizer name pattern.", + description="Glob pattern matched against quantizer module names.", + ) + parent_class: str | None = ModeloptField( + default=None, + title="Optional parent-class filter.", + description="If provided, only quantizers whose parent module matches this PyTorch class " + "name (e.g. ``'nn.Linear'``) are affected.", + ) + cfg: dict[str, Any] | list[dict[str, Any]] | None = ModeloptField( + default=None, + title="Quantizer attribute config.", + description="A ``QuantizerAttributeConfig``-shaped dict, or a list of such dicts for " + "sequential quantizers. ``None`` leaves the existing attribute config untouched.", + ) + enable: bool = ModeloptField( + default=True, + title="Enable the quantizer.", + description="Toggle matched quantizers on/off; independent of ``cfg``.", + ) + + @model_validator(mode="after") + def _validate_instruction(self): + """Reject entries that carry no instruction beyond the path selector.""" + fields_set = self.model_fields_set + if "cfg" not in fields_set and "enable" not in fields_set: + raise ValueError( + f"QuantizerCfgEntry must specify 'cfg', 'enable', or both. An entry with only " + f"'quantizer_name'={self.quantizer_name!r} has no effect (implicit enable=True " + "is not allowed; set it explicitly)." + ) + + if self.enable and self.cfg is not None: + if isinstance(self.cfg, dict): + is_invalid = len(self.cfg) == 0 + elif isinstance(self.cfg, list): + is_invalid = len(self.cfg) == 0 or any( + not isinstance(item, dict) or len(item) == 0 for item in self.cfg + ) + else: + is_invalid = True + if is_invalid: + raise ValueError( + f"QuantizerCfgEntry 'cfg' must be a non-empty dict or a non-empty list of " + f"non-empty dicts when enabling quantizer {self.quantizer_name!r}, got " + f"{type(self.cfg).__name__}: {self.cfg!r}. Either provide quantizer " + "attributes in 'cfg' or remove 'cfg' and set 'enable' explicitly." + ) + return self def find_quant_cfg_entry_by_path( @@ -197,7 +244,7 @@ def find_quant_cfg_entry_by_path( """ result = None for entry in quant_cfg_list: - if isinstance(entry, dict) and entry.get("quantizer_name") == quantizer_name: + if entry.get("quantizer_name") == quantizer_name: result = entry if result is None: raise KeyError(f"No quant_cfg entry with quantizer_name={quantizer_name!r}") @@ -930,13 +977,23 @@ class GPTQCalibConfig(QuantizeAlgorithmConfig): QuantizeQuantCfgType = list[QuantizerCfgEntry] QuantizerCfgListConfig = QuantizeQuantCfgType +# Pre-normalization input shape: a sequence whose entries can be raw dicts (any of the +# legacy / new dict forms) or already-validated QuantizerCfgEntry instances. +# ``Sequence`` (rather than ``list``) keeps the alias covariant so callers can pass +# ``list[QuantizerCfgEntry]`` without an invariance error. +# ``normalize_quant_cfg_list`` additionally accepts a single legacy flat ``dict`` for the +# whole list, but that path is deprecated and not surfaced in this alias. +RawQuantizeQuantCfgType = Sequence[QuantizerCfgEntry | dict[str, Any]] + _QuantizeAlgoCfgType = str | dict | QuantizeAlgorithmConfig | None QuantizeAlgoCfgType = _QuantizeAlgoCfgType | list[_QuantizeAlgoCfgType] | None -def normalize_quant_cfg_list(v: dict | list) -> list[QuantizerCfgEntry]: - """Normalize a raw quant_cfg into a list of :class:`QuantizerCfgEntry` dicts. 
+def normalize_quant_cfg_list( + v: RawQuantizeQuantCfgType | dict[str, Any], +) -> list[QuantizerCfgEntry]: + """Normalize a raw quant_cfg into a list of :class:`QuantizerCfgEntry` instances. Supports the following input forms: @@ -951,35 +1008,19 @@ def normalize_quant_cfg_list(v: dict | list) -> list[QuantizerCfgEntry]: - Legacy ``nn.*``-scoped format: ``{"nn.": {"": }}`` — converted to a new-format entry with ``parent_class`` set. - **Validation** — an entry is rejected if it carries no instruction, i.e. it specifies neither - ``cfg`` nor ``enable``. Concretely, the following are invalid: - - - An empty entry ``{}``. - - An entry with only ``quantizer_name`` and no other keys — the only effect would be an - implicit ``enable=True``, which must be stated explicitly. - - An entry with ``enable=True`` (explicit or implicit) whose ``cfg`` is not a non-empty - ``dict`` or ``list`` — e.g. ``{"quantizer_name": "*", "cfg": {}}`` or - ``{"quantizer_name": "*", "cfg": 42}``. An enabled quantizer must have a valid - configuration. - - **Normalization** — after conversion and validation every entry is put into canonical form: - - - ``enable`` is set to ``True`` if not explicitly specified. - - ``cfg`` is set to ``None`` if not present in the entry. - - Every returned entry is therefore guaranteed to have the keys ``quantizer_name``, ``enable``, - and ``cfg`` (plus optionally ``parent_class``). + Each normalized dict is then constructed into a :class:`QuantizerCfgEntry`, whose own + validator enforces that every entry specifies ``cfg``, ``enable``, or both, and that any + ``cfg`` for an enabled quantizer is a non-empty dict or non-empty list of non-empty dicts. Args: v: A list of raw quant_cfg entries in any supported format, or a legacy flat dict. Returns: - A list of :class:`QuantizerCfgEntry` dicts in canonical normalized form. + A list of validated :class:`QuantizerCfgEntry` instances. Raises: - ValueError: If any entry has only ``quantizer_name`` with neither ``cfg`` nor ``enable``, - if ``enable=True`` with an empty or non-dict/list ``cfg``, or if the entry format - is not recognized. + ValueError: If any entry's shape is not recognized, or if it fails + :class:`QuantizerCfgEntry` validation (missing instruction or invalid ``cfg``). """ def _warn_legacy(): @@ -997,8 +1038,8 @@ def _warn_legacy(): _warn_legacy() v = [{k: val} for k, val in v.items()] - def _dict_to_entry(key: str, value) -> list[QuantizerCfgEntry]: - """Convert a single legacy key-value pair to one or more QuantizerCfgEntry dicts.""" + def _dict_to_entry(key: str, value) -> list[dict[str, Any]]: + """Convert a single legacy key-value pair to one or more entry dicts.""" # Legacy "default" key was a catch-all applied as "*" in the old conversion code. if key == "default": key = "*" @@ -1007,12 +1048,12 @@ def _dict_to_entry(key: str, value) -> list[QuantizerCfgEntry]: if not isinstance(value, dict): raise ValueError(f"For 'nn.*' scoped format, value must be a dict, got {value!r}") # Support multi-key nn.*-scoped dicts by emitting one entry per sub-key. 
- entries: list[QuantizerCfgEntry] = [] + entries: list[dict[str, Any]] = [] for q_path, sub_cfg in value.items(): sub_cfg = dict(sub_cfg) enable = sub_cfg.pop("enable", None) cfg = sub_cfg or None - entry: QuantizerCfgEntry = { + entry: dict[str, Any] = { "parent_class": key, "quantizer_name": q_path, "cfg": cfg, @@ -1036,8 +1077,14 @@ def _dict_to_entry(key: str, value) -> list[QuantizerCfgEntry]: result: list[QuantizerCfgEntry] = [] _warned_legacy = False for raw in v: + # Already-validated QuantizerCfgEntry instances (e.g. produced by load_config on a + # snippet schematized with `# modelopt-schema: QuantizerCfgEntry`, then spread into + # a quant_cfg list) are passed through unchanged. + if isinstance(raw, QuantizerCfgEntry): + result.append(raw) + continue if isinstance(raw, dict) and "quantizer_name" in raw: - entries = [dict(raw)] # copy to avoid mutating caller's data + entries: list[dict[str, Any]] = [dict(raw)] # copy to avoid mutating caller's data elif isinstance(raw, dict) and len(raw) == 1: key, val = next(iter(raw.items())) entries = [dict(e) for e in _dict_to_entry(key, val)] @@ -1055,42 +1102,10 @@ def _dict_to_entry(key: str, value) -> list[QuantizerCfgEntry]: else: raise ValueError(f"Invalid quant_cfg entry: {raw!r}.") - for entry in entries: - # Validate: must carry at least one instruction beyond the path selector. - if "cfg" not in entry and "enable" not in entry: - raise ValueError( - f"Invalid quant_cfg entry: {raw!r} — each entry must specify 'cfg', 'enable', " - "or both. An entry with only 'quantizer_name' has no effect (implicit " - "enable=True is not allowed; set it explicitly)." - ) - - # Validate: when cfg is present and enable=True, cfg must be a non-empty - # dict or list. An empty cfg would attempt to create a - # QuantizerAttributeConfig with no actual configuration. - cfg = entry.get("cfg") - enable = entry.get("enable", True) - if enable and cfg is not None: - if isinstance(cfg, dict): - is_invalid = len(cfg) == 0 - elif isinstance(cfg, list): - is_invalid = len(cfg) == 0 or any( - not isinstance(item, dict) or len(item) == 0 for item in cfg - ) - else: - is_invalid = True - if is_invalid: - raise ValueError( - f"Invalid quant_cfg entry: {raw!r} — 'cfg' must be a non-empty dict " - f"or a non-empty list of non-empty dicts when enabling a quantizer " - f"(got {type(cfg).__name__}: {cfg!r}). Either provide quantizer " - "attributes in 'cfg' or remove 'cfg' and set 'enable' explicitly." - ) - - # Normalize: make enable and cfg always explicit. - entry.setdefault("enable", True) - entry.setdefault("cfg", None) - - result.append(cast("QuantizerCfgEntry", entry)) + # Constructing each QuantizerCfgEntry runs its model_validator, which enforces the + # at-least-one-of('cfg', 'enable') and cfg-shape constraints. Defaults for absent + # 'cfg' / 'enable' are filled by the pydantic field defaults. 
+ result.extend(QuantizerCfgEntry(**entry) for entry in entries) return result @@ -1157,15 +1172,13 @@ class _QuantizeExportConfig(ModeloptBaseConfig): """An empty config.""" -_base_disable_all: list[QuantizerCfgEntry] = [ - cast("QuantizerCfgEntry", load_config("configs/ptq/units/base_disable_all")) -] +_base_disable_all: list[QuantizerCfgEntry] = [load_config("configs/ptq/units/base_disable_all")] _default_disabled_quantizer_cfg: list[QuantizerCfgEntry] = load_config( "configs/ptq/units/default_disabled_quantizers" ) -_mamba_moe_disabled_quantizer_cfg: list[QuantizerCfgEntry] = [ +_mamba_moe_disabled_quantizer_cfg: list[QuantizerCfgEntry | dict[str, Any]] = [ {"quantizer_name": "*fc1_latent_proj*", "enable": False}, # Skip Latent MOE {"quantizer_name": "*fc2_latent_proj*", "enable": False}, # Skip Latent MOE {"quantizer_name": "*q_proj*", "enable": False}, # Skip QKV Linear (HF naming) @@ -1490,7 +1503,7 @@ def _nvfp4_selective_quant_cfg( algorithm: str | dict = "max", ) -> dict: """Build an NVFP4 config that quantizes only the specified layer patterns.""" - quant_cfg: list[QuantizerCfgEntry] = [] + quant_cfg: list[QuantizerCfgEntry | dict[str, Any]] = [] quant_cfg.extend(_base_disable_all) for pattern in layer_patterns: # Deep-copy the quantizer dict so each config constant gets its own instance. diff --git a/modelopt/torch/quantization/conversion.py b/modelopt/torch/quantization/conversion.py index 3f97f8380b..40c2b8dbc7 100644 --- a/modelopt/torch/quantization/conversion.py +++ b/modelopt/torch/quantization/conversion.py @@ -31,8 +31,8 @@ from .config import ( QuantizeConfig, - QuantizeQuantCfgType, QuantizerAttributeConfig, + RawQuantizeQuantCfgType, _QuantizeExportConfig, normalize_quant_cfg_list, ) @@ -215,7 +215,7 @@ def _replace_quant_module(model: nn.Module, version=None, registry=QuantModuleRe _replace_quant_module(getattr(model, name), version=version, registry=registry) -def set_quantizer_by_cfg(quant_model: nn.Module, quant_cfg: QuantizeQuantCfgType): +def set_quantizer_by_cfg(quant_model: nn.Module, quant_cfg: RawQuantizeQuantCfgType): """Apply a quantization config list to the quantizers in ``quant_model``. ``quant_cfg`` is an **ordered list** of :class:`QuantizerCfgEntry <.config.QuantizerCfgEntry>` @@ -477,7 +477,7 @@ def set_quantizer_attributes_partial( @contextmanager -def set_quantizer_by_cfg_context(quant_model: nn.Module, quant_cfg: QuantizeQuantCfgType): +def set_quantizer_by_cfg_context(quant_model: nn.Module, quant_cfg: RawQuantizeQuantCfgType): """Context manager that temporarily applies a quantization config and restores the original state on exit. Calls :func:`set_quantizer_by_cfg` on entry and reverts every diff --git a/tests/unit/recipe/test_loader.py b/tests/unit/recipe/test_loader.py index bfbce21ed5..271fefd896 100644 --- a/tests/unit/recipe/test_loader.py +++ b/tests/unit/recipe/test_loader.py @@ -515,7 +515,15 @@ def test_import_entry_element_schema_appends(tmp_path): f" - $import: disable_all\n" ) recipe = load_recipe(recipe_file) - assert recipe.quantize["quant_cfg"] == [{"quantizer_name": "*", "cfg": None, "enable": False}] + # Entry was loaded against the QuantizerCfgEntry pydantic schema, so it is now a + # model instance — compare via model_dump for the dict-shape check. 
+ assert len(recipe.quantize["quant_cfg"]) == 1 + assert recipe.quantize["quant_cfg"][0].model_dump() == { + "quantizer_name": "*", + "parent_class": None, + "cfg": None, + "enable": False, + } def test_import_entry_wrong_schema_raises(tmp_path): @@ -856,7 +864,8 @@ def test_import_list_splice_outside_typed_list_raises(tmp_path): """A bare $import in an untyped list is rejected.""" _write_quantizer_cfg_list( tmp_path / "extra_tasks.yml", - "- quantizer_name: '*weight_quantizer'\n- quantizer_name: '*input_quantizer'\n", + "- quantizer_name: '*weight_quantizer'\n enable: false\n" + "- quantizer_name: '*input_quantizer'\n enable: false\n", ) config_file = tmp_path / "config.yml" config_file.write_text( @@ -920,9 +929,11 @@ def test_import_mixed_tree(tmp_path): # Dict import inside list entry assert data["quant_cfg"][0]["cfg"] == {"num_bits": (4, 3)} # List splice — entries are normalized by QuantizeConfig.quant_cfg's validator, - # which fills in defaults for missing ``enable`` / ``cfg`` keys. - assert data["quant_cfg"][1] == { + # which fills in defaults for missing ``enable`` / ``cfg`` keys. Entries are now + # QuantizerCfgEntry pydantic instances, so compare via model_dump. + assert data["quant_cfg"][1].model_dump() == { "quantizer_name": "*lm_head*", + "parent_class": None, "enable": False, "cfg": None, } @@ -1157,7 +1168,14 @@ def test_modelopt_schema_comment_validates_after_import_resolution(tmp_path): f" $import: fp8\n" ) data = load_config(config_file) - assert data == [{"quantizer_name": "*weight_quantizer", "cfg": {"num_bits": (4, 3)}}] + # data is a list of QuantizerCfgEntry pydantic instances, not raw dicts. + assert len(data) == 1 + assert data[0].model_dump() == { + "quantizer_name": "*weight_quantizer", + "parent_class": None, + "cfg": {"num_bits": (4, 3)}, + "enable": True, + } # --------------------------------------------------------------------------- @@ -1262,7 +1280,13 @@ def test_load_config_list_valued_yaml(tmp_path): data = load_config(cfg_file) assert isinstance(data, list) assert len(data) == 2 - assert data[0] == {"quantizer_name": "*weight_quantizer", "cfg": {"num_bits": 8}} + # Entries are QuantizerCfgEntry pydantic instances after schema validation. + assert data[0].model_dump() == { + "quantizer_name": "*weight_quantizer", + "parent_class": None, + "cfg": {"num_bits": 8}, + "enable": True, + } # --------------------------------------------------------------------------- @@ -1274,7 +1298,8 @@ def test_import_dict_value_resolves_to_list_raises(tmp_path): """$import in dict value position raises when snippet is a list.""" _write_quantizer_cfg_list( tmp_path / "entries.yml", - "- quantizer_name: '*weight_quantizer'\n- quantizer_name: '*input_quantizer'\n", + "- quantizer_name: '*weight_quantizer'\n enable: false\n" + "- quantizer_name: '*input_quantizer'\n enable: false\n", ) config_file = tmp_path / "config.yml" config_file.write_text( diff --git a/tests/unit/torch/quantization/test_config_validation.py b/tests/unit/torch/quantization/test_config_validation.py index 84306dc511..88ef7faa37 100644 --- a/tests/unit/torch/quantization/test_config_validation.py +++ b/tests/unit/torch/quantization/test_config_validation.py @@ -184,8 +184,13 @@ def test_error_on_empty_cfg_list_enable_true(self): ) def test_error_on_non_dict_non_list_cfg_enable_true(self): - """Entry with cfg of invalid type (e.g. int) and enable=True is rejected.""" - with pytest.raises(ValueError, match="non-empty dict"): + """Entry with cfg of invalid type (e.g. int) and enable=True is rejected. 
+ + Pydantic's field-type check fires before the QuantizerCfgEntry model validator, + so this surfaces as a type error rather than the 'non-empty dict' message — + either is acceptable here as long as the entry is rejected. + """ + with pytest.raises(ValueError): normalize_quant_cfg_list( [{"quantizer_name": "*weight_quantizer", "cfg": 42, "enable": True}] ) @@ -198,8 +203,13 @@ def test_error_on_cfg_list_with_empty_dict_enable_true(self): ) def test_error_on_cfg_list_with_non_dict_element_enable_true(self): - """Entry with cfg=[42] and enable=True is rejected (non-dict element).""" - with pytest.raises(ValueError, match="non-empty dict"): + """Entry with cfg=[42] and enable=True is rejected. + + Pydantic's field-type check fires before the QuantizerCfgEntry model validator, + so the message may report a type error instead of 'non-empty dict' — either is + acceptable, as long as the entry is rejected. + """ + with pytest.raises(ValueError): normalize_quant_cfg_list( [{"quantizer_name": "*weight_quantizer", "cfg": [42], "enable": True}] ) From 058234a001fabc0adffccb7849906e365650376a Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Thu, 14 May 2026 10:54:30 -0700 Subject: [PATCH 03/23] refactor(quant): tighten type hints on QuantizeConfig field validators - normalize_quant_cfg: annotate as ``-> QuantizeQuantCfgType`` and always delegate to normalize_quant_cfg_list; remove the silent passthrough for non-list/non-dict input. - normalize_quant_cfg_list: explicitly reject non-list/non-dict input with a clear ValueError so the field-validator's contract is honored (no more TypeError trickling out of the for-loop). - validate_quant_cfg_entries: annotate as ``(QuantizeQuantCfgType) -> QuantizeQuantCfgType``; switch ``entry.get("cfg")`` to ``entry.cfg`` since by mode="after" each element is guaranteed to be a QuantizerCfgEntry instance. - Refresh stale docstrings that still referred to "QuantizerCfgEntry dicts" from the pre-pydantic TypedDict era. Signed-off-by: Shengliang Xu --- modelopt/torch/quantization/config.py | 31 ++++++++++++++++++++------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py index 6e0a54dd4b..918dabe3d4 100644 --- a/modelopt/torch/quantization/config.py +++ b/modelopt/torch/quantization/config.py @@ -1037,6 +1037,11 @@ def _warn_legacy(): if isinstance(v, dict): _warn_legacy() v = [{k: val} for k, val in v.items()] + elif not isinstance(v, list): + raise ValueError( + f"quant_cfg must be a list of entries (or a legacy flat dict), got " + f"{type(v).__name__}: {v!r}." + ) def _dict_to_entry(key: str, value) -> list[dict[str, Any]]: """Convert a single legacy key-value pair to one or more entry dicts.""" @@ -1127,22 +1132,32 @@ class QuantizeConfig(ModeloptBaseConfig): @field_validator("quant_cfg", mode="before") @classmethod - def normalize_quant_cfg(cls, v): - """Normalize quant_cfg entries: convert dict and tuple forms to QuantizerCfgEntry dicts.""" - if not isinstance(v, (list, dict)): - return v + def normalize_quant_cfg(cls, v: Any) -> QuantizeQuantCfgType: + """Normalize raw quant_cfg input into a ``list[QuantizerCfgEntry]``. + + Delegates to :func:`normalize_quant_cfg_list`, which accepts every supported input + shape (new-format list, legacy single-key-dict list, legacy flat dict, and lists + containing already-validated ``QuantizerCfgEntry`` instances) and rejects anything + else with a clear ``ValueError`` before pydantic's field-type check would see it. 
+ """ return normalize_quant_cfg_list(v) @field_validator("quant_cfg", mode="after") @classmethod - def validate_quant_cfg_entries(cls, v): - """Validate quantizer attribute configs to surface errors (e.g. invalid axis/block_sizes).""" + def validate_quant_cfg_entries(cls, v: QuantizeQuantCfgType) -> QuantizeQuantCfgType: + """Validate each entry's ``cfg`` against :class:`QuantizerAttributeConfig`. + + Runs after the ``mode="before"`` normalizer and pydantic's field-type check, so + every element here is already a :class:`QuantizerCfgEntry`. This second pass + surfaces attribute-level errors (e.g. invalid ``axis`` / ``block_sizes``) that the + per-entry ``QuantizerCfgEntry`` validator doesn't inspect. + """ qac_fields = set(QuantizerAttributeConfig.model_fields.keys()) for entry in v: - cfg = entry.get("cfg") + cfg = entry.cfg if cfg is None: continue - cfgs = cfg if isinstance(cfg, list) else [cfg] + cfgs: list[dict[str, Any]] = cfg if isinstance(cfg, list) else [cfg] for c in cfgs: if isinstance(c, dict) and qac_fields & set(c.keys()): QuantizerAttributeConfig.model_validate(c) From 02513b6760a985a69e8081224491717788d96851 Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Thu, 14 May 2026 11:09:32 -0700 Subject: [PATCH 04/23] refactor(quant): name the legacy flat-dict quant_cfg input shape Introduce ``DeprecatedQuantCfgType = dict[str, Any]`` and use it in ``normalize_quant_cfg_list``'s signature so the legacy flat-dict input form is explicitly labeled deprecated at the type level. ``RawQuantizeQuantCfgType`` continues to describe only the supported list-shaped input. Signed-off-by: Shengliang Xu --- modelopt/torch/quantization/config.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py index 918dabe3d4..9b2dad1b3c 100644 --- a/modelopt/torch/quantization/config.py +++ b/modelopt/torch/quantization/config.py @@ -981,17 +981,20 @@ class GPTQCalibConfig(QuantizeAlgorithmConfig): # legacy / new dict forms) or already-validated QuantizerCfgEntry instances. # ``Sequence`` (rather than ``list``) keeps the alias covariant so callers can pass # ``list[QuantizerCfgEntry]`` without an invariance error. -# ``normalize_quant_cfg_list`` additionally accepts a single legacy flat ``dict`` for the -# whole list, but that path is deprecated and not surfaced in this alias. RawQuantizeQuantCfgType = Sequence[QuantizerCfgEntry | dict[str, Any]] +# Legacy flat-dict input shape (``{"*": ..., "*weight_quantizer": ...}``). Accepted by +# ``normalize_quant_cfg_list`` for backward compatibility but emits a DeprecationWarning; +# new code should use a list of :class:`QuantizerCfgEntry`-shaped entries instead. +DeprecatedQuantCfgType = dict[str, Any] + _QuantizeAlgoCfgType = str | dict | QuantizeAlgorithmConfig | None QuantizeAlgoCfgType = _QuantizeAlgoCfgType | list[_QuantizeAlgoCfgType] | None def normalize_quant_cfg_list( - v: RawQuantizeQuantCfgType | dict[str, Any], + v: RawQuantizeQuantCfgType | DeprecatedQuantCfgType, ) -> list[QuantizerCfgEntry]: """Normalize a raw quant_cfg into a list of :class:`QuantizerCfgEntry` instances. 
From c855d2a4ce87d5c3b3a2fb32adf55a3508f9ca5f Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Thu, 14 May 2026 13:22:02 -0700 Subject: [PATCH 05/23] refactor(quant): widen quant_cfg input types to Mapping/Sequence Relax the pre-normalization type aliases so callers aren't forced to pass concrete ``dict``/``list`` types: - ``RawQuantizeQuantCfgType`` becomes ``Sequence[QuantizerCfgEntry] | Sequence[Mapping[str, Any]]`` (two covariant arms instead of one with a union element). - ``DeprecatedQuantCfgType`` becomes ``Mapping[str, Any]``. - ``normalize_quant_cfg_list`` and its inner isinstance dispatch use ``Mapping``/``Sequence`` throughout (excluding ``str``/``bytes`` from the Sequence arm). Also tighten internal call-site annotations: the ``QuantizeConfig`` mode="before" field validator and ``need_calibration`` now declare the real accepted input union rather than ``Any``/bare ``list``. Signed-off-by: Shengliang Xu --- modelopt/torch/quantization/config.py | 46 +++++++++++++++------------ 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py index 9b2dad1b3c..ddd1ca3765 100644 --- a/modelopt/torch/quantization/config.py +++ b/modelopt/torch/quantization/config.py @@ -152,7 +152,7 @@ import copy import warnings -from collections.abc import Sequence +from collections.abc import Mapping, Sequence from typing import Any, Literal from pydantic import AliasChoices, ValidationInfo, field_validator, model_validator @@ -977,16 +977,18 @@ class GPTQCalibConfig(QuantizeAlgorithmConfig): QuantizeQuantCfgType = list[QuantizerCfgEntry] QuantizerCfgListConfig = QuantizeQuantCfgType -# Pre-normalization input shape: a sequence whose entries can be raw dicts (any of the -# legacy / new dict forms) or already-validated QuantizerCfgEntry instances. -# ``Sequence`` (rather than ``list``) keeps the alias covariant so callers can pass -# ``list[QuantizerCfgEntry]`` without an invariance error. -RawQuantizeQuantCfgType = Sequence[QuantizerCfgEntry | dict[str, Any]] +# Pre-normalization input shape: either a sequence of already-validated +# :class:`QuantizerCfgEntry` instances, or a sequence of raw mappings (any of the legacy / +# new dict forms). Splitting the union into two ``Sequence[...]`` arms — rather than +# ``Sequence[QuantizerCfgEntry | Mapping[str, Any]]`` — keeps each arm covariant in its +# element type, so callers can pass ``list[QuantizerCfgEntry]`` or ``list[dict]`` without +# tripping invariance. +RawQuantizeQuantCfgType = Sequence[QuantizerCfgEntry] | Sequence[Mapping[str, Any]] # Legacy flat-dict input shape (``{"*": ..., "*weight_quantizer": ...}``). Accepted by # ``normalize_quant_cfg_list`` for backward compatibility but emits a DeprecationWarning; # new code should use a list of :class:`QuantizerCfgEntry`-shaped entries instead. -DeprecatedQuantCfgType = dict[str, Any] +DeprecatedQuantCfgType = Mapping[str, Any] _QuantizeAlgoCfgType = str | dict | QuantizeAlgorithmConfig | None @@ -1037,12 +1039,12 @@ def _warn_legacy(): ) # Legacy flat-dict format: {"*": {...}, "*weight_quantizer": {...}} → list of single-key dicts. 
- if isinstance(v, dict): + if isinstance(v, Mapping): _warn_legacy() v = [{k: val} for k, val in v.items()] - elif not isinstance(v, list): + elif not isinstance(v, Sequence) or isinstance(v, (str, bytes)): raise ValueError( - f"quant_cfg must be a list of entries (or a legacy flat dict), got " + f"quant_cfg must be a sequence of entries (or a legacy flat mapping), got " f"{type(v).__name__}: {v!r}." ) @@ -1053,8 +1055,10 @@ def _dict_to_entry(key: str, value) -> list[dict[str, Any]]: key = "*" if isinstance(key, str) and key.startswith("nn."): - if not isinstance(value, dict): - raise ValueError(f"For 'nn.*' scoped format, value must be a dict, got {value!r}") + if not isinstance(value, Mapping): + raise ValueError( + f"For 'nn.*' scoped format, value must be a mapping, got {value!r}" + ) # Support multi-key nn.*-scoped dicts by emitting one entry per sub-key. entries: list[dict[str, Any]] = [] for q_path, sub_cfg in value.items(): @@ -1071,7 +1075,7 @@ def _dict_to_entry(key: str, value) -> list[dict[str, Any]]: entries.append(entry) return entries else: - if isinstance(value, dict): + if isinstance(value, Mapping): cfg = {k: val for k, val in value.items() if k != "enable"} or None enable = value.get("enable") else: @@ -1091,15 +1095,15 @@ def _dict_to_entry(key: str, value) -> list[dict[str, Any]]: if isinstance(raw, QuantizerCfgEntry): result.append(raw) continue - if isinstance(raw, dict) and "quantizer_name" in raw: + if isinstance(raw, Mapping) and "quantizer_name" in raw: entries: list[dict[str, Any]] = [dict(raw)] # copy to avoid mutating caller's data - elif isinstance(raw, dict) and len(raw) == 1: + elif isinstance(raw, Mapping) and len(raw) == 1: key, val = next(iter(raw.items())) entries = [dict(e) for e in _dict_to_entry(key, val)] if not _warned_legacy: _warn_legacy() _warned_legacy = True - elif isinstance(raw, dict) and len(raw) > 1 and any(k.startswith("nn.") for k in raw): + elif isinstance(raw, Mapping) and len(raw) > 1 and any(k.startswith("nn.") for k in raw): # Legacy flat dict with nn.*-scoped keys mixed with other keys — expand all pairs. entries = [] for k, val in raw.items(): @@ -1135,7 +1139,9 @@ class QuantizeConfig(ModeloptBaseConfig): @field_validator("quant_cfg", mode="before") @classmethod - def normalize_quant_cfg(cls, v: Any) -> QuantizeQuantCfgType: + def normalize_quant_cfg( + cls, v: RawQuantizeQuantCfgType | DeprecatedQuantCfgType + ) -> QuantizeQuantCfgType: """Normalize raw quant_cfg input into a ``list[QuantizerCfgEntry]``. 
 
         Delegates to :func:`normalize_quant_cfg_list`, which accepts every supported input
@@ -1788,7 +1794,7 @@ def _nvfp4_selective_quant_cfg(
 }
 
 
-def need_calibration(config):
+def need_calibration(config: QuantizeConfig | Mapping[str, Any]) -> bool:
     """Check if calibration is needed for the given config."""
     if config["algorithm"] is not None and config["algorithm"] != "max":
         return True
@@ -1796,8 +1802,8 @@ def need_calibration(config):
     def _not_dynamic(cfg):
         return cfg.get("enable", True) and cfg.get("type", "") != "dynamic"
 
-    quant_cfg: list = config.get("quant_cfg") or []
-    quant_cfg = normalize_quant_cfg_list(quant_cfg)
+    raw_quant_cfg: RawQuantizeQuantCfgType | DeprecatedQuantCfgType = config.get("quant_cfg") or []
+    quant_cfg: list[QuantizerCfgEntry] = normalize_quant_cfg_list(raw_quant_cfg)
     for entry in quant_cfg:
         name = entry["quantizer_name"]
         raw_cfg = entry.get("cfg")

From 0b6b2f0c37d367e6201b99752bc59a1bd0bcc2ac Mon Sep 17 00:00:00 2001
From: Shengliang Xu
Date: Thu, 14 May 2026 14:15:20 -0700
Subject: [PATCH 06/23] fix(quant): dump YAML-loaded presets with exclude_unset

load_config now returns a validated QuantizeConfig instance for the
preset files, so FP8_DEFAULT_CFG and FP8_KV_CFG are dumped back to
plain dicts with model_dump(exclude_unset=True), keeping only the keys
the YAML explicitly set.

Signed-off-by: Shengliang Xu
---
 modelopt/torch/quantization/config.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py
index ddd1ca3765..9c3595526f 100644
--- a/modelopt/torch/quantization/config.py
+++ b/modelopt/torch/quantization/config.py
@@ -1249,7 +1249,9 @@ class _QuantizeExportConfig(ModeloptBaseConfig):
     "algorithm": "max",
 }
 
-FP8_DEFAULT_CFG: dict[str, Any] = load_config("configs/ptq/presets/model/fp8")
+FP8_DEFAULT_CFG: dict[str, Any] = load_config("configs/ptq/presets/model/fp8").model_dump(
+    exclude_unset=True
+)
 
 MAMBA_MOE_FP8_AGGRESSIVE_CFG = {
     "quant_cfg": [
@@ -1494,7 +1496,9 @@ class _QuantizeExportConfig(ModeloptBaseConfig):
 # KV-cache configs are designed to be merged with a primary quantization config (e.g.
 # FP8_DEFAULT_CFG) that already contains _base_disable_all. They intentionally omit both
 # _base_disable_all and "algorithm" because these are provided by the primary config.
-FP8_KV_CFG: dict[str, Any] = load_config("configs/ptq/presets/kv/fp8")
+FP8_KV_CFG: dict[str, Any] = load_config("configs/ptq/presets/kv/fp8").model_dump(
+    exclude_unset=True
+)
 
 FP8_AFFINE_KV_CFG = {
     "quant_cfg": [

From c7ab5931ec70c0ee2c3ed052f85113471bb3cc7f Mon Sep 17 00:00:00 2001
From: Shengliang Xu
Date: Thu, 14 May 2026 14:59:24 -0700
Subject: [PATCH 07/23] refactor(recipe): make RecipeMetadataConfig a
 ModeloptBaseConfig
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Convert ``RecipeMetadataConfig`` from a ``TypedDict`` into a
``ModeloptBaseConfig`` so every schema accepted by ``load_config`` is a
pydantic model. ``recipe_type`` is required (plain pydantic ``Field``),
``description`` keeps its default via ``ModeloptField``, and the
now-redundant ``validate_metadata`` field validator is dropped —
pydantic's native enum + required-field checks cover the same ground.

``ModelOptRecipeBase`` switches to ``default_factory`` for the
``metadata`` field (``ModeloptField`` only supports literal defaults).
Convenience properties move from ``metadata["..."]`` to attribute
access. ``_load_recipe_from_dir`` loses three defensive isinstance/None
checks that are now unreachable: pydantic validation in ``load_config``
rejects malformed inputs upstream.
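A sketch of the loader-side access pattern this enables (file name
invented for illustration):

    metadata = load_config(metadata_file, schema_type=RecipeMetadataConfig)
    metadata.recipe_type   # validated RecipeType member
    metadata.description   # field default when the YAML omits it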
With every schema now a ``ModeloptBaseConfig`` subclass, tighten ``load_config``'s overloads: - ``type[_SchemaT]`` → ``_SchemaT`` - ``type[list[_SchemaT]]`` → ``list[_SchemaT]`` - ``None`` → ``Any`` ``_SchemaT`` is bound to ``ModeloptBaseConfig``, so mypy now enforces the invariant that ``schema_type`` is a ``ModeloptBaseConfig`` subclass (or ``list`` of one) at every call site. The previous ``schema_type: Any`` catch-all is dropped. Signed-off-by: Shengliang Xu --- modelopt/recipe/config.py | 40 ++++++++++++----------------- modelopt/recipe/loader.py | 22 +++------------- modelopt/torch/opt/config_loader.py | 31 +++++++++++++++++++++- tests/unit/recipe/test_loader.py | 2 +- 4 files changed, 51 insertions(+), 44 deletions(-) diff --git a/modelopt/recipe/config.py b/modelopt/recipe/config.py index 96f33012af..58ac425e0b 100644 --- a/modelopt/recipe/config.py +++ b/modelopt/recipe/config.py @@ -19,8 +19,7 @@ from enum import Enum -from pydantic import field_validator -from typing_extensions import NotRequired, TypedDict +from pydantic import Field from modelopt.torch.opt.config import ModeloptBaseConfig, ModeloptField from modelopt.torch.quantization.config import QuantizeConfig @@ -33,14 +32,21 @@ class RecipeType(str, Enum): # QAT = "qat" # Not implemented yet, will be added in the future. -class RecipeMetadataConfig(TypedDict): - """YAML shape of the recipe metadata section.""" +_DEFAULT_RECIPE_DESCRIPTION = "Model optimization recipe." - recipe_type: RecipeType - description: NotRequired[str] +class RecipeMetadataConfig(ModeloptBaseConfig): + """YAML shape of the recipe metadata section.""" -_DEFAULT_RECIPE_DESCRIPTION = "Model optimization recipe." + recipe_type: RecipeType = Field( + title="Recipe type", + description="The type of the recipe (e.g. PTQ).", + ) + description: str = ModeloptField( + default=_DEFAULT_RECIPE_DESCRIPTION, + title="Description", + description="Human-readable description of the recipe.", + ) class ModelOptRecipeBase(ModeloptBaseConfig): @@ -49,33 +55,21 @@ class ModelOptRecipeBase(ModeloptBaseConfig): If a layer name matches ``"*output_layer*"``, the attributes will be replaced with ``{"enable": False}``. """ - metadata: RecipeMetadataConfig = ModeloptField( - default={"recipe_type": RecipeType.PTQ, "description": _DEFAULT_RECIPE_DESCRIPTION}, + metadata: RecipeMetadataConfig = Field( + default_factory=lambda: RecipeMetadataConfig(recipe_type=RecipeType.PTQ), title="Metadata", description="Recipe metadata containing the recipe type and description.", - validate_default=True, ) - @field_validator("metadata") - @classmethod - def validate_metadata(cls, metadata: RecipeMetadataConfig) -> RecipeMetadataConfig: - """Validate recipe metadata and fill defaults for optional fields.""" - if metadata["recipe_type"] not in RecipeType: - raise ValueError( - f"Unsupported recipe type: {metadata['recipe_type']}. " - f"Only {list(RecipeType)} are currently supported." 
- ) - return {"description": _DEFAULT_RECIPE_DESCRIPTION, **metadata} - @property def recipe_type(self) -> RecipeType: """Return the recipe type from metadata.""" - return self.metadata["recipe_type"] + return self.metadata.recipe_type @property def description(self) -> str: """Return the recipe description from metadata.""" - return self.metadata.get("description", _DEFAULT_RECIPE_DESCRIPTION) + return self.metadata.description class ModelOptPTQRecipe(ModelOptRecipeBase): diff --git a/modelopt/recipe/loader.py b/modelopt/recipe/loader.py index 8608e3fcbb..919c4e0379 100644 --- a/modelopt/recipe/loader.py +++ b/modelopt/recipe/loader.py @@ -123,26 +123,10 @@ def _load_recipe_from_dir(recipe_dir: Path | Traversable) -> ModelOptRecipeBase: quantize. """ metadata_file = _find_recipe_section_file(recipe_dir, "metadata") - metadata = load_config(metadata_file, schema_type=RecipeMetadataConfig) - if not isinstance(metadata, dict): - raise ValueError( - f"Metadata file {metadata_file} must be a YAML mapping, got {type(metadata).__name__}." - ) - recipe_type = metadata.get("recipe_type") - if recipe_type is None: - raise ValueError(f"Metadata file {metadata_file} must contain a 'recipe_type' field.") - if recipe_type == RecipeType.PTQ: + if metadata.recipe_type == RecipeType.PTQ: quantize_file = _find_recipe_section_file(recipe_dir, "quantize") quantize_cfg = load_config(quantize_file, schema_type=QuantizeConfig) - if not isinstance(quantize_cfg, QuantizeConfig): - raise ValueError( - f"{quantize_file} must produce a {QuantizeConfig.__name__}, " - f"got {type(quantize_cfg).__name__}." - ) - return ModelOptPTQRecipe( - metadata=metadata, - quantize=quantize_cfg, - ) - raise ValueError(f"Unsupported recipe type: {recipe_type!r}") + return ModelOptPTQRecipe(metadata=metadata, quantize=quantize_cfg) + raise ValueError(f"Unsupported recipe type: {metadata.recipe_type!r}") diff --git a/modelopt/torch/opt/config_loader.py b/modelopt/torch/opt/config_loader.py index 5dbf0ad5bf..76ed2bb650 100644 --- a/modelopt/torch/opt/config_loader.py +++ b/modelopt/torch/opt/config_loader.py @@ -33,12 +33,14 @@ import re import sys from pathlib import Path -from typing import Any, Union, get_args, get_origin, get_type_hints +from typing import Any, TypeVar, Union, get_args, get_origin, get_type_hints, overload import yaml from pydantic import TypeAdapter from typing_extensions import NotRequired, Required, is_typeddict +from modelopt.torch.opt.config import ModeloptBaseConfig + @dataclass class _ListSnippet: @@ -592,6 +594,33 @@ def _find_import_marker(obj: Any, context: str = "root") -> tuple[Any, str] | No return None +_SchemaT = TypeVar("_SchemaT", bound=ModeloptBaseConfig) + + +@overload +def load_config( + config_path: str | Path | Traversable, + *, + schema_type: type[_SchemaT], +) -> _SchemaT: ... + + +@overload +def load_config( + config_path: str | Path | Traversable, + *, + schema_type: type[list[_SchemaT]], +) -> list[_SchemaT]: ... + + +@overload +def load_config( + config_path: str | Path | Traversable, + *, + schema_type: None = None, +) -> Any: ... 
+ + def load_config( config_path: str | Path | Traversable, *, diff --git a/tests/unit/recipe/test_loader.py b/tests/unit/recipe/test_loader.py index 271fefd896..eeb039f97e 100644 --- a/tests/unit/recipe/test_loader.py +++ b/tests/unit/recipe/test_loader.py @@ -184,7 +184,7 @@ def test_load_recipe_unsupported_type_raises(tmp_path): """load_recipe raises ValueError for an unknown recipe_type.""" bad = tmp_path / "bad.yml" bad.write_text(CFG_RECIPE_UNSUPPORTED_TYPE) - # Schema-driven validation reports the failure via the TypedDict's enum check. + # Schema-driven validation reports the failure via the metadata schema's enum check. with pytest.raises(ValueError, match="recipe_type"): load_recipe(bad) From df4ffb4b0a530c25355345f9c0838f407670410c Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Thu, 14 May 2026 17:20:37 -0700 Subject: [PATCH 08/23] fix(recipe): require metadata and quantize sections MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both fields were silently defaulted by the schema, so a malformed recipe file missing either section would still load successfully — a PTQ recipe without quantize would quietly fall back to QuantizeConfig() (the default INT8 config), masking the user's mistake. Drop the defaults so pydantic rejects recipes with missing sections at validation time. description stays optional via its own field default. Signed-off-by: Shengliang Xu --- modelopt/recipe/config.py | 15 ++++++++------- tests/unit/recipe/test_loader.py | 25 ++++++++++++++++++------- 2 files changed, 26 insertions(+), 14 deletions(-) diff --git a/modelopt/recipe/config.py b/modelopt/recipe/config.py index 58ac425e0b..8a2007f55b 100644 --- a/modelopt/recipe/config.py +++ b/modelopt/recipe/config.py @@ -22,7 +22,7 @@ from pydantic import Field from modelopt.torch.opt.config import ModeloptBaseConfig, ModeloptField -from modelopt.torch.quantization.config import QuantizeConfig +from modelopt.torch.quantization.config import QuantizeConfig # noqa: TC001 class RecipeType(str, Enum): @@ -56,9 +56,10 @@ class ModelOptRecipeBase(ModeloptBaseConfig): """ metadata: RecipeMetadataConfig = Field( - default_factory=lambda: RecipeMetadataConfig(recipe_type=RecipeType.PTQ), title="Metadata", - description="Recipe metadata containing the recipe type and description.", + description="Recipe metadata containing the recipe type and description. " + "Required: a recipe without a ``metadata`` section is rejected so that a " + "missing section can't silently fall back to a default recipe type.", ) @property @@ -75,9 +76,9 @@ def description(self) -> str: class ModelOptPTQRecipe(ModelOptRecipeBase): """Our config class for PTQ recipes.""" - quantize: QuantizeConfig = ModeloptField( - default=QuantizeConfig(), + quantize: QuantizeConfig = Field( title="PTQ config", - description="PTQ config containing quant_cfg and algorithm.", - validate_default=True, + description="PTQ config containing quant_cfg and algorithm. 
Required: a PTQ " + "recipe without a ``quantize`` section is rejected so that a missing section " + "can't silently fall back to the default INT8 config.", ) diff --git a/tests/unit/recipe/test_loader.py b/tests/unit/recipe/test_loader.py index eeb039f97e..759b629f2d 100644 --- a/tests/unit/recipe/test_loader.py +++ b/tests/unit/recipe/test_loader.py @@ -41,6 +41,10 @@ quantize: {} """ +CFG_RECIPE_MISSING_METADATA = """\ +quantize: {} +""" + CFG_RECIPE_MISSING_quantize = """\ metadata: recipe_type: ptq @@ -49,6 +53,7 @@ CFG_RECIPE_UNSUPPORTED_TYPE = """\ metadata: recipe_type: unknown_type +quantize: {} """ QUANTIZER_ATTRIBUTE_SCHEMA = ( @@ -170,14 +175,20 @@ def test_load_recipe_missing_recipe_type_raises(tmp_path): load_recipe(bad) -def test_load_recipe_missing_quantize_uses_default(tmp_path): - """``quantize`` is optional in a PTQ recipe; absence yields an empty default config.""" - from modelopt.torch.quantization.config import QuantizeConfig +def test_load_recipe_missing_quantize_raises(tmp_path): + """A PTQ recipe missing the ``quantize`` section is rejected (no silent default).""" + bad = tmp_path / "bad.yml" + bad.write_text(CFG_RECIPE_MISSING_quantize) + with pytest.raises(ValueError, match="quantize"): + load_recipe(bad) + - good = tmp_path / "good.yml" - good.write_text(CFG_RECIPE_MISSING_quantize) - recipe = load_recipe(good) - assert isinstance(recipe.quantize, QuantizeConfig) +def test_load_recipe_missing_metadata_raises(tmp_path): + """A recipe missing the ``metadata`` section is rejected (no silent default).""" + bad = tmp_path / "bad.yml" + bad.write_text(CFG_RECIPE_MISSING_METADATA) + with pytest.raises(ValueError, match="metadata"): + load_recipe(bad) def test_load_recipe_unsupported_type_raises(tmp_path): From 0d087e0508783a7dc695ae5e44b05051e31b038b Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Thu, 14 May 2026 17:20:51 -0700 Subject: [PATCH 09/23] feat(opt): make ModeloptBaseConfig a real MutableMapping MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Until now ModeloptBaseConfig had the mapping-shaped methods (__getitem__/__setitem__/__iter__/__len__/get/keys/values/items) but did not subclass or register collections.abc.Mapping, so isinstance(cfg, Mapping) returned False. Callers migrating off isinstance(cfg, dict) hit this surprise. Inherit from MutableMapping so the isinstance check works and the ABC mixin methods (pop, popitem, setdefault, clear) come along for free. The schema is fixed, so __delitem__ raises TypeError; pop/popitem/clear inherit that failure on existing keys, while pop(key, default) for a missing key still returns the default. The ABC mixins require __getitem__ to raise KeyError on missing keys, not AttributeError — translate at the __getitem__ boundary and update get() to catch KeyError. One test that previously asserted AttributeError on cfg["missing"] is updated to expect KeyError. 
Signed-off-by: Shengliang Xu --- modelopt/torch/opt/config.py | 39 +++++++++++++++++++++++++---- tests/unit/torch/opt/test_config.py | 2 +- 2 files changed, 35 insertions(+), 6 deletions(-) diff --git a/modelopt/torch/opt/config.py b/modelopt/torch/opt/config.py index 62f7b7e16a..f033596b24 100644 --- a/modelopt/torch/opt/config.py +++ b/modelopt/torch/opt/config.py @@ -17,7 +17,7 @@ import fnmatch import json -from collections.abc import Callable, ItemsView, Iterator, KeysView, ValuesView +from collections.abc import Callable, ItemsView, Iterator, KeysView, MutableMapping, ValuesView from typing import Any, TypeAlias import torch @@ -57,11 +57,18 @@ def ModeloptField(default: Any = PydanticUndefined, **kwargs): # noqa: N802 # TODO: expand config classes to searcher -class ModeloptBaseConfig(BaseModel): +class ModeloptBaseConfig(BaseModel, MutableMapping): """Our config base class for mode configuration. The base class extends the capabilities of pydantic's BaseModel to provide additional methods and properties for easier access and manipulation of the configuration. + + Inherits from :class:`collections.abc.MutableMapping` so instances satisfy + ``isinstance(cfg, Mapping)`` / ``isinstance(cfg, MutableMapping)`` checks and pick up the + mixin methods (``pop``, ``popitem``, ``setdefault``, ``clear``). Schema fields are fixed, + so ``__delitem__`` raises :class:`TypeError`; the inherited ``pop`` / ``clear`` / + ``popitem`` therefore also raise on any existing key, while ``pop(key, default)`` for a + missing key still returns the default normally. """ model_config = PyDanticConfigDict(extra="forbid", validate_assignment=True) @@ -110,18 +117,40 @@ def __contains__(self, key: str) -> bool: return False def __getitem__(self, key: str) -> Any: - """Get the value for the given key (can be name or alias of field).""" - return getattr(self, self.get_field_name_from_key(key)) + """Get the value for the given key (can be name or alias of field). + + Raises :class:`KeyError` for missing keys so the class behaves like a regular + :class:`Mapping` — required for the inherited ``MutableMapping`` mixin methods + (``pop``, ``setdefault``, ...) to dispatch correctly. + """ + try: + return getattr(self, self.get_field_name_from_key(key)) + except AttributeError: + raise KeyError(key) from None def __setitem__(self, key: str, value: Any) -> None: """Set the value for the given key (can be name or alias of field).""" setattr(self, self.get_field_name_from_key(key), value) + def __delitem__(self, key: str) -> None: + """Reject key deletion. + + ``ModeloptBaseConfig`` exposes a fixed pydantic schema, so removing a key is + ill-defined: schema fields can't disappear, and silently resetting them to their + defaults would surprise callers. Raise ``TypeError`` instead. Defined so the + class fully satisfies the ``MutableMapping`` protocol (``__delitem__`` is + required), without committing to actual deletion semantics. + """ + raise TypeError( + f"{type(self).__name__} does not support key deletion; schema fields are " + f"fixed (attempted to delete {key!r})." 
+ ) + def get(self, key: str, default: Any = None) -> Any: """Get the value for the given key (can be name or alias) or default if not found.""" try: return self[key] - except AttributeError: + except KeyError: return default def __len__(self) -> int: diff --git a/tests/unit/torch/opt/test_config.py b/tests/unit/torch/opt/test_config.py index b2ffadb1a7..e0c5993a51 100644 --- a/tests/unit/torch/opt/test_config.py +++ b/tests/unit/torch/opt/test_config.py @@ -72,7 +72,7 @@ def _run_test(is_new_registered): assert config[lin_name] == lin_expected_value assert config[lin_alias] == lin_expected_value assert getattr(config, lin_name) == lin_expected_value - with nullcontext() if is_new_registered else pytest.raises(AttributeError): + with nullcontext() if is_new_registered else pytest.raises(KeyError): config[new_name] # get From 77c8e675c7ccada27a77438592ac8a97670e135c Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Thu, 14 May 2026 17:21:03 -0700 Subject: [PATCH 10/23] fix(quant): normalize empty cfg to None when disabling a quantizer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit QuantizerCfgEntry accepted {cfg: {}, enable: False} and kept cfg as the empty dict. Downstream, any non-None cfg is applied as a full quantizer-attribute replacement, so an empty cfg on a disable entry silently resets the quantizer's attributes back to schema defaults — and if a later rule re-enables the quantizer, it comes back with defaults instead of the config it originally carried. Add a model_validator(mode="before") that rewrites cfg to None when enable=False and cfg is empty (empty dict, empty list, or list of empty dicts), so disable-only entries actually behave like disable-only. A non-empty cfg with enable=False is preserved (deliberate disable+replace). Signed-off-by: Shengliang Xu --- modelopt/torch/quantization/config.py | 27 +++++++++++++ .../quantization/test_config_validation.py | 38 +++++++++++++++++-- 2 files changed, 61 insertions(+), 4 deletions(-) diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py index 9c3595526f..beaf3cf864 100644 --- a/modelopt/torch/quantization/config.py +++ b/modelopt/torch/quantization/config.py @@ -188,6 +188,33 @@ class QuantizerCfgEntry(ModeloptBaseConfig): description="Toggle matched quantizers on/off; independent of ``cfg``.", ) + @model_validator(mode="before") + @classmethod + def _drop_empty_cfg_when_disabled(cls, values): + """Treat ``enable=False`` with an empty ``cfg`` as a pure disable. + + Downstream, any non-``None`` ``cfg`` is applied as a full quantizer-attribute + replacement. An entry like ``{cfg: {}, enable: False}`` would therefore reset + the quantizer's attributes back to schema defaults — and if a later rule + re-enables the quantizer, it would come back with defaults rather than the + config it originally carried. Normalise an empty ``cfg`` (empty dict, empty + list, or a list of empty dicts) to ``None`` so a disable-only entry behaves + like one. 
+ """ + if not isinstance(values, dict): + return values + if values.get("enable") is False: + cfg = values.get("cfg") + cfg_is_empty = (isinstance(cfg, dict) and len(cfg) == 0) or ( + isinstance(cfg, list) + and ( + len(cfg) == 0 or all(isinstance(item, dict) and len(item) == 0 for item in cfg) + ) + ) + if cfg_is_empty: + values = {**values, "cfg": None} + return values + @model_validator(mode="after") def _validate_instruction(self): """Reject entries that carry no instruction beyond the path selector.""" diff --git a/tests/unit/torch/quantization/test_config_validation.py b/tests/unit/torch/quantization/test_config_validation.py index 88ef7faa37..6670ae983f 100644 --- a/tests/unit/torch/quantization/test_config_validation.py +++ b/tests/unit/torch/quantization/test_config_validation.py @@ -214,19 +214,49 @@ def test_error_on_cfg_list_with_non_dict_element_enable_true(self): [{"quantizer_name": "*weight_quantizer", "cfg": [42], "enable": True}] ) - def test_empty_cfg_dict_enable_false_accepted(self): - """Entry with cfg={} and enable=False is allowed (disable-only entry).""" + def test_empty_cfg_dict_enable_false_normalized_to_none(self): + """Entry with cfg={} and enable=False is normalised to cfg=None (disable-only). + + A non-``None`` cfg is applied as a full quantizer-attribute replacement, so an + empty cfg paired with enable=False would silently reset the quantizer's + attributes. Normalisation to ``None`` makes the entry behave like a pure + disable, preserving the existing attribute config. + """ result = normalize_quant_cfg_list( [{"quantizer_name": "*input_quantizer", "cfg": {}, "enable": False}] ) assert result[0]["enable"] is False + assert result[0]["cfg"] is None - def test_empty_cfg_list_enable_false_accepted(self): - """Entry with cfg=[] and enable=False is allowed (disable-only entry).""" + def test_empty_cfg_list_enable_false_normalized_to_none(self): + """Entry with cfg=[] and enable=False is normalised to cfg=None.""" result = normalize_quant_cfg_list( [{"quantizer_name": "*input_quantizer", "cfg": [], "enable": False}] ) assert result[0]["enable"] is False + assert result[0]["cfg"] is None + + def test_cfg_list_of_empty_dicts_enable_false_normalized_to_none(self): + """Entry with cfg=[{}] and enable=False is normalised to cfg=None.""" + result = normalize_quant_cfg_list( + [{"quantizer_name": "*input_quantizer", "cfg": [{}], "enable": False}] + ) + assert result[0]["enable"] is False + assert result[0]["cfg"] is None + + def test_nonempty_cfg_enable_false_preserved(self): + """Entry with a non-empty cfg and enable=False keeps the cfg (disable+replace).""" + result = normalize_quant_cfg_list( + [ + { + "quantizer_name": "*input_quantizer", + "cfg": {"num_bits": 4}, + "enable": False, + } + ] + ) + assert result[0]["enable"] is False + assert result[0]["cfg"] == {"num_bits": 4} def test_new_format_with_list_cfg(self): """cfg can be a list of dicts for SequentialQuantizer.""" From c43c0f0f6dc57a660ab57ed334053754524115bd Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Thu, 14 May 2026 17:55:26 -0700 Subject: [PATCH 11/23] fix(quant): keep shared cfg snippets as dicts in public constants After load_config() started returning schema instances, _base_disable_all and _default_disabled_quantizer_cfg held QuantizerCfgEntry objects, and splatting them into the public dict configs (INT4_AWQ_CFG, NVFP4_DEFAULT_CFG, INT8_DEFAULT_CFG, etc.) leaked schema instances into trees that have always been raw dict/list. 
Callers serialising those constants or doing isinstance(entry, dict) saw the difference. Dump each entry back to a plain dict with exclude_unset=True (matching the existing treatment of FP8_DEFAULT_CFG / FP8_KV_CFG) so the public constants stay raw and the dumped shape is byte-identical to the YAML snippet. Signed-off-by: Shengliang Xu --- modelopt/torch/quantization/config.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py index beaf3cf864..bad3a05530 100644 --- a/modelopt/torch/quantization/config.py +++ b/modelopt/torch/quantization/config.py @@ -1223,13 +1223,24 @@ class _QuantizeExportConfig(ModeloptBaseConfig): """An empty config.""" -_base_disable_all: list[QuantizerCfgEntry] = [load_config("configs/ptq/units/base_disable_all")] +# Shared snippet constants are dumped back to plain dicts before being spliced into +# the public quant config constants below. ``load_config`` returns validated +# ``QuantizerCfgEntry`` instances for schema-tagged files, but the public constants +# (``INT4_AWQ_CFG``, ``NVFP4_DEFAULT_CFG``, etc.) have always been raw dict/list trees; +# splatting schema instances into them would surprise callers that serialise the +# constants or do ``isinstance(entry, dict)`` checks. ``exclude_unset=True`` keeps the +# sparse YAML shape (only the explicitly set fields) so the dumped dicts are +# byte-identical to what authors wrote in the YAML snippets. +_base_disable_all: list[dict[str, Any]] = [ + load_config("configs/ptq/units/base_disable_all").model_dump(exclude_unset=True) +] -_default_disabled_quantizer_cfg: list[QuantizerCfgEntry] = load_config( - "configs/ptq/units/default_disabled_quantizers" -) +_default_disabled_quantizer_cfg: list[dict[str, Any]] = [ + entry.model_dump(exclude_unset=True) + for entry in load_config("configs/ptq/units/default_disabled_quantizers") +] -_mamba_moe_disabled_quantizer_cfg: list[QuantizerCfgEntry | dict[str, Any]] = [ +_mamba_moe_disabled_quantizer_cfg: list[dict[str, Any]] = [ {"quantizer_name": "*fc1_latent_proj*", "enable": False}, # Skip Latent MOE {"quantizer_name": "*fc2_latent_proj*", "enable": False}, # Skip Latent MOE {"quantizer_name": "*q_proj*", "enable": False}, # Skip QKV Linear (HF naming) @@ -1558,7 +1569,7 @@ def _nvfp4_selective_quant_cfg( algorithm: str | dict = "max", ) -> dict: """Build an NVFP4 config that quantizes only the specified layer patterns.""" - quant_cfg: list[QuantizerCfgEntry | dict[str, Any]] = [] + quant_cfg: list[dict[str, Any]] = [] quant_cfg.extend(_base_disable_all) for pattern in layer_patterns: # Deep-copy the quantizer dict so each config constant gets its own instance. From a63d42016a6c7ce879c0a5f5a27836337cc3e725 Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Thu, 14 May 2026 17:55:31 -0700 Subject: [PATCH 12/23] fix(opt): __setitem__ raises KeyError for unknown keys When ModeloptBaseConfig started inheriting from MutableMapping, __getitem__ was updated to raise KeyError for missing keys, but __setitem__ still propagated AttributeError from get_field_name_from_key. Direct writes like cfg["unknown"] = value, and inherited mixin helpers like setdefault that route through __setitem__, leaked AttributeError instead of mapping-style KeyError. Translate at the boundary so both read and write halves of the protocol agree. 
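
Illustrative sketch of the now-consistent behavior (key names are
hypothetical; cfg is any ModeloptBaseConfig instance):

    cfg["unknown"] = 1            # raises KeyError (previously AttributeError)
    cfg.setdefault("unknown", 1)  # __getitem__ misses, then the mixin's write
                                  # through __setitem__ raises KeyError too
    cfg.setdefault("num_bits", 8) # existing field: returns the current value,
                                  # no write is attempted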
Signed-off-by: Shengliang Xu --- modelopt/torch/opt/config.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/modelopt/torch/opt/config.py b/modelopt/torch/opt/config.py index f033596b24..fce2eb36f6 100644 --- a/modelopt/torch/opt/config.py +++ b/modelopt/torch/opt/config.py @@ -129,8 +129,17 @@ def __getitem__(self, key: str) -> Any: raise KeyError(key) from None def __setitem__(self, key: str, value: Any) -> None: - """Set the value for the given key (can be name or alias of field).""" - setattr(self, self.get_field_name_from_key(key), value) + """Set the value for the given key (can be name or alias of field). + + Raises :class:`KeyError` (not :class:`AttributeError`) for unknown keys so the + class matches the :class:`MutableMapping` protocol — both for direct + ``cfg["unknown"] = value`` writes and for inherited mixin helpers like + ``setdefault`` that write through ``__setitem__``. + """ + try: + setattr(self, self.get_field_name_from_key(key), value) + except AttributeError: + raise KeyError(key) from None def __delitem__(self, key: str) -> None: """Reject key deletion. From d7b6e0a217661a6410285fa4d38035a86a84c90f Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Fri, 15 May 2026 08:04:04 -0700 Subject: [PATCH 13/23] test(quant): tighten cfg-shape rejection assertions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two tests previously asserted only that ValueError was raised when cfg had an invalid python type (cfg=42, cfg=[42]) — pydantic's field-type check fires before QuantizerCfgEntry's model validator, so the older match="non-empty dict" pattern stopped matching and was dropped. A bare pytest.raises(ValueError) accepts any ValueError anywhere in the call path, which is weaker than the intent. Restore a regex that requires the error message to implicate the cfg field and identify a type/shape problem. Accepts either path: - pydantic: "cfg.dict[...]\n Input should be a valid dictionary" - validator: "'cfg' must be a non-empty dict ..." Signed-off-by: Shengliang Xu --- .../quantization/test_config_validation.py | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/tests/unit/torch/quantization/test_config_validation.py b/tests/unit/torch/quantization/test_config_validation.py index 6670ae983f..00f7cb3605 100644 --- a/tests/unit/torch/quantization/test_config_validation.py +++ b/tests/unit/torch/quantization/test_config_validation.py @@ -186,11 +186,14 @@ def test_error_on_empty_cfg_list_enable_true(self): def test_error_on_non_dict_non_list_cfg_enable_true(self): """Entry with cfg of invalid type (e.g. int) and enable=True is rejected. - Pydantic's field-type check fires before the QuantizerCfgEntry model validator, - so this surfaces as a type error rather than the 'non-empty dict' message — - either is acceptable here as long as the entry is rejected. + Two error paths are acceptable here, and the assertion accepts either: + pydantic's field-type check (``cfg`` must be a dict or list) fires first when + ``cfg`` is the wrong python type, while ``QuantizerCfgEntry``'s model validator + emits the "non-empty dict" message when ``cfg`` is the right type but empty. + Either way the message must implicate the ``cfg`` field, not just any + ``ValueError``. 
""" - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=r"(?s)cfg.*(non-empty|valid dictionary|valid list)"): normalize_quant_cfg_list( [{"quantizer_name": "*weight_quantizer", "cfg": 42, "enable": True}] ) @@ -205,11 +208,12 @@ def test_error_on_cfg_list_with_empty_dict_enable_true(self): def test_error_on_cfg_list_with_non_dict_element_enable_true(self): """Entry with cfg=[42] and enable=True is rejected. - Pydantic's field-type check fires before the QuantizerCfgEntry model validator, - so the message may report a type error instead of 'non-empty dict' — either is - acceptable, as long as the entry is rejected. + Same dual-path acceptance as :meth:`test_error_on_non_dict_non_list_cfg_enable_true`: + pydantic may report a list-element type error, or the model validator may report + "non-empty dict"; the assertion accepts either as long as the message names the + ``cfg`` field. """ - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=r"(?s)cfg.*(non-empty|valid dictionary|valid list)"): normalize_quant_cfg_list( [{"quantizer_name": "*weight_quantizer", "cfg": [42], "enable": True}] ) From 251ea06728312f0cfe9246e87328ce889fd332cc Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Fri, 15 May 2026 09:58:21 -0700 Subject: [PATCH 14/23] refactor(quant): schematize QuantizerCfgEntry.cfg as QuantizerAttributeConfig MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit QuantizerCfgEntry.cfg was typed as dict[str, Any] | list[dict[str, Any]] | None even though every value is supposed to be QuantizerAttributeConfig- shaped. Schematize the field so the schema layer matches the intent: cfg: QuantizerAttributeConfig | list[QuantizerAttributeConfig] | None Pydantic auto-coerces dicts, so YAML loading, public dict-form CFG constants, and dict-input callers keep working untouched. Knock-on cleanups in the same file: - _drop_empty_cfg_when_disabled is folded into a single mode="before" validator (_normalize_cfg_shape) that handles both empty-cfg rules: disabled+empty → cfg=None (normalize), enabled+empty → ValueError with a clearer message. Both rules must run on the raw input before pydantic coerces {} into a default-filled QAC, otherwise "user gave no attributes (typo)" and "user wants schema defaults" become indistinguishable. - The duplicate "non-empty dict" check in _validate_instruction is removed; the responsibility lives entirely in _normalize_cfg_shape now. - QuantizeConfig.validate_quant_cfg_entries is deleted. It used to do a second-pass QuantizerAttributeConfig.model_validate(c); now that cfg *is* QuantizerAttributeConfig, pydantic's field-type validation catches the same attribute-level errors on the first pass. Tests: - Direct dict comparisons of entry["cfg"] become entry["cfg"].model_dump(exclude_unset=True) == {...} (cfg is a model now, not a raw dict). Nested cases like result[0]["cfg"][0]["num_bits"] keep working via the MutableMapping interface inherited from ModeloptBaseConfig. - Error-message regex updated from "non-empty dict" to "at least one quantizer attribute" to match the rewritten message. - One test using load_config without a schema (raw dict tree) is left comparing against a plain dict, with a comment explaining why. 
Signed-off-by: Shengliang Xu --- modelopt/torch/quantization/config.py | 96 +++++++------------ tests/unit/recipe/test_loader.py | 55 +++++++---- .../quantization/test_config_validation.py | 30 +++--- 3 files changed, 87 insertions(+), 94 deletions(-) diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py index bad3a05530..acee22fa99 100644 --- a/modelopt/torch/quantization/config.py +++ b/modelopt/torch/quantization/config.py @@ -176,11 +176,12 @@ class QuantizerCfgEntry(ModeloptBaseConfig): description="If provided, only quantizers whose parent module matches this PyTorch class " "name (e.g. ``'nn.Linear'``) are affected.", ) - cfg: dict[str, Any] | list[dict[str, Any]] | None = ModeloptField( + cfg: "QuantizerAttributeConfig | list[QuantizerAttributeConfig] | None" = ModeloptField( default=None, title="Quantizer attribute config.", - description="A ``QuantizerAttributeConfig``-shaped dict, or a list of such dicts for " - "sequential quantizers. ``None`` leaves the existing attribute config untouched.", + description="A :class:`QuantizerAttributeConfig` (or a mapping that validates as one), " + "or a list of such for sequential quantizers. ``None`` leaves the existing attribute " + "config untouched.", ) enable: bool = ModeloptField( default=True, @@ -190,29 +191,42 @@ class QuantizerCfgEntry(ModeloptBaseConfig): @model_validator(mode="before") @classmethod - def _drop_empty_cfg_when_disabled(cls, values): - """Treat ``enable=False`` with an empty ``cfg`` as a pure disable. - - Downstream, any non-``None`` ``cfg`` is applied as a full quantizer-attribute - replacement. An entry like ``{cfg: {}, enable: False}`` would therefore reset - the quantizer's attributes back to schema defaults — and if a later rule - re-enables the quantizer, it would come back with defaults rather than the - config it originally carried. Normalise an empty ``cfg`` (empty dict, empty - list, or a list of empty dicts) to ``None`` so a disable-only entry behaves - like one. + def _normalize_cfg_shape(cls, values): + """Pre-validation shape rules for ``cfg``. + + Runs against the raw input mapping, before pydantic coerces ``cfg`` into a + :class:`QuantizerAttributeConfig` (which would fill in schema defaults and erase the + distinction between "user typed nothing" and "user typed `{}`"). Two rules: + + 1. ``enable=False`` with an empty ``cfg`` — empty dict, empty list, or list of empty + dicts — is normalized to ``cfg=None``. Downstream applies any non-``None`` ``cfg`` + as a full quantizer-attribute replacement, so without this an entry like + ``{cfg: {}, enable: False}`` would reset attributes to schema defaults and a later + re-enable would bring the quantizer back with defaults instead of its original config. + + 2. ``enable=True`` (explicit or implicit) with an empty ``cfg`` — same shapes — is + rejected. Pydantic would otherwise coerce ``{}`` into ``QuantizerAttributeConfig()`` + with all defaults, silently turning a likely typo (``cfg: {}``) into "quantize with + schema defaults." Callers who really want defaults should drop ``cfg`` entirely and + rely on ``enable=True``; an empty ``cfg`` always indicates missing input. 
""" if not isinstance(values, dict): return values - if values.get("enable") is False: - cfg = values.get("cfg") - cfg_is_empty = (isinstance(cfg, dict) and len(cfg) == 0) or ( - isinstance(cfg, list) - and ( - len(cfg) == 0 or all(isinstance(item, dict) and len(item) == 0 for item in cfg) - ) - ) - if cfg_is_empty: + cfg = values.get("cfg") + cfg_is_empty = (isinstance(cfg, dict) and len(cfg) == 0) or ( + isinstance(cfg, list) + and (len(cfg) == 0 or all(isinstance(item, dict) and len(item) == 0 for item in cfg)) + ) + if cfg_is_empty: + if values.get("enable") is False: values = {**values, "cfg": None} + else: + raise ValueError( + f"QuantizerCfgEntry 'cfg' must specify at least one quantizer attribute; " + f"got an empty mapping/list for quantizer " + f"{values.get('quantizer_name')!r}. To keep existing attributes, drop " + f"'cfg' and rely on 'enable=True'; to disable, set 'enable=False'." + ) return values @model_validator(mode="after") @@ -225,23 +239,6 @@ def _validate_instruction(self): f"'quantizer_name'={self.quantizer_name!r} has no effect (implicit enable=True " "is not allowed; set it explicitly)." ) - - if self.enable and self.cfg is not None: - if isinstance(self.cfg, dict): - is_invalid = len(self.cfg) == 0 - elif isinstance(self.cfg, list): - is_invalid = len(self.cfg) == 0 or any( - not isinstance(item, dict) or len(item) == 0 for item in self.cfg - ) - else: - is_invalid = True - if is_invalid: - raise ValueError( - f"QuantizerCfgEntry 'cfg' must be a non-empty dict or a non-empty list of " - f"non-empty dicts when enabling quantizer {self.quantizer_name!r}, got " - f"{type(self.cfg).__name__}: {self.cfg!r}. Either provide quantizer " - "attributes in 'cfg' or remove 'cfg' and set 'enable' explicitly." - ) return self @@ -1178,27 +1175,6 @@ def normalize_quant_cfg( """ return normalize_quant_cfg_list(v) - @field_validator("quant_cfg", mode="after") - @classmethod - def validate_quant_cfg_entries(cls, v: QuantizeQuantCfgType) -> QuantizeQuantCfgType: - """Validate each entry's ``cfg`` against :class:`QuantizerAttributeConfig`. - - Runs after the ``mode="before"`` normalizer and pydantic's field-type check, so - every element here is already a :class:`QuantizerCfgEntry`. This second pass - surfaces attribute-level errors (e.g. invalid ``axis`` / ``block_sizes``) that the - per-entry ``QuantizerCfgEntry`` validator doesn't inspect. 
- """ - qac_fields = set(QuantizerAttributeConfig.model_fields.keys()) - for entry in v: - cfg = entry.cfg - if cfg is None: - continue - cfgs: list[dict[str, Any]] = cfg if isinstance(cfg, list) else [cfg] - for c in cfgs: - if isinstance(c, dict) and qac_fields & set(c.keys()): - QuantizerAttributeConfig.model_validate(c) - return v - class CompressConfig(ModeloptBaseConfig): """Default configuration for ``compress`` mode.""" diff --git a/tests/unit/recipe/test_loader.py b/tests/unit/recipe/test_loader.py index 759b629f2d..e8d0d33b6c 100644 --- a/tests/unit/recipe/test_loader.py +++ b/tests/unit/recipe/test_loader.py @@ -317,7 +317,7 @@ def test_import_resolves_cfg_reference(tmp_path): ) recipe = load_recipe(recipe_file) entry = recipe.quantize["quant_cfg"][0] - assert entry["cfg"] == {"num_bits": (4, 3), "axis": None} + assert entry["cfg"].model_dump(exclude_unset=True) == {"num_bits": (4, 3), "axis": None} def test_import_same_name_used_twice(tmp_path): @@ -390,7 +390,10 @@ def test_import_inline_cfg_not_affected(tmp_path): f" axis: 0\n" ) recipe = load_recipe(recipe_file) - assert recipe.quantize["quant_cfg"][1]["cfg"] == {"num_bits": 8, "axis": 0} + assert recipe.quantize["quant_cfg"][1]["cfg"].model_dump(exclude_unset=True) == { + "num_bits": 8, + "axis": 0, + } def test_import_unknown_reference_raises(tmp_path): @@ -619,7 +622,7 @@ def test_import_cfg_extend(tmp_path): ) recipe = load_recipe(recipe_file) cfg = recipe.quantize["quant_cfg"][0]["cfg"] - assert cfg == {"num_bits": (4, 3), "axis": 0} + assert cfg.model_dump(exclude_unset=True) == {"num_bits": (4, 3), "axis": 0} def test_import_cfg_inline_overrides_import(tmp_path): @@ -682,6 +685,7 @@ def test_import_in_multiple_dict_values(tmp_path): ) data = load_config(config_file) entry = data["quant_cfg"][0] + # load_config has no schema here — data is a raw dict tree, so entry["cfg"] is a dict. 
assert entry["cfg"] == {"num_bits": (4, 3)} assert entry["my_field"] == {"fake_quant": False} @@ -706,7 +710,7 @@ def test_import_cfg_multi_import(tmp_path): ) recipe = load_recipe(recipe_file) cfg = recipe.quantize["quant_cfg"][0]["cfg"] - assert cfg == {"num_bits": (4, 3), "axis": 0} + assert cfg.model_dump(exclude_unset=True) == {"num_bits": (4, 3), "axis": 0} def test_import_cfg_multi_import_later_overrides_earlier(tmp_path): @@ -755,7 +759,11 @@ def test_import_cfg_multi_import_with_extend(tmp_path): ) recipe = load_recipe(recipe_file) cfg = recipe.quantize["quant_cfg"][0]["cfg"] - assert cfg == {"num_bits": (4, 3), "fake_quant": False, "axis": 0} + assert cfg.model_dump(exclude_unset=True) == { + "num_bits": (4, 3), + "fake_quant": False, + "axis": 0, + } def test_import_dir_format(tmp_path): @@ -772,7 +780,10 @@ def test_import_dir_format(tmp_path): " $import: fp8\n" ) recipe = load_recipe(tmp_path) - assert recipe.quantize["quant_cfg"][0]["cfg"] == {"num_bits": (4, 3), "axis": None} + assert recipe.quantize["quant_cfg"][0]["cfg"].model_dump(exclude_unset=True) == { + "num_bits": (4, 3), + "axis": None, + } def test_import_dir_format_metadata_imports_do_not_apply_to_quantize(tmp_path): @@ -826,7 +837,9 @@ def test_import_multi_document_list_snippet(tmp_path): recipe = load_recipe(recipe_file) assert len(recipe.quantize["quant_cfg"]) == 1 assert recipe.quantize["quant_cfg"][0]["quantizer_name"] == "*[kv]_bmm_quantizer" - assert recipe.quantize["quant_cfg"][0]["cfg"] == {"num_bits": (4, 3)} + assert recipe.quantize["quant_cfg"][0]["cfg"].model_dump(exclude_unset=True) == { + "num_bits": (4, 3) + } def test_import_builtin_kv_fp8_snippet(): @@ -938,7 +951,7 @@ def test_import_mixed_tree(tmp_path): ) data = load_config(config_file) # Dict import inside list entry - assert data["quant_cfg"][0]["cfg"] == {"num_bits": (4, 3)} + assert data["quant_cfg"][0]["cfg"].model_dump(exclude_unset=True) == {"num_bits": (4, 3)} # List splice — entries are normalized by QuantizeConfig.quant_cfg's validator, # which fills in defaults for missing ``enable`` / ``cfg`` keys. Entries are now # QuantizerCfgEntry pydantic instances, so compare via model_dump. @@ -986,7 +999,7 @@ def test_import_recursive(tmp_path): ) recipe = load_recipe(recipe_file) cfg = recipe.quantize["quant_cfg"][0]["cfg"] - assert cfg == {"num_bits": (4, 3)} + assert cfg.model_dump(exclude_unset=True) == {"num_bits": (4, 3)} def test_import_circular_raises(tmp_path): @@ -1086,9 +1099,14 @@ def test_import_cross_file_same_name_no_conflict(tmp_path): ) recipe = load_recipe(recipe_file) # Parent's "fmt" resolves to fp8 (e4m3), not child's nvfp4. - assert recipe.quantize["quant_cfg"][0]["cfg"] == {"num_bits": (4, 3)} + assert recipe.quantize["quant_cfg"][0]["cfg"].model_dump(exclude_unset=True) == { + "num_bits": (4, 3) + } # Child's "fmt" resolves to nvfp4 (e2m1), not parent's fp8. - assert recipe.quantize["quant_cfg"][1]["cfg"] == {"num_bits": (2, 1), "axis": 0} + assert recipe.quantize["quant_cfg"][1]["cfg"].model_dump(exclude_unset=True) == { + "num_bits": (2, 1), + "axis": 0, + } # --------------------------------------------------------------------------- @@ -1179,13 +1197,12 @@ def test_modelopt_schema_comment_validates_after_import_resolution(tmp_path): f" $import: fp8\n" ) data = load_config(config_file) - # data is a list of QuantizerCfgEntry pydantic instances, not raw dicts. + # data is a list of QuantizerCfgEntry pydantic instances, not raw dicts. 
Dump with + # exclude_unset=True so the inner QuantizerAttributeConfig stays sparse (cascades). assert len(data) == 1 - assert data[0].model_dump() == { + assert data[0].model_dump(exclude_unset=True) == { "quantizer_name": "*weight_quantizer", - "parent_class": None, "cfg": {"num_bits": (4, 3)}, - "enable": True, } @@ -1291,12 +1308,12 @@ def test_load_config_list_valued_yaml(tmp_path): data = load_config(cfg_file) assert isinstance(data, list) assert len(data) == 2 - # Entries are QuantizerCfgEntry pydantic instances after schema validation. - assert data[0].model_dump() == { + # Entries are QuantizerCfgEntry pydantic instances after schema validation; dump + # with exclude_unset=True so the inner QuantizerAttributeConfig stays in sparse + # form (pydantic cascades exclude_unset to nested models). + assert data[0].model_dump(exclude_unset=True) == { "quantizer_name": "*weight_quantizer", - "parent_class": None, "cfg": {"num_bits": 8}, - "enable": True, } diff --git a/tests/unit/torch/quantization/test_config_validation.py b/tests/unit/torch/quantization/test_config_validation.py index 00f7cb3605..ce98f989f5 100644 --- a/tests/unit/torch/quantization/test_config_validation.py +++ b/tests/unit/torch/quantization/test_config_validation.py @@ -81,7 +81,7 @@ def test_new_format_passthrough(self): result = normalize_quant_cfg_list(raw) assert len(result) == 1 assert result[0]["quantizer_name"] == "*weight_quantizer" - assert result[0]["cfg"] == {"num_bits": 8, "axis": 0} + assert result[0]["cfg"].model_dump(exclude_unset=True) == {"num_bits": 8, "axis": 0} assert result[0]["enable"] is True # defaulted def test_new_format_enable_false(self): @@ -103,7 +103,7 @@ def test_legacy_single_key_dict(self): raw = [{"*weight_quantizer": {"num_bits": 8, "axis": 0}}] result = normalize_quant_cfg_list(raw) assert result[0]["quantizer_name"] == "*weight_quantizer" - assert result[0]["cfg"] == {"num_bits": 8, "axis": 0} + assert result[0]["cfg"].model_dump(exclude_unset=True) == {"num_bits": 8, "axis": 0} assert result[0]["enable"] is True # defaulted def test_legacy_single_key_dict_with_enable(self): @@ -166,19 +166,19 @@ def test_error_on_multi_key_legacy_dict(self): def test_error_on_empty_cfg_dict_implicit_enable(self): """Entry with cfg={} and implicit enable=True is rejected.""" - with pytest.raises(ValueError, match="non-empty dict"): + with pytest.raises(ValueError, match=r"at least one quantizer attribute"): normalize_quant_cfg_list([{"quantizer_name": "*weight_quantizer", "cfg": {}}]) def test_error_on_empty_cfg_dict_explicit_enable_true(self): """Entry with cfg={} and explicit enable=True is rejected.""" - with pytest.raises(ValueError, match="non-empty dict"): + with pytest.raises(ValueError, match=r"at least one quantizer attribute"): normalize_quant_cfg_list( [{"quantizer_name": "*weight_quantizer", "cfg": {}, "enable": True}] ) def test_error_on_empty_cfg_list_enable_true(self): """Entry with cfg=[] and enable=True is rejected.""" - with pytest.raises(ValueError, match="non-empty dict"): + with pytest.raises(ValueError, match=r"at least one quantizer attribute"): normalize_quant_cfg_list( [{"quantizer_name": "*weight_quantizer", "cfg": [], "enable": True}] ) @@ -200,7 +200,7 @@ def test_error_on_non_dict_non_list_cfg_enable_true(self): def test_error_on_cfg_list_with_empty_dict_enable_true(self): """Entry with cfg=[{}] and enable=True is rejected (empty dict element).""" - with pytest.raises(ValueError, match="non-empty dict"): + with pytest.raises(ValueError, match=r"at least one quantizer 
attribute"): normalize_quant_cfg_list( [{"quantizer_name": "*weight_quantizer", "cfg": [{}], "enable": True}] ) @@ -260,7 +260,7 @@ def test_nonempty_cfg_enable_false_preserved(self): ] ) assert result[0]["enable"] is False - assert result[0]["cfg"] == {"num_bits": 4} + assert result[0]["cfg"].model_dump(exclude_unset=True) == {"num_bits": 4} def test_new_format_with_list_cfg(self): """cfg can be a list of dicts for SequentialQuantizer.""" @@ -275,7 +275,7 @@ def test_new_format_with_list_cfg(self): ] result = normalize_quant_cfg_list(raw) assert len(result) == 1 - assert result[0]["cfg"] == raw[0]["cfg"] + assert [c.model_dump(exclude_unset=True) for c in result[0]["cfg"]] == raw[0]["cfg"] assert result[0]["enable"] is True def test_legacy_flat_dict_conversion(self): @@ -287,7 +287,7 @@ def test_legacy_flat_dict_conversion(self): assert result[0]["enable"] is False assert result[0]["cfg"] is None assert result[1]["quantizer_name"] == "*weight_quantizer" - assert result[1]["cfg"] == {"num_bits": 8, "axis": 0} + assert result[1]["cfg"].model_dump(exclude_unset=True) == {"num_bits": 8, "axis": 0} assert result[1]["enable"] is True def test_legacy_enable_only_produces_cfg_none(self): @@ -318,7 +318,7 @@ def test_legacy_default_key_with_cfg(self): raw = [{"default": {"num_bits": 8, "axis": None}}] result = normalize_quant_cfg_list(raw) assert result[0]["quantizer_name"] == "*" - assert result[0]["cfg"] == {"num_bits": 8, "axis": None} + assert result[0]["cfg"].model_dump(exclude_unset=True) == {"num_bits": 8, "axis": None} assert result[0]["enable"] is True def test_legacy_flat_dict_with_default_key(self): @@ -353,7 +353,7 @@ def test_legacy_nn_class_with_cfg(self): assert len(result) == 1 assert result[0]["parent_class"] == "nn.Linear" assert result[0]["quantizer_name"] == "*weight_quantizer" - assert result[0]["cfg"] == {"num_bits": 4, "axis": 0} + assert result[0]["cfg"].model_dump(exclude_unset=True) == {"num_bits": 4, "axis": 0} assert result[0]["enable"] is True def test_legacy_list_valued_cfg(self): @@ -387,7 +387,7 @@ def test_finds_last_match(self): ] ) result = find_quant_cfg_entry_by_path(entries, "*weight_quantizer") - assert result["cfg"] == {"num_bits": 4} + assert result["cfg"].model_dump(exclude_unset=True) == {"num_bits": 4} def test_exact_match_only(self): """Does not do fnmatch — only exact string equality on quantizer_name.""" @@ -444,7 +444,7 @@ def test_wildcard_matches_bare_name(self): [{"quantizer_name": "*weight_quantizer", "cfg": {"num_bits": 8}}] ) matched, enable = _match_quantizer_cfg(quant_cfg, "weight_quantizer") - assert matched == {"num_bits": 8} + assert matched.model_dump(exclude_unset=True) == {"num_bits": 8} assert enable is True def test_star_matches_any_bare_name(self): @@ -464,7 +464,7 @@ def test_path_scoped_pattern_matches_matching_suffix(self): [{"quantizer_name": "*mlp*weight_quantizer", "cfg": {"num_bits": 4}}] ) matched, enable = _match_quantizer_cfg(quant_cfg, "weight_quantizer") - assert matched == {"num_bits": 4} + assert matched.model_dump(exclude_unset=True) == {"num_bits": 4} def test_path_scoped_pattern_does_not_match_different_suffix(self): """'*mlp*weight_quantizer' does NOT match bare 'input_quantizer'.""" @@ -488,7 +488,7 @@ def test_last_match_wins(self): ] ) matched, _ = _match_quantizer_cfg(quant_cfg, "weight_quantizer") - assert matched == {"num_bits": 4} + assert matched.model_dump(exclude_unset=True) == {"num_bits": 4} def test_no_match_returns_none(self): """No matching entry returns (None, None).""" From 
a117cdc71ea77804b8e55bea7000c954c4c1dbe8 Mon Sep 17 00:00:00 2001
From: Shengliang Xu
Date: Tue, 5 May 2026 17:47:53 -0700
Subject: [PATCH 15/23] refactor(quant): move all hard-coded PTQ configs to
 YAML

Every hard-coded PTQ preset dict in modelopt/torch/quantization/config.py
moves to a schema-tagged YAML file under modelopt_recipes/configs (numerics
snippets plus ptq/presets/{kv,model} and ptq/units), and the public
constants (INT8_DEFAULT_CFG, NVFP4_DEFAULT_CFG, FP8_KV_CFG, ...) are
reloaded through load_config as validated QuantizeConfig instances via
small helpers (_load_quantize_config, _load_quantizer_attribute_dict,
_load_quantizer_cfg_dict_list). _list_element_schema additionally learns to
resolve the element schema of list[T] members inside union annotations so
these snippets validate against the right schema.

Signed-off-by: Shengliang Xu
---
 modelopt/torch/opt/config_loader.py | 16 +-
 modelopt/torch/quantization/config.py | 698 ++++--------------
 modelopt_recipes/configs/numerics/int4.yaml | 22 +
 modelopt_recipes/configs/numerics/int8.yaml | 20 +
 modelopt_recipes/configs/numerics/mxfp4.yaml | 23 +
 modelopt_recipes/configs/numerics/mxfp6.yaml | 23 +
 modelopt_recipes/configs/numerics/mxfp8.yaml | 23 +
 modelopt_recipes/configs/numerics/mxint8.yaml | 23 +
 .../configs/numerics/nvfp4_bs32.yaml | 23 +
 .../configs/ptq/presets/README.md | 2 +-
 .../configs/ptq/presets/kv/fp8_affine.yaml | 26 +
 .../configs/ptq/presets/kv/nvfp4.yaml | 25 +
 .../configs/ptq/presets/kv/nvfp4_affine.yaml | 29 +
 .../configs/ptq/presets/kv/nvfp4_rotate.yaml | 34 +
 .../model/fp8_2d_blockwise_weight_only.yaml | 34 +
 .../model/fp8_per_channel_per_token.yaml | 36 +
 .../configs/ptq/presets/model/int4_awq.yaml | 34 +
 .../model/int4_blockwise_weight_only.yaml | 33 +
 .../configs/ptq/presets/model/int8.yaml | 34 +
 .../ptq/presets/model/int8_smoothquant.yaml | 34 +
 .../ptq/presets/model/int8_weight_only.yaml | 32 +
 .../model/mamba_moe_fp8_aggressive.yaml | 35 +
 .../model/mamba_moe_fp8_conservative.yaml | 39 +
 .../model/mamba_moe_nvfp4_aggressive.yaml | 35 +
 .../model/mamba_moe_nvfp4_conservative.yaml | 39 +
 .../configs/ptq/presets/model/mxfp4.yaml | 33 +
 .../presets/model/mxfp4_mlp_weight_only.yaml | 33 +
 .../configs/ptq/presets/model/mxfp6.yaml | 33 +
 .../configs/ptq/presets/model/mxfp8.yaml | 33 +
 .../configs/ptq/presets/model/mxint8.yaml | 33 +
 .../configs/ptq/presets/model/nvfp4.yaml | 33 +
 .../ptq/presets/model/nvfp4_awq_clip.yaml | 34 +
 .../ptq/presets/model/nvfp4_awq_full.yaml | 35 +
 .../ptq/presets/model/nvfp4_awq_lite.yaml | 33 +
 .../ptq/presets/model/nvfp4_experts_only.yaml | 45 ++
 .../ptq/presets/model/nvfp4_fp8_mha.yaml | 48 ++
 .../ptq/presets/model/nvfp4_mlp_only.yaml | 45 ++
 .../presets/model/nvfp4_mlp_weight_only.yaml | 33 +
 .../ptq/presets/model/nvfp4_omlp_only.yaml | 45 ++
 .../ptq/presets/model/nvfp4_svdquant.yaml | 35 +
 .../nvfp4_w4a4_weight_local_hessian.yaml | 36 +
 .../nvfp4_w4a4_weight_mse_fp8_sweep.yaml | 36 +
 .../ptq/presets/model/w4a8_awq_beta.yaml | 35 +
 .../ptq/presets/model/w4a8_mxfp4_fp8.yaml | 34 +
 .../ptq/presets/model/w4a8_nvfp4_fp8.yaml | 33 +
 modelopt_recipes/configs/ptq/units/README.md | 1 +
 .../units/mamba_moe_disabled_quantizers.yaml | 34 +
 tests/unit/recipe/test_loader.py | 64 +-
 48 files changed, 1625 insertions(+), 571 deletions(-)
 create mode 100644 modelopt_recipes/configs/numerics/int4.yaml
 create mode 100644 modelopt_recipes/configs/numerics/int8.yaml
 create mode 100644 modelopt_recipes/configs/numerics/mxfp4.yaml
 create mode 100644 modelopt_recipes/configs/numerics/mxfp6.yaml
 create mode 100644 modelopt_recipes/configs/numerics/mxfp8.yaml
 create mode 100644 modelopt_recipes/configs/numerics/mxint8.yaml
 create mode 100644 modelopt_recipes/configs/numerics/nvfp4_bs32.yaml
 create mode 100644 modelopt_recipes/configs/ptq/presets/kv/fp8_affine.yaml
 create mode 100644 modelopt_recipes/configs/ptq/presets/kv/nvfp4.yaml
 create mode 100644 modelopt_recipes/configs/ptq/presets/kv/nvfp4_affine.yaml
 create mode 100644 modelopt_recipes/configs/ptq/presets/kv/nvfp4_rotate.yaml
 create mode 100644 modelopt_recipes/configs/ptq/presets/model/fp8_2d_blockwise_weight_only.yaml
 create mode 100644 
modelopt_recipes/configs/ptq/presets/model/fp8_per_channel_per_token.yaml create mode 100644 modelopt_recipes/configs/ptq/presets/model/int4_awq.yaml create mode 100644 modelopt_recipes/configs/ptq/presets/model/int4_blockwise_weight_only.yaml create mode 100644 modelopt_recipes/configs/ptq/presets/model/int8.yaml create mode 100644 modelopt_recipes/configs/ptq/presets/model/int8_smoothquant.yaml create mode 100644 modelopt_recipes/configs/ptq/presets/model/int8_weight_only.yaml create mode 100644 modelopt_recipes/configs/ptq/presets/model/mamba_moe_fp8_aggressive.yaml create mode 100644 modelopt_recipes/configs/ptq/presets/model/mamba_moe_fp8_conservative.yaml create mode 100644 modelopt_recipes/configs/ptq/presets/model/mamba_moe_nvfp4_aggressive.yaml create mode 100644 modelopt_recipes/configs/ptq/presets/model/mamba_moe_nvfp4_conservative.yaml create mode 100644 modelopt_recipes/configs/ptq/presets/model/mxfp4.yaml create mode 100644 modelopt_recipes/configs/ptq/presets/model/mxfp4_mlp_weight_only.yaml create mode 100644 modelopt_recipes/configs/ptq/presets/model/mxfp6.yaml create mode 100644 modelopt_recipes/configs/ptq/presets/model/mxfp8.yaml create mode 100644 modelopt_recipes/configs/ptq/presets/model/mxint8.yaml create mode 100644 modelopt_recipes/configs/ptq/presets/model/nvfp4.yaml create mode 100644 modelopt_recipes/configs/ptq/presets/model/nvfp4_awq_clip.yaml create mode 100644 modelopt_recipes/configs/ptq/presets/model/nvfp4_awq_full.yaml create mode 100644 modelopt_recipes/configs/ptq/presets/model/nvfp4_awq_lite.yaml create mode 100644 modelopt_recipes/configs/ptq/presets/model/nvfp4_experts_only.yaml create mode 100644 modelopt_recipes/configs/ptq/presets/model/nvfp4_fp8_mha.yaml create mode 100644 modelopt_recipes/configs/ptq/presets/model/nvfp4_mlp_only.yaml create mode 100644 modelopt_recipes/configs/ptq/presets/model/nvfp4_mlp_weight_only.yaml create mode 100644 modelopt_recipes/configs/ptq/presets/model/nvfp4_omlp_only.yaml create mode 100644 modelopt_recipes/configs/ptq/presets/model/nvfp4_svdquant.yaml create mode 100644 modelopt_recipes/configs/ptq/presets/model/nvfp4_w4a4_weight_local_hessian.yaml create mode 100644 modelopt_recipes/configs/ptq/presets/model/nvfp4_w4a4_weight_mse_fp8_sweep.yaml create mode 100644 modelopt_recipes/configs/ptq/presets/model/w4a8_awq_beta.yaml create mode 100644 modelopt_recipes/configs/ptq/presets/model/w4a8_mxfp4_fp8.yaml create mode 100644 modelopt_recipes/configs/ptq/presets/model/w4a8_nvfp4_fp8.yaml create mode 100644 modelopt_recipes/configs/ptq/units/mamba_moe_disabled_quantizers.yaml diff --git a/modelopt/torch/opt/config_loader.py b/modelopt/torch/opt/config_loader.py index 76ed2bb650..1bf518c208 100644 --- a/modelopt/torch/opt/config_loader.py +++ b/modelopt/torch/opt/config_loader.py @@ -336,7 +336,21 @@ def _schema_equal(left: Any | None, right: Any | None) -> bool: def _list_element_schema(schema_type: Any | None) -> Any | None: """Return the element schema for a typed ``list[T]`` annotation.""" schema_type = _unwrap_schema_type(schema_type) - if get_origin(schema_type) is not list: + origin = get_origin(schema_type) + if origin in (UnionType, Union): + element_schemas = [ + element_schema + for arg in get_args(schema_type) + if (element_schema := _list_element_schema(arg)) is not None + ] + if len(element_schemas) == 1: + return element_schemas[0] + if element_schemas and all( + _schema_equal(element_schemas[0], item) for item in element_schemas[1:] + ): + return element_schemas[0] + return None + if origin is not 
list: return None args = get_args(schema_type) if len(args) != 1 or args[0] is Any: diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py index acee22fa99..de27167865 100644 --- a/modelopt/torch/quantization/config.py +++ b/modelopt/torch/quantization/config.py @@ -150,7 +150,6 @@ """ -import copy import warnings from collections.abc import Mapping, Sequence from typing import Any, Literal @@ -1199,577 +1198,164 @@ class _QuantizeExportConfig(ModeloptBaseConfig): """An empty config.""" -# Shared snippet constants are dumped back to plain dicts before being spliced into -# the public quant config constants below. ``load_config`` returns validated -# ``QuantizerCfgEntry`` instances for schema-tagged files, but the public constants -# (``INT4_AWQ_CFG``, ``NVFP4_DEFAULT_CFG``, etc.) have always been raw dict/list trees; -# splatting schema instances into them would surprise callers that serialise the -# constants or do ``isinstance(entry, dict)`` checks. ``exclude_unset=True`` keeps the -# sparse YAML shape (only the explicitly set fields) so the dumped dicts are -# byte-identical to what authors wrote in the YAML snippets. -_base_disable_all: list[dict[str, Any]] = [ - load_config("configs/ptq/units/base_disable_all").model_dump(exclude_unset=True) -] - -_default_disabled_quantizer_cfg: list[dict[str, Any]] = [ - entry.model_dump(exclude_unset=True) - for entry in load_config("configs/ptq/units/default_disabled_quantizers") -] - -_mamba_moe_disabled_quantizer_cfg: list[dict[str, Any]] = [ - {"quantizer_name": "*fc1_latent_proj*", "enable": False}, # Skip Latent MOE - {"quantizer_name": "*fc2_latent_proj*", "enable": False}, # Skip Latent MOE - {"quantizer_name": "*q_proj*", "enable": False}, # Skip QKV Linear (HF naming) - {"quantizer_name": "*k_proj*", "enable": False}, # Skip QKV Linear (HF naming) - {"quantizer_name": "*v_proj*", "enable": False}, # Skip QKV Linear (HF naming) - {"quantizer_name": "*o_proj*", "enable": False}, # Skip QKV Output Projection (HF naming) - { - "quantizer_name": "*self_attention.linear_qkv*", - "enable": False, - }, # Skip QKV Linear (Mcore naming) - { - "quantizer_name": "*self_attention.linear_proj*", - "enable": False, - }, # Skip QKV Output Projection (Mcore naming) -] - -INT8_DEFAULT_CFG = { - "quant_cfg": [ - *_base_disable_all, - {"quantizer_name": "*weight_quantizer", "cfg": {"num_bits": 8, "axis": 0}}, - {"quantizer_name": "*input_quantizer", "cfg": {"num_bits": 8, "axis": None}}, - *_default_disabled_quantizer_cfg, - ], - "algorithm": "max", -} - -INT8_SMOOTHQUANT_CFG = { - "quant_cfg": [ - *_base_disable_all, - {"quantizer_name": "*weight_quantizer", "cfg": {"num_bits": 8, "axis": 0}}, - {"quantizer_name": "*input_quantizer", "cfg": {"num_bits": 8, "axis": None}}, - *_default_disabled_quantizer_cfg, - ], - "algorithm": "smoothquant", -} +def _load_quantize_config(config_path: str) -> QuantizeConfig: + """Load a schema-backed QuantizeConfig YAML.""" + config = load_config(config_path, schema_type=QuantizeConfig) + if isinstance(config, QuantizeConfig): + return config + if isinstance(config, Mapping): + return QuantizeConfig.model_validate(config) + raise TypeError(f"{config_path} must declare QuantizeConfig.") + + +def _load_quantizer_attribute_dict(config_path: str) -> dict[str, Any]: + """Load a schema-backed QuantizerAttributeConfig YAML as a public dict.""" + config = load_config(config_path, schema_type=QuantizerAttributeConfig) + if isinstance(config, QuantizerAttributeConfig): + return 
config.model_dump(exclude_unset=True) + if isinstance(config, Mapping): + return dict(config) + raise TypeError(f"{config_path} must declare QuantizerAttributeConfig.") + + +def _quantizer_cfg_entry_to_dict(entry: QuantizerCfgEntry | Mapping[str, Any]) -> dict[str, Any]: + """Dump a typed quant_cfg entry back to the public legacy dict shape.""" + if isinstance(entry, QuantizerCfgEntry): + return entry.model_dump(exclude_unset=True) + if isinstance(entry, Mapping): + return dict(entry) + raise TypeError(f"Expected QuantizerCfgEntry or mapping, got {type(entry).__name__}.") + + +def _load_quantizer_cfg_dict_list(config_path: str) -> list[dict[str, Any]]: + """Load a QuantizerCfgEntry or QuantizerCfgListConfig snippet as public dict entries.""" + config = load_config(config_path) + if isinstance(config, QuantizerCfgEntry): + return [_quantizer_cfg_entry_to_dict(config)] + if isinstance(config, list): + entries = [] + for entry in config: + if not isinstance(entry, (QuantizerCfgEntry, Mapping)): + raise TypeError( + f"Expected QuantizerCfgEntry or mapping, got {type(entry).__name__}." + ) + entries.append(_quantizer_cfg_entry_to_dict(entry)) + return entries + if isinstance(config, Mapping): + return [_quantizer_cfg_entry_to_dict(config)] + raise TypeError(f"{config_path} must declare QuantizerCfgEntry or QuantizerCfgListConfig.") -INT8_WEIGHT_ONLY_CFG = { - "quant_cfg": [ - *_base_disable_all, - {"quantizer_name": "*weight_quantizer", "cfg": {"num_bits": 8, "axis": 0}}, - {"quantizer_name": "*input_quantizer", "enable": False}, - *_default_disabled_quantizer_cfg, - ], - "algorithm": "max", -} -FP8_DEFAULT_CFG: dict[str, Any] = load_config("configs/ptq/presets/model/fp8").model_dump( - exclude_unset=True +_base_disable_all: list[dict[str, Any]] = _load_quantizer_cfg_dict_list( + "configs/ptq/units/base_disable_all" ) -MAMBA_MOE_FP8_AGGRESSIVE_CFG = { - "quant_cfg": [ - *_base_disable_all, - { - "quantizer_name": "*weight_quantizer", - "cfg": {"num_bits": (4, 3), "axis": None}, - }, - { - "quantizer_name": "*input_quantizer", - "cfg": {"num_bits": (4, 3), "axis": None}, - }, - *_default_disabled_quantizer_cfg, - *_mamba_moe_disabled_quantizer_cfg, - ], - "algorithm": "max", -} - -MAMBA_MOE_FP8_CONSERVATIVE_CFG = { - "quant_cfg": [ - *_base_disable_all, - { - "quantizer_name": "*weight_quantizer", - "cfg": {"num_bits": (4, 3), "axis": None}, - }, - { - "quantizer_name": "*input_quantizer", - "cfg": {"num_bits": (4, 3), "axis": None}, - }, - *_default_disabled_quantizer_cfg, - *_mamba_moe_disabled_quantizer_cfg, - {"quantizer_name": "*mixer.in_proj*", "enable": False}, # Skip mamba linear - {"quantizer_name": "*mixer.out_proj*", "enable": False}, # Skip mamba linear - ], - "algorithm": "max", -} - -FP8_PER_CHANNEL_PER_TOKEN_CFG = { - "quant_cfg": [ - *_base_disable_all, - {"quantizer_name": "*weight_quantizer", "cfg": {"num_bits": (4, 3), "axis": 0}}, - { - "quantizer_name": "*input_quantizer", - "cfg": { - "num_bits": (4, 3), - "type": "dynamic", - "block_sizes": {-1: None}, - }, - }, - *_default_disabled_quantizer_cfg, - ], - "algorithm": "max", -} - -# FP8 2D blockwise fake quantization config for deepseek models -FP8_2D_BLOCKWISE_WEIGHT_ONLY_CFG = { - "quant_cfg": [ - *_base_disable_all, - { - "quantizer_name": "*weight_quantizer", - "cfg": { - "num_bits": (4, 3), - "block_sizes": {-1: 128, -2: 128}, - }, - }, - {"quantizer_name": "*input_quantizer", "enable": False}, - *_default_disabled_quantizer_cfg, - ], - "algorithm": "max", -} - -INT4_BLOCKWISE_WEIGHT_ONLY_CFG = { - "quant_cfg": [ - 
*_base_disable_all, - { - "quantizer_name": "*weight_quantizer", - "cfg": { - "num_bits": 4, - "block_sizes": {-1: 128}, - }, - }, - {"quantizer_name": "*input_quantizer", "enable": False}, - *_default_disabled_quantizer_cfg, - ], - "algorithm": "max", -} - - -INT4_AWQ_CFG = { - "quant_cfg": [ - *_base_disable_all, - { - "quantizer_name": "*weight_quantizer", - "cfg": { - "num_bits": 4, - "block_sizes": {-1: 128, "type": "static"}, - }, - }, - {"quantizer_name": "*input_quantizer", "enable": False}, - *_default_disabled_quantizer_cfg, - ], - "algorithm": {"method": "awq_lite", "alpha_step": 0.1}, - # "algorithm": {"method": "awq_full", "alpha_step": 0.1, "max_co_batch_size": 1024}, - # "algorithm": {"method": "awq_clip", "max_co_batch_size": 2048}, -} - -# W4A8 currently uses INT4 blockwise quantization (block size = 128) followed by FP8 quantization -# for weights. This could change in the future -W4A8_AWQ_BETA_CFG = { - "quant_cfg": [ - *_base_disable_all, - { - "quantizer_name": "*weight_quantizer", - "cfg": [ - { - "num_bits": 4, - "block_sizes": {-1: 128, "type": "static"}, - }, - { - "num_bits": (4, 3), - }, - ], - }, - { - "quantizer_name": "*input_quantizer", - "cfg": { - "num_bits": (4, 3), - }, - }, - *_default_disabled_quantizer_cfg, - ], - "algorithm": "awq_lite", -} - -MXFP8_DEFAULT_CFG = { - "quant_cfg": [ - *_base_disable_all, - { - "quantizer_name": "*weight_quantizer", - "cfg": { - "num_bits": (4, 3), - "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - }, - }, - { - "quantizer_name": "*input_quantizer", - "cfg": { - "num_bits": (4, 3), - "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - }, - }, - *_default_disabled_quantizer_cfg, - ], - "algorithm": None, -} +_default_disabled_quantizer_cfg: list[dict[str, Any]] = _load_quantizer_cfg_dict_list( + "configs/ptq/units/default_disabled_quantizers" +) -MXFP6_DEFAULT_CFG = { - "quant_cfg": [ - *_base_disable_all, - { - "quantizer_name": "*weight_quantizer", - "cfg": { - "num_bits": (3, 2), - "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - }, - }, - { - "quantizer_name": "*input_quantizer", - "cfg": { - "num_bits": (3, 2), - "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - }, - }, - *_default_disabled_quantizer_cfg, - ], - "algorithm": None, -} +_mamba_moe_disabled_quantizer_cfg: list[dict[str, Any]] = _load_quantizer_cfg_dict_list( + "configs/ptq/units/mamba_moe_disabled_quantizers" +) -MXFP4_DEFAULT_CFG = { - "quant_cfg": [ - *_base_disable_all, - { - "quantizer_name": "*weight_quantizer", - "cfg": { - "num_bits": (2, 1), - "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - }, - }, - { - "quantizer_name": "*input_quantizer", - "cfg": { - "num_bits": (2, 1), - "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - }, - }, - *_default_disabled_quantizer_cfg, - ], - "algorithm": None, -} +_nvfp4_cfg: dict[str, Any] = _load_quantizer_attribute_dict("configs/numerics/nvfp4") -W4A8_MXFP4_FP8_CFG = { - "quant_cfg": [ - *_base_disable_all, - { - "quantizer_name": "*weight_quantizer", - "cfg": { - "num_bits": (2, 1), - "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - }, - }, - { - "quantizer_name": "*input_quantizer", - "cfg": {"num_bits": (4, 3), "axis": None}, - }, - *_default_disabled_quantizer_cfg, - ], - "algorithm": None, -} +_nvfp4_cfg_bs32: dict[str, Any] = _load_quantizer_attribute_dict("configs/numerics/nvfp4_bs32") -MXINT8_DEFAULT_CFG = { - "quant_cfg": [ - *_base_disable_all, - { - "quantizer_name": 
"*weight_quantizer", - "cfg": { - "num_bits": 8, - "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - }, - }, - { - "quantizer_name": "*input_quantizer", - "cfg": { - "num_bits": 8, - "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - }, - }, - *_default_disabled_quantizer_cfg, - ], - "algorithm": None, -} +INT8_DEFAULT_CFG: QuantizeConfig = _load_quantize_config("configs/ptq/presets/model/int8") +INT8_SMOOTHQUANT_CFG: QuantizeConfig = _load_quantize_config( + "configs/ptq/presets/model/int8_smoothquant" +) +INT8_WEIGHT_ONLY_CFG: QuantizeConfig = _load_quantize_config( + "configs/ptq/presets/model/int8_weight_only" +) +FP8_DEFAULT_CFG: QuantizeConfig = _load_quantize_config("configs/ptq/presets/model/fp8") +MAMBA_MOE_FP8_AGGRESSIVE_CFG: QuantizeConfig = _load_quantize_config( + "configs/ptq/presets/model/mamba_moe_fp8_aggressive" +) +MAMBA_MOE_FP8_CONSERVATIVE_CFG: QuantizeConfig = _load_quantize_config( + "configs/ptq/presets/model/mamba_moe_fp8_conservative" +) +FP8_PER_CHANNEL_PER_TOKEN_CFG: QuantizeConfig = _load_quantize_config( + "configs/ptq/presets/model/fp8_per_channel_per_token" +) +FP8_2D_BLOCKWISE_WEIGHT_ONLY_CFG: QuantizeConfig = _load_quantize_config( + "configs/ptq/presets/model/fp8_2d_blockwise_weight_only" +) +INT4_BLOCKWISE_WEIGHT_ONLY_CFG: QuantizeConfig = _load_quantize_config( + "configs/ptq/presets/model/int4_blockwise_weight_only" +) +INT4_AWQ_CFG: QuantizeConfig = _load_quantize_config("configs/ptq/presets/model/int4_awq") +W4A8_AWQ_BETA_CFG: QuantizeConfig = _load_quantize_config( + "configs/ptq/presets/model/w4a8_awq_beta" +) +MXFP8_DEFAULT_CFG: QuantizeConfig = _load_quantize_config("configs/ptq/presets/model/mxfp8") +MXFP6_DEFAULT_CFG: QuantizeConfig = _load_quantize_config("configs/ptq/presets/model/mxfp6") +MXFP4_DEFAULT_CFG: QuantizeConfig = _load_quantize_config("configs/ptq/presets/model/mxfp4") +W4A8_MXFP4_FP8_CFG: QuantizeConfig = _load_quantize_config( + "configs/ptq/presets/model/w4a8_mxfp4_fp8" +) +MXINT8_DEFAULT_CFG: QuantizeConfig = _load_quantize_config("configs/ptq/presets/model/mxint8") # KV-cache configs are designed to be merged with a primary quantization config (e.g. # FP8_DEFAULT_CFG) that already contains _base_disable_all. They intentionally omit both # _base_disable_all and "algorithm" because these are provided by the primary config. -FP8_KV_CFG: dict[str, Any] = load_config("configs/ptq/presets/kv/fp8").model_dump( - exclude_unset=True -) - -FP8_AFFINE_KV_CFG = { - "quant_cfg": [ - { - "quantizer_name": "*[kv]_bmm_quantizer", - "cfg": { - "num_bits": (4, 3), - "bias": {-2: None, -4: None, "type": "static"}, - }, - }, - ] -} - -_nvfp4_cfg = { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, -} +FP8_KV_CFG: QuantizeConfig = _load_quantize_config("configs/ptq/presets/kv/fp8") +FP8_AFFINE_KV_CFG: QuantizeConfig = _load_quantize_config("configs/ptq/presets/kv/fp8_affine") -_nvfp4_cfg_bs32 = { - "num_bits": (2, 1), - "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (4, 3)}, -} - - -def _nvfp4_selective_quant_cfg( - layer_patterns: list[str], - *, - quantizer: dict = _nvfp4_cfg, - weight_only: bool = False, - algorithm: str | dict = "max", -) -> dict: - """Build an NVFP4 config that quantizes only the specified layer patterns.""" - quant_cfg: list[dict[str, Any]] = [] - quant_cfg.extend(_base_disable_all) - for pattern in layer_patterns: - # Deep-copy the quantizer dict so each config constant gets its own instance. 
- quant_cfg.append( - {"quantizer_name": f"{pattern}weight_quantizer", "cfg": copy.deepcopy(quantizer)} - ) - if not weight_only: - quant_cfg.append( - {"quantizer_name": f"{pattern}input_quantizer", "cfg": copy.deepcopy(quantizer)} - ) - quant_cfg.extend(_default_disabled_quantizer_cfg) - return {"quant_cfg": quant_cfg, "algorithm": algorithm} - - -NVFP4_DEFAULT_CFG = _nvfp4_selective_quant_cfg(["*"]) - -NVFP4_W4A4_WEIGHT_MSE_FP8_SWEEP_CFG = { - "quant_cfg": [ - *_base_disable_all, - { - "quantizer_name": "*weight_quantizer", - "cfg": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "static", "scale_bits": (4, 3)}, - }, - }, - {"quantizer_name": "*input_quantizer", "cfg": _nvfp4_cfg}, - *_default_disabled_quantizer_cfg, - ], - "algorithm": { - "method": "mse", - "fp8_scale_sweep": True, - }, -} - -NVFP4_W4A4_WEIGHT_LOCAL_HESSIAN_CFG = { - "quant_cfg": [ - *_base_disable_all, - { - "quantizer_name": "*weight_quantizer", - "cfg": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "static", "scale_bits": (4, 3)}, - }, - }, - {"quantizer_name": "*input_quantizer", "cfg": _nvfp4_cfg}, - *_default_disabled_quantizer_cfg, - ], - "algorithm": { - "method": "local_hessian", - "fp8_scale_sweep": True, - }, -} - -MAMBA_MOE_NVFP4_AGGRESSIVE_CFG = { - "quant_cfg": [ - *_base_disable_all, - {"quantizer_name": "*weight_quantizer", "cfg": _nvfp4_cfg}, - {"quantizer_name": "*input_quantizer", "cfg": _nvfp4_cfg}, - *_default_disabled_quantizer_cfg, - *_mamba_moe_disabled_quantizer_cfg, - ], - "algorithm": "max", -} -MAMBA_MOE_NVFP4_CONSERVATIVE_CFG = { - "quant_cfg": [ - *_base_disable_all, - {"quantizer_name": "*weight_quantizer", "cfg": _nvfp4_cfg}, - {"quantizer_name": "*input_quantizer", "cfg": _nvfp4_cfg}, - *_default_disabled_quantizer_cfg, - *_mamba_moe_disabled_quantizer_cfg, - {"quantizer_name": "*mixer.in_proj*", "enable": False}, # Skip mamba linear - {"quantizer_name": "*mixer.out_proj*", "enable": False}, # Skip mamba linear - ], - "algorithm": "max", -} - -NVFP4_AWQ_LITE_CFG = _nvfp4_selective_quant_cfg(["*"], algorithm="awq_lite") - -NVFP4_AWQ_CLIP_CFG = _nvfp4_selective_quant_cfg(["*"], algorithm={"method": "awq_clip"}) - -NVFP4_AWQ_FULL_CFG = _nvfp4_selective_quant_cfg( - ["*"], algorithm={"method": "awq_full", "alpha_step": 0.1} +NVFP4_DEFAULT_CFG: QuantizeConfig = _load_quantize_config("configs/ptq/presets/model/nvfp4") +NVFP4_W4A4_WEIGHT_MSE_FP8_SWEEP_CFG: QuantizeConfig = _load_quantize_config( + "configs/ptq/presets/model/nvfp4_w4a4_weight_mse_fp8_sweep" ) - -# See comment above FP8_KV_CFG — KV-cache configs omit _base_disable_all and "algorithm". 
-NVFP4_AFFINE_KV_CFG = { - "quant_cfg": [ - { - "quantizer_name": "*[kv]_bmm_quantizer", - "cfg": { - **_nvfp4_cfg, - "bias": {-2: None, -4: None, "type": "static"}, - }, - }, - ] -} - -NVFP4_KV_CFG = { - "quant_cfg": [ - {"quantizer_name": "*[kv]_bmm_quantizer", "cfg": _nvfp4_cfg}, - ] -} - -# Moved from examples/diffusers/quantization/config.py to here -NVFP4_FP8_MHA_CONFIG = { - "quant_cfg": [ - *_base_disable_all, - {"quantizer_name": "*weight_quantizer", "cfg": _nvfp4_cfg}, - {"quantizer_name": "*input_quantizer", "cfg": _nvfp4_cfg}, - {"quantizer_name": "*output_quantizer", "enable": False}, - { - "quantizer_name": "*q_bmm_quantizer", - "cfg": { - "num_bits": (4, 3), - }, - }, - { - "quantizer_name": "*k_bmm_quantizer", - "cfg": { - "num_bits": (4, 3), - }, - }, - { - "quantizer_name": "*v_bmm_quantizer", - "cfg": { - "num_bits": (4, 3), - }, - }, - { - "quantizer_name": "*softmax_quantizer", - "cfg": { - "num_bits": (4, 3), - }, - }, - { - "quantizer_name": "transformer_blocks*bmm2_output_quantizer", - "cfg": { - "num_bits": (4, 3), - }, - }, - ], - "algorithm": "max", -} - -# See comment above FP8_KV_CFG — KV-cache configs omit _base_disable_all and "algorithm". -NVFP4_KV_ROTATE_CFG = { - "quant_cfg": [ - { - # q_bmm is disabled but pre-configured with rotate=True so that downstream - # code can inspect the rotate flag even while the quantizer is off. - "quantizer_name": "*q_bmm_quantizer", - "cfg": { - "rotate": True, - }, - "enable": False, - }, - { - "quantizer_name": "*k_bmm_quantizer", - "cfg": { - **_nvfp4_cfg, - "rotate": True, - }, - }, - {"quantizer_name": "*v_bmm_quantizer", "cfg": _nvfp4_cfg}, - ], - "algorithm": "max", -} - -NVFP4_SVDQUANT_DEFAULT_CFG = _nvfp4_selective_quant_cfg( - ["*"], algorithm={"method": "svdquant", "lowrank": 32} +NVFP4_W4A4_WEIGHT_LOCAL_HESSIAN_CFG: QuantizeConfig = _load_quantize_config( + "configs/ptq/presets/model/nvfp4_w4a4_weight_local_hessian" ) - -W4A8_NVFP4_FP8_CFG = { - "quant_cfg": [ - *_base_disable_all, - { - "quantizer_name": "*weight_quantizer", - "cfg": { - "num_bits": (2, 1), - "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (4, 3)}, - }, - }, - { - "quantizer_name": "*input_quantizer", - "cfg": { - "num_bits": (4, 3), - }, - }, - *_default_disabled_quantizer_cfg, - ], - "algorithm": "max", -} - -MXFP4_MLP_WEIGHT_ONLY_CFG = { - "quant_cfg": [ - *_base_disable_all, - { - "quantizer_name": "*mlp*weight_quantizer", - "cfg": { - "num_bits": (2, 1), - "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - }, - }, - { - "quantizer_name": "*block_sparse_moe*weight_quantizer", - "cfg": { - "num_bits": (2, 1), - "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - }, - }, - *_default_disabled_quantizer_cfg, - ], - "algorithm": None, -} - -NVFP4_MLP_WEIGHT_ONLY_CFG = _nvfp4_selective_quant_cfg( - ["*mlp*", "*block_sparse_moe*"], quantizer=_nvfp4_cfg_bs32, weight_only=True +MAMBA_MOE_NVFP4_AGGRESSIVE_CFG: QuantizeConfig = _load_quantize_config( + "configs/ptq/presets/model/mamba_moe_nvfp4_aggressive" +) +MAMBA_MOE_NVFP4_CONSERVATIVE_CFG: QuantizeConfig = _load_quantize_config( + "configs/ptq/presets/model/mamba_moe_nvfp4_conservative" +) +NVFP4_AWQ_LITE_CFG: QuantizeConfig = _load_quantize_config( + "configs/ptq/presets/model/nvfp4_awq_lite" +) +NVFP4_AWQ_CLIP_CFG: QuantizeConfig = _load_quantize_config( + "configs/ptq/presets/model/nvfp4_awq_clip" +) +NVFP4_AWQ_FULL_CFG: QuantizeConfig = _load_quantize_config( + "configs/ptq/presets/model/nvfp4_awq_full" +) +NVFP4_AFFINE_KV_CFG: QuantizeConfig = 
_load_quantize_config( + "configs/ptq/presets/kv/nvfp4_affine" +) +NVFP4_KV_CFG: QuantizeConfig = _load_quantize_config("configs/ptq/presets/kv/nvfp4") +NVFP4_FP8_MHA_CONFIG: QuantizeConfig = _load_quantize_config( + "configs/ptq/presets/model/nvfp4_fp8_mha" +) +NVFP4_KV_ROTATE_CFG: QuantizeConfig = _load_quantize_config( + "configs/ptq/presets/kv/nvfp4_rotate" +) +NVFP4_SVDQUANT_DEFAULT_CFG: QuantizeConfig = _load_quantize_config( + "configs/ptq/presets/model/nvfp4_svdquant" +) +W4A8_NVFP4_FP8_CFG: QuantizeConfig = _load_quantize_config( + "configs/ptq/presets/model/w4a8_nvfp4_fp8" +) +MXFP4_MLP_WEIGHT_ONLY_CFG: QuantizeConfig = _load_quantize_config( + "configs/ptq/presets/model/mxfp4_mlp_weight_only" +) +NVFP4_MLP_WEIGHT_ONLY_CFG: QuantizeConfig = _load_quantize_config( + "configs/ptq/presets/model/nvfp4_mlp_weight_only" +) +NVFP4_EXPERTS_ONLY_CFG: QuantizeConfig = _load_quantize_config( + "configs/ptq/presets/model/nvfp4_experts_only" +) +NVFP4_MLP_ONLY_CFG: QuantizeConfig = _load_quantize_config( + "configs/ptq/presets/model/nvfp4_mlp_only" ) -NVFP4_EXPERTS_ONLY_CFG = _nvfp4_selective_quant_cfg( - ["*mlp.experts*", "*block_sparse_moe*", "*.experts.*"] +NVFP4_OMLP_ONLY_CFG: QuantizeConfig = _load_quantize_config( + "configs/ptq/presets/model/nvfp4_omlp_only" ) -NVFP4_MLP_ONLY_CFG = _nvfp4_selective_quant_cfg(["*mlp*", "*block_sparse_moe*", "*.experts.*"]) -NVFP4_OMLP_ONLY_CFG = _nvfp4_selective_quant_cfg(["*o_proj*", "*mlp*", "*block_sparse_moe*"]) # DO NOT ADD NEW CONFIGS HERE. If you want to add a new general recipe, add it to # modelopt_recipes/general/ptq/ as a yaml file @@ -1785,6 +1371,7 @@ def _nvfp4_selective_quant_cfg( "INT8_SMOOTHQUANT_CFG", "INT8_WEIGHT_ONLY_CFG", "MXFP4_DEFAULT_CFG", + "MXFP6_DEFAULT_CFG", "MXFP8_DEFAULT_CFG", "MXINT8_DEFAULT_CFG", "NVFP4_AFFINE_KV_CFG", @@ -1808,6 +1395,7 @@ def _nvfp4_selective_quant_cfg( "MAMBA_MOE_NVFP4_AGGRESSIVE_CFG", "MAMBA_MOE_FP8_CONSERVATIVE_CFG", "MAMBA_MOE_FP8_AGGRESSIVE_CFG", + "NVFP4_W4A4_WEIGHT_LOCAL_HESSIAN_CFG", "NVFP4_W4A4_WEIGHT_MSE_FP8_SWEEP_CFG", } diff --git a/modelopt_recipes/configs/numerics/int4.yaml b/modelopt_recipes/configs/numerics/int4.yaml new file mode 100644 index 0000000000..a21f7827cf --- /dev/null +++ b/modelopt_recipes/configs/numerics/int4.yaml @@ -0,0 +1,22 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# INT4 static blockwise quantizer attributes. + +# modelopt-schema: modelopt.torch.quantization.config.QuantizerAttributeConfig +num_bits: 4 +block_sizes: + -1: 128 + type: static diff --git a/modelopt_recipes/configs/numerics/int8.yaml b/modelopt_recipes/configs/numerics/int8.yaml new file mode 100644 index 0000000000..9e00034903 --- /dev/null +++ b/modelopt_recipes/configs/numerics/int8.yaml @@ -0,0 +1,20 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# INT8 per-channel quantizer attributes. + +# modelopt-schema: modelopt.torch.quantization.config.QuantizerAttributeConfig +num_bits: 8 +axis: 0 diff --git a/modelopt_recipes/configs/numerics/mxfp4.yaml b/modelopt_recipes/configs/numerics/mxfp4.yaml new file mode 100644 index 0000000000..f5fe94cf4a --- /dev/null +++ b/modelopt_recipes/configs/numerics/mxfp4.yaml @@ -0,0 +1,23 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# MXFP4 dynamic block quantizer attributes with E8M0 scales. + +# modelopt-schema: modelopt.torch.quantization.config.QuantizerAttributeConfig +num_bits: e2m1 +block_sizes: + -1: 32 + type: dynamic + scale_bits: e8m0 diff --git a/modelopt_recipes/configs/numerics/mxfp6.yaml b/modelopt_recipes/configs/numerics/mxfp6.yaml new file mode 100644 index 0000000000..b114c141c9 --- /dev/null +++ b/modelopt_recipes/configs/numerics/mxfp6.yaml @@ -0,0 +1,23 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# MXFP6 dynamic block quantizer attributes with E8M0 scales. + +# modelopt-schema: modelopt.torch.quantization.config.QuantizerAttributeConfig +num_bits: e3m2 +block_sizes: + -1: 32 + type: dynamic + scale_bits: e8m0 diff --git a/modelopt_recipes/configs/numerics/mxfp8.yaml b/modelopt_recipes/configs/numerics/mxfp8.yaml new file mode 100644 index 0000000000..44f42ab719 --- /dev/null +++ b/modelopt_recipes/configs/numerics/mxfp8.yaml @@ -0,0 +1,23 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# MXFP8 dynamic block quantizer attributes with E8M0 scales.
+
+# modelopt-schema: modelopt.torch.quantization.config.QuantizerAttributeConfig
+num_bits: e4m3
+block_sizes:
+  -1: 32
+  type: dynamic
+  scale_bits: e8m0
diff --git a/modelopt_recipes/configs/numerics/mxint8.yaml b/modelopt_recipes/configs/numerics/mxint8.yaml
new file mode 100644
index 0000000000..cbf8eaa161
--- /dev/null
+++ b/modelopt_recipes/configs/numerics/mxint8.yaml
@@ -0,0 +1,23 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# MXINT8 dynamic block quantizer attributes with E8M0 scales.
+
+# modelopt-schema: modelopt.torch.quantization.config.QuantizerAttributeConfig
+num_bits: 8
+block_sizes:
+  -1: 32
+  type: dynamic
+  scale_bits: e8m0
diff --git a/modelopt_recipes/configs/numerics/nvfp4_bs32.yaml b/modelopt_recipes/configs/numerics/nvfp4_bs32.yaml
new file mode 100644
index 0000000000..57392366cd
--- /dev/null
+++ b/modelopt_recipes/configs/numerics/nvfp4_bs32.yaml
@@ -0,0 +1,23 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# NVFP4 E2M1 blockwise quantizer attributes with FP8 E4M3 scales and block size 32.
+
+# modelopt-schema: modelopt.torch.quantization.config.QuantizerAttributeConfig
+num_bits: e2m1
+block_sizes:
+  -1: 32
+  type: dynamic
+  scale_bits: e4m3
diff --git a/modelopt_recipes/configs/ptq/presets/README.md b/modelopt_recipes/configs/ptq/presets/README.md
index 3ab307fe45..b07f989ffe 100644
--- a/modelopt_recipes/configs/ptq/presets/README.md
+++ b/modelopt_recipes/configs/ptq/presets/README.md
@@ -1,7 +1,7 @@
 # PTQ Preset Configs
 
 This directory holds preset quantization configurations that serve as the
-YAML source of truth for the hardcoded `*_CFG` dicts in
+YAML source of truth for the `*_CFG` constants exposed from
 `modelopt.torch.quantization.config` (e.g., `FP8_DEFAULT_CFG`, `FP8_KV_CFG`).
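+
+Each preset declares its schema via a `# modelopt-schema:` comment and
+composes shared fragments through `imports` / `$import`. A minimal sketch of
+the pattern (mirroring `model/nvfp4.yaml`):
+
+```yaml
+# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig
+imports:
+  nvfp4: configs/numerics/nvfp4
+
+algorithm: max
+quant_cfg:
+  - quantizer_name: '*weight_quantizer'
+    cfg:
+      $import: nvfp4
+```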
diff --git a/modelopt_recipes/configs/ptq/presets/kv/fp8_affine.yaml b/modelopt_recipes/configs/ptq/presets/kv/fp8_affine.yaml new file mode 100644 index 0000000000..b0d95fbf00 --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/kv/fp8_affine.yaml @@ -0,0 +1,26 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# FP8 E4M3 affine KV cache quantization fragment. + +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +quant_cfg: + - quantizer_name: '*[kv]_bmm_quantizer' + cfg: + num_bits: e4m3 + bias: + -2: + -4: + type: static diff --git a/modelopt_recipes/configs/ptq/presets/kv/nvfp4.yaml b/modelopt_recipes/configs/ptq/presets/kv/nvfp4.yaml new file mode 100644 index 0000000000..5b61d3ba9e --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/kv/nvfp4.yaml @@ -0,0 +1,25 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NVFP4 KV cache quantization fragment. + +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + nvfp4: configs/numerics/nvfp4 + +quant_cfg: + - quantizer_name: '*[kv]_bmm_quantizer' + cfg: + $import: nvfp4 diff --git a/modelopt_recipes/configs/ptq/presets/kv/nvfp4_affine.yaml b/modelopt_recipes/configs/ptq/presets/kv/nvfp4_affine.yaml new file mode 100644 index 0000000000..4e306994d5 --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/kv/nvfp4_affine.yaml @@ -0,0 +1,29 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NVFP4 affine KV cache quantization fragment. 
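+# Mirrors the former NVFP4_AFFINE_KV_CFG dict: the NVFP4 numerics imported
+# below plus a static bias over dims -2 and -4 (both keys map to null,
+# matching the Python {-2: None, -4: None, "type": "static"}).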
+
+# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig
+imports:
+  nvfp4: configs/numerics/nvfp4
+
+quant_cfg:
+  - quantizer_name: '*[kv]_bmm_quantizer'
+    cfg:
+      $import: nvfp4
+      bias:
+        -2:
+        -4:
+        type: static
diff --git a/modelopt_recipes/configs/ptq/presets/kv/nvfp4_rotate.yaml b/modelopt_recipes/configs/ptq/presets/kv/nvfp4_rotate.yaml
new file mode 100644
index 0000000000..462f62f54d
--- /dev/null
+++ b/modelopt_recipes/configs/ptq/presets/kv/nvfp4_rotate.yaml
@@ -0,0 +1,36 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# NVFP4 KV cache rotate preset.
+
+# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig
+imports:
+  nvfp4: configs/numerics/nvfp4
+
+algorithm: max
+quant_cfg:
+  # q_bmm is disabled but pre-configured with rotate=true so that downstream
+  # code can inspect the rotate flag even while the quantizer is off.
+  - quantizer_name: '*q_bmm_quantizer'
+    cfg:
+      rotate: true
+    enable: false
+  - quantizer_name: '*k_bmm_quantizer'
+    cfg:
+      $import: nvfp4
+      rotate: true
+  - quantizer_name: '*v_bmm_quantizer'
+    cfg:
+      $import: nvfp4
diff --git a/modelopt_recipes/configs/ptq/presets/model/fp8_2d_blockwise_weight_only.yaml b/modelopt_recipes/configs/ptq/presets/model/fp8_2d_blockwise_weight_only.yaml
new file mode 100644
index 0000000000..d39e57451a
--- /dev/null
+++ b/modelopt_recipes/configs/ptq/presets/model/fp8_2d_blockwise_weight_only.yaml
@@ -0,0 +1,34 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# FP8 2D blockwise fake quantization config for DeepSeek models.
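+# Weights are quantized to FP8 (e4m3) in 128x128 blocks (the -1/-2 keys in
+# block_sizes below); '*input_quantizer' is disabled, so this is effectively
+# weight-only.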
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + +algorithm: max +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*weight_quantizer' + cfg: + num_bits: e4m3 + block_sizes: + -1: 128 + -2: 128 + - quantizer_name: '*input_quantizer' + enable: false + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/fp8_per_channel_per_token.yaml b/modelopt_recipes/configs/ptq/presets/model/fp8_per_channel_per_token.yaml new file mode 100644 index 0000000000..4df9cb1c82 --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/fp8_per_channel_per_token.yaml @@ -0,0 +1,36 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# FP8 per-channel weights and per-token dynamic activations. + +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + +algorithm: max +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*weight_quantizer' + cfg: + num_bits: e4m3 + axis: 0 + - quantizer_name: '*input_quantizer' + cfg: + num_bits: e4m3 + type: dynamic + block_sizes: + -1: + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/int4_awq.yaml b/modelopt_recipes/configs/ptq/presets/model/int4_awq.yaml new file mode 100644 index 0000000000..60c792801e --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/int4_awq.yaml @@ -0,0 +1,34 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# INT4 AWQ weight-only preset. 
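+# Mirrors the former INT4_AWQ_CFG: INT4 static 128-block weights imported
+# from configs/numerics/int4, inputs disabled, calibrated with awq_lite at
+# alpha_step 0.1 (the old dict listed awq_full / awq_clip as commented-out
+# alternatives).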
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + int4: configs/numerics/int4 + +algorithm: + method: awq_lite + alpha_step: 0.1 +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*weight_quantizer' + cfg: + $import: int4 + - quantizer_name: '*input_quantizer' + enable: false + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/int4_blockwise_weight_only.yaml b/modelopt_recipes/configs/ptq/presets/model/int4_blockwise_weight_only.yaml new file mode 100644 index 0000000000..2ade851eee --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/int4_blockwise_weight_only.yaml @@ -0,0 +1,33 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# INT4 blockwise weight-only preset. + +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + +algorithm: max +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*weight_quantizer' + cfg: + num_bits: 4 + block_sizes: + -1: 128 + - quantizer_name: '*input_quantizer' + enable: false + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/int8.yaml b/modelopt_recipes/configs/ptq/presets/model/int8.yaml new file mode 100644 index 0000000000..fc45fab956 --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/int8.yaml @@ -0,0 +1,34 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# INT8 per-channel weights and per-tensor activations, max calibration. 
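+# Weights import the per-channel INT8 numerics (num_bits: 8, axis: 0) from
+# configs/numerics/int8, while '*input_quantizer' sets num_bits: 8 with axis
+# left null, i.e. a single per-tensor scale.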
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + int8: configs/numerics/int8 + +algorithm: max +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*weight_quantizer' + cfg: + $import: int8 + - quantizer_name: '*input_quantizer' + cfg: + num_bits: 8 + axis: + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/int8_smoothquant.yaml b/modelopt_recipes/configs/ptq/presets/model/int8_smoothquant.yaml new file mode 100644 index 0000000000..274a607bb8 --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/int8_smoothquant.yaml @@ -0,0 +1,34 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# INT8 SmoothQuant preset. + +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + int8: configs/numerics/int8 + +algorithm: smoothquant +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*weight_quantizer' + cfg: + $import: int8 + - quantizer_name: '*input_quantizer' + cfg: + num_bits: 8 + axis: + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/int8_weight_only.yaml b/modelopt_recipes/configs/ptq/presets/model/int8_weight_only.yaml new file mode 100644 index 0000000000..002876cc9c --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/int8_weight_only.yaml @@ -0,0 +1,32 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# INT8 weight-only preset. 
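+# Same per-channel INT8 weight numerics as the int8 preset, but with
+# '*input_quantizer' disabled outright, so only weights are quantized.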
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + int8: configs/numerics/int8 + +algorithm: max +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*weight_quantizer' + cfg: + $import: int8 + - quantizer_name: '*input_quantizer' + enable: false + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/mamba_moe_fp8_aggressive.yaml b/modelopt_recipes/configs/ptq/presets/model/mamba_moe_fp8_aggressive.yaml new file mode 100644 index 0000000000..0b9f5b1b20 --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/mamba_moe_fp8_aggressive.yaml @@ -0,0 +1,35 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Mamba-MoE FP8 aggressive preset. + +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + fp8: configs/numerics/fp8 + mamba_moe_disabled_quantizers: configs/ptq/units/mamba_moe_disabled_quantizers + +algorithm: max +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*weight_quantizer' + cfg: + $import: fp8 + - quantizer_name: '*input_quantizer' + cfg: + $import: fp8 + - $import: default_disabled_quantizers + - $import: mamba_moe_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/mamba_moe_fp8_conservative.yaml b/modelopt_recipes/configs/ptq/presets/model/mamba_moe_fp8_conservative.yaml new file mode 100644 index 0000000000..be44d6dec8 --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/mamba_moe_fp8_conservative.yaml @@ -0,0 +1,39 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Mamba-MoE FP8 conservative preset. 
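+# Extends the aggressive preset: the trailing '*mixer.in_proj*' and
+# '*mixer.out_proj*' entries disable the Mamba linear projections, per the
+# "Skip mamba linear" note in the former Python dicts.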
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + fp8: configs/numerics/fp8 + mamba_moe_disabled_quantizers: configs/ptq/units/mamba_moe_disabled_quantizers + +algorithm: max +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*weight_quantizer' + cfg: + $import: fp8 + - quantizer_name: '*input_quantizer' + cfg: + $import: fp8 + - $import: default_disabled_quantizers + - $import: mamba_moe_disabled_quantizers + - quantizer_name: '*mixer.in_proj*' + enable: false + - quantizer_name: '*mixer.out_proj*' + enable: false diff --git a/modelopt_recipes/configs/ptq/presets/model/mamba_moe_nvfp4_aggressive.yaml b/modelopt_recipes/configs/ptq/presets/model/mamba_moe_nvfp4_aggressive.yaml new file mode 100644 index 0000000000..79d929a101 --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/mamba_moe_nvfp4_aggressive.yaml @@ -0,0 +1,35 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Mamba-MoE NVFP4 aggressive preset. + +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + mamba_moe_disabled_quantizers: configs/ptq/units/mamba_moe_disabled_quantizers + nvfp4: configs/numerics/nvfp4 + +algorithm: max +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*weight_quantizer' + cfg: + $import: nvfp4 + - quantizer_name: '*input_quantizer' + cfg: + $import: nvfp4 + - $import: default_disabled_quantizers + - $import: mamba_moe_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/mamba_moe_nvfp4_conservative.yaml b/modelopt_recipes/configs/ptq/presets/model/mamba_moe_nvfp4_conservative.yaml new file mode 100644 index 0000000000..fb525f402b --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/mamba_moe_nvfp4_conservative.yaml @@ -0,0 +1,39 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Mamba-MoE NVFP4 conservative preset. 
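+# Mirrors the former MAMBA_MOE_NVFP4_CONSERVATIVE_CFG: on top of the
+# aggressive preset, '*mixer.in_proj*' and '*mixer.out_proj*' are disabled
+# to skip the Mamba linear layers.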
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + mamba_moe_disabled_quantizers: configs/ptq/units/mamba_moe_disabled_quantizers + nvfp4: configs/numerics/nvfp4 + +algorithm: max +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*weight_quantizer' + cfg: + $import: nvfp4 + - quantizer_name: '*input_quantizer' + cfg: + $import: nvfp4 + - $import: default_disabled_quantizers + - $import: mamba_moe_disabled_quantizers + - quantizer_name: '*mixer.in_proj*' + enable: false + - quantizer_name: '*mixer.out_proj*' + enable: false diff --git a/modelopt_recipes/configs/ptq/presets/model/mxfp4.yaml b/modelopt_recipes/configs/ptq/presets/model/mxfp4.yaml new file mode 100644 index 0000000000..e9f3724297 --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/mxfp4.yaml @@ -0,0 +1,33 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# MXFP4 dynamic block quantization preset. + +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + mxfp4: configs/numerics/mxfp4 + +algorithm: +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*weight_quantizer' + cfg: + $import: mxfp4 + - quantizer_name: '*input_quantizer' + cfg: + $import: mxfp4 + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/mxfp4_mlp_weight_only.yaml b/modelopt_recipes/configs/ptq/presets/model/mxfp4_mlp_weight_only.yaml new file mode 100644 index 0000000000..e97d4c7d54 --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/mxfp4_mlp_weight_only.yaml @@ -0,0 +1,33 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# MXFP4 MLP weight-only preset. 
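+# Only the '*mlp*' and '*block_sparse_moe*' weight quantizers get the MXFP4
+# numerics; activations stay unquantized and algorithm is null (dynamic MX
+# scales need no calibration), matching the former MXFP4_MLP_WEIGHT_ONLY_CFG.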
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + mxfp4: configs/numerics/mxfp4 + +algorithm: +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*mlp*weight_quantizer' + cfg: + $import: mxfp4 + - quantizer_name: '*block_sparse_moe*weight_quantizer' + cfg: + $import: mxfp4 + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/mxfp6.yaml b/modelopt_recipes/configs/ptq/presets/model/mxfp6.yaml new file mode 100644 index 0000000000..c131388a8e --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/mxfp6.yaml @@ -0,0 +1,33 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# MXFP6 dynamic block quantization preset. + +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + mxfp6: configs/numerics/mxfp6 + +algorithm: +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*weight_quantizer' + cfg: + $import: mxfp6 + - quantizer_name: '*input_quantizer' + cfg: + $import: mxfp6 + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/mxfp8.yaml b/modelopt_recipes/configs/ptq/presets/model/mxfp8.yaml new file mode 100644 index 0000000000..17d4e64945 --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/mxfp8.yaml @@ -0,0 +1,33 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# MXFP8 dynamic block quantization preset. 
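+# Weight and input quantizers both import the MXFP8 numerics (e4m3 values in
+# dynamic 32-element blocks with e8m0 scales); algorithm is left null since
+# the dynamic block scales are computed at runtime rather than calibrated.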
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + mxfp8: configs/numerics/mxfp8 + +algorithm: +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*weight_quantizer' + cfg: + $import: mxfp8 + - quantizer_name: '*input_quantizer' + cfg: + $import: mxfp8 + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/mxint8.yaml b/modelopt_recipes/configs/ptq/presets/model/mxint8.yaml new file mode 100644 index 0000000000..2d0bb9959d --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/mxint8.yaml @@ -0,0 +1,33 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# MXINT8 dynamic block quantization preset. + +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + mxint8: configs/numerics/mxint8 + +algorithm: +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*weight_quantizer' + cfg: + $import: mxint8 + - quantizer_name: '*input_quantizer' + cfg: + $import: mxint8 + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/nvfp4.yaml b/modelopt_recipes/configs/ptq/presets/model/nvfp4.yaml new file mode 100644 index 0000000000..733d504807 --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/nvfp4.yaml @@ -0,0 +1,33 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NVFP4 default W4A4 preset. 
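+# The imported NVFP4 numerics should resolve to the former _nvfp4_cfg dict:
+# e2m1 values (num_bits (2, 1)) in dynamic 16-element blocks with e4m3
+# scales, applied to both weights and activations.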
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + nvfp4: configs/numerics/nvfp4 + +algorithm: max +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*weight_quantizer' + cfg: + $import: nvfp4 + - quantizer_name: '*input_quantizer' + cfg: + $import: nvfp4 + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/nvfp4_awq_clip.yaml b/modelopt_recipes/configs/ptq/presets/model/nvfp4_awq_clip.yaml new file mode 100644 index 0000000000..b0af7b5f9c --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/nvfp4_awq_clip.yaml @@ -0,0 +1,34 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NVFP4 AWQ clip preset. + +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + nvfp4: configs/numerics/nvfp4 + +algorithm: + method: awq_clip +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*weight_quantizer' + cfg: + $import: nvfp4 + - quantizer_name: '*input_quantizer' + cfg: + $import: nvfp4 + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/nvfp4_awq_full.yaml b/modelopt_recipes/configs/ptq/presets/model/nvfp4_awq_full.yaml new file mode 100644 index 0000000000..f3be6c9ef7 --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/nvfp4_awq_full.yaml @@ -0,0 +1,35 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NVFP4 AWQ full preset. 
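+# Same W4A4 NVFP4 layout as the default preset, but calibrated with the full
+# AWQ search (method awq_full, alpha_step 0.1) instead of plain max.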
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + nvfp4: configs/numerics/nvfp4 + +algorithm: + method: awq_full + alpha_step: 0.1 +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*weight_quantizer' + cfg: + $import: nvfp4 + - quantizer_name: '*input_quantizer' + cfg: + $import: nvfp4 + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/nvfp4_awq_lite.yaml b/modelopt_recipes/configs/ptq/presets/model/nvfp4_awq_lite.yaml new file mode 100644 index 0000000000..b1915216ec --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/nvfp4_awq_lite.yaml @@ -0,0 +1,33 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NVFP4 AWQ lite preset. + +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + nvfp4: configs/numerics/nvfp4 + +algorithm: awq_lite +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*weight_quantizer' + cfg: + $import: nvfp4 + - quantizer_name: '*input_quantizer' + cfg: + $import: nvfp4 + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/nvfp4_experts_only.yaml b/modelopt_recipes/configs/ptq/presets/model/nvfp4_experts_only.yaml new file mode 100644 index 0000000000..fe1e10c374 --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/nvfp4_experts_only.yaml @@ -0,0 +1,45 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NVFP4 experts-only preset. 
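+# Expands the former _nvfp4_selective_quant_cfg(["*mlp.experts*",
+# "*block_sparse_moe*", "*.experts.*"]) helper call: each pattern gets
+# explicit weight and input quantizer entries sharing the NVFP4 numerics.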
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + nvfp4: configs/numerics/nvfp4 + +algorithm: max +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*mlp.experts*weight_quantizer' + cfg: + $import: nvfp4 + - quantizer_name: '*mlp.experts*input_quantizer' + cfg: + $import: nvfp4 + - quantizer_name: '*block_sparse_moe*weight_quantizer' + cfg: + $import: nvfp4 + - quantizer_name: '*block_sparse_moe*input_quantizer' + cfg: + $import: nvfp4 + - quantizer_name: '*.experts.*weight_quantizer' + cfg: + $import: nvfp4 + - quantizer_name: '*.experts.*input_quantizer' + cfg: + $import: nvfp4 + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/nvfp4_fp8_mha.yaml b/modelopt_recipes/configs/ptq/presets/model/nvfp4_fp8_mha.yaml new file mode 100644 index 0000000000..831391416a --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/nvfp4_fp8_mha.yaml @@ -0,0 +1,48 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Diffusers NVFP4 preset with FP8 MHA quantizers. + +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + nvfp4: configs/numerics/nvfp4 + +algorithm: max +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*weight_quantizer' + cfg: + $import: nvfp4 + - quantizer_name: '*input_quantizer' + cfg: + $import: nvfp4 + - quantizer_name: '*output_quantizer' + enable: false + - quantizer_name: '*q_bmm_quantizer' + cfg: + num_bits: e4m3 + - quantizer_name: '*k_bmm_quantizer' + cfg: + num_bits: e4m3 + - quantizer_name: '*v_bmm_quantizer' + cfg: + num_bits: e4m3 + - quantizer_name: '*softmax_quantizer' + cfg: + num_bits: e4m3 + - quantizer_name: 'transformer_blocks*bmm2_output_quantizer' + cfg: + num_bits: e4m3 diff --git a/modelopt_recipes/configs/ptq/presets/model/nvfp4_mlp_only.yaml b/modelopt_recipes/configs/ptq/presets/model/nvfp4_mlp_only.yaml new file mode 100644 index 0000000000..46947a228d --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/nvfp4_mlp_only.yaml @@ -0,0 +1,45 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# NVFP4 MLP-only preset. + +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + nvfp4: configs/numerics/nvfp4 + +algorithm: max +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*mlp*weight_quantizer' + cfg: + $import: nvfp4 + - quantizer_name: '*mlp*input_quantizer' + cfg: + $import: nvfp4 + - quantizer_name: '*block_sparse_moe*weight_quantizer' + cfg: + $import: nvfp4 + - quantizer_name: '*block_sparse_moe*input_quantizer' + cfg: + $import: nvfp4 + - quantizer_name: '*.experts.*weight_quantizer' + cfg: + $import: nvfp4 + - quantizer_name: '*.experts.*input_quantizer' + cfg: + $import: nvfp4 + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/nvfp4_mlp_weight_only.yaml b/modelopt_recipes/configs/ptq/presets/model/nvfp4_mlp_weight_only.yaml new file mode 100644 index 0000000000..01aa438e2b --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/nvfp4_mlp_weight_only.yaml @@ -0,0 +1,33 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NVFP4 MLP weight-only preset with block size 32. + +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + nvfp4_bs32: configs/numerics/nvfp4_bs32 + +algorithm: max +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*mlp*weight_quantizer' + cfg: + $import: nvfp4_bs32 + - quantizer_name: '*block_sparse_moe*weight_quantizer' + cfg: + $import: nvfp4_bs32 + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/nvfp4_omlp_only.yaml b/modelopt_recipes/configs/ptq/presets/model/nvfp4_omlp_only.yaml new file mode 100644 index 0000000000..eee8c5cdb4 --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/nvfp4_omlp_only.yaml @@ -0,0 +1,45 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NVFP4 output-projection and MLP preset. 
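The MLP-only and MLP-weight-only presets above select quantizers purely by glob patterns over the quantizer's module path. The real matching happens inside modelopt; this fnmatch sketch only illustrates the wildcard semantics the patterns assume:

    # Illustrative only: mirrors the assumed glob semantics of quantizer_name.
    import fnmatch

    names = [
        "model.layers.0.mlp.up_proj.weight_quantizer",
        "model.layers.0.self_attn.q_proj.weight_quantizer",
        "model.layers.0.block_sparse_moe.experts.3.w1.input_quantizer",
    ]
    for pattern in ("*mlp*weight_quantizer", "*block_sparse_moe*input_quantizer"):
        print(pattern, [n for n in names if fnmatch.fnmatchcase(n, pattern)])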
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + nvfp4: configs/numerics/nvfp4 + +algorithm: max +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*o_proj*weight_quantizer' + cfg: + $import: nvfp4 + - quantizer_name: '*o_proj*input_quantizer' + cfg: + $import: nvfp4 + - quantizer_name: '*mlp*weight_quantizer' + cfg: + $import: nvfp4 + - quantizer_name: '*mlp*input_quantizer' + cfg: + $import: nvfp4 + - quantizer_name: '*block_sparse_moe*weight_quantizer' + cfg: + $import: nvfp4 + - quantizer_name: '*block_sparse_moe*input_quantizer' + cfg: + $import: nvfp4 + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/nvfp4_svdquant.yaml b/modelopt_recipes/configs/ptq/presets/model/nvfp4_svdquant.yaml new file mode 100644 index 0000000000..85f9aec7a5 --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/nvfp4_svdquant.yaml @@ -0,0 +1,35 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NVFP4 SVDQuant preset. + +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + nvfp4: configs/numerics/nvfp4 + +algorithm: + method: svdquant + lowrank: 32 +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*weight_quantizer' + cfg: + $import: nvfp4 + - quantizer_name: '*input_quantizer' + cfg: + $import: nvfp4 + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/nvfp4_w4a4_weight_local_hessian.yaml b/modelopt_recipes/configs/ptq/presets/model/nvfp4_w4a4_weight_local_hessian.yaml new file mode 100644 index 0000000000..c31e1619da --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/nvfp4_w4a4_weight_local_hessian.yaml @@ -0,0 +1,36 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NVFP4 W4A4 with static weight scales optimized by local Hessian calibration. 
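The SVDQuant preset above is another structured-`algorithm` example; its `lowrank` knob can be adjusted after loading. A sketch, assuming the validated config keeps `algorithm` as a mutable mapping:

    # Sketch; "lowrank" follows the YAML above, in-place mutability is assumed.
    from modelopt.torch.opt.config_loader import load_config
    from modelopt.torch.quantization.config import QuantizeConfig

    cfg = load_config("configs/ptq/presets/model/nvfp4_svdquant", schema_type=QuantizeConfig)
    cfg.algorithm["lowrank"] = 64  # the preset default above is 32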
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + nvfp4: configs/numerics/nvfp4 + nvfp4_static: configs/numerics/nvfp4_static + +algorithm: + method: local_hessian + fp8_scale_sweep: true +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*weight_quantizer' + cfg: + $import: nvfp4_static + - quantizer_name: '*input_quantizer' + cfg: + $import: nvfp4 + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/nvfp4_w4a4_weight_mse_fp8_sweep.yaml b/modelopt_recipes/configs/ptq/presets/model/nvfp4_w4a4_weight_mse_fp8_sweep.yaml new file mode 100644 index 0000000000..0c4b23ad53 --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/nvfp4_w4a4_weight_mse_fp8_sweep.yaml @@ -0,0 +1,36 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NVFP4 W4A4 with static weight scales optimized by MSE FP8 scale sweep. + +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + nvfp4: configs/numerics/nvfp4 + nvfp4_static: configs/numerics/nvfp4_static + +algorithm: + method: mse + fp8_scale_sweep: true +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*weight_quantizer' + cfg: + $import: nvfp4_static + - quantizer_name: '*input_quantizer' + cfg: + $import: nvfp4 + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/w4a8_awq_beta.yaml b/modelopt_recipes/configs/ptq/presets/model/w4a8_awq_beta.yaml new file mode 100644 index 0000000000..1b43258320 --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/w4a8_awq_beta.yaml @@ -0,0 +1,35 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# W4A8 AWQ beta: INT4 blockwise weights followed by FP8 weights, FP8 activations. 
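The two W4A4 sweep presets above differ only in `algorithm.method` (`local_hessian` vs `mse`); both set `fp8_scale_sweep: true` and pair static NVFP4 weight scales with dynamic NVFP4 activations. A sketch comparing them side by side, with paths assumed package-relative:

    from modelopt.torch.opt.config_loader import load_config
    from modelopt.torch.quantization.config import QuantizeConfig

    for name in ("nvfp4_w4a4_weight_local_hessian", "nvfp4_w4a4_weight_mse_fp8_sweep"):
        cfg = load_config(f"configs/ptq/presets/model/{name}", schema_type=QuantizeConfig)
        print(name, cfg.algorithm)  # only "method" differs between the two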
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + fp8: configs/numerics/fp8 + int4: configs/numerics/int4 + +algorithm: awq_lite +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*weight_quantizer' + cfg: + - $import: int4 + - $import: fp8 + - quantizer_name: '*input_quantizer' + cfg: + $import: fp8 + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/w4a8_mxfp4_fp8.yaml b/modelopt_recipes/configs/ptq/presets/model/w4a8_mxfp4_fp8.yaml new file mode 100644 index 0000000000..d4e11d34ad --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/w4a8_mxfp4_fp8.yaml @@ -0,0 +1,34 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# MXFP4 weights with FP8 activations. + +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + fp8: configs/numerics/fp8 + mxfp4: configs/numerics/mxfp4 + +algorithm: +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*weight_quantizer' + cfg: + $import: mxfp4 + - quantizer_name: '*input_quantizer' + cfg: + $import: fp8 + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/w4a8_nvfp4_fp8.yaml b/modelopt_recipes/configs/ptq/presets/model/w4a8_nvfp4_fp8.yaml new file mode 100644 index 0000000000..97053fe7c1 --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/model/w4a8_nvfp4_fp8.yaml @@ -0,0 +1,33 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NVFP4 block-size-32 weights with FP8 activations. 
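In the w4a8_awq_beta preset above, the weight quantizer's `cfg` is a list of two imported attribute snippets, one per quantization stage. Its assumed resolved shape, with normalized values matching the loader tests later in this series:

    # Assumed resolved form; e4m3 normalizes to the (4, 3) tuple per the tests.
    expected_weight_cfg = [
        {"num_bits": 4, "block_sizes": {-1: 128, "type": "static"}},  # int4 stage
        {"num_bits": (4, 3)},                                         # fp8 stage
    ]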
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + nvfp4_bs32: configs/numerics/nvfp4_bs32 + +algorithm: max +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*weight_quantizer' + cfg: + $import: nvfp4_bs32 + - quantizer_name: '*input_quantizer' + cfg: + num_bits: e4m3 + - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/units/README.md b/modelopt_recipes/configs/ptq/units/README.md index b7a7421f9f..26d1aa568d 100644 --- a/modelopt_recipes/configs/ptq/units/README.md +++ b/modelopt_recipes/configs/ptq/units/README.md @@ -21,5 +21,6 @@ recipes (under `general/` or `models/`) or presets (under `presets/`). | `kv_fp8.yaml` | FP8 E4M3 KV cache quantizer entry; supported on Hopper+ GPUs | | `kv_fp8_cast.yaml` | FP8 E4M3 KV cache with constant amax (skips KV calibration); supported on Hopper+ GPUs | | `kv_nvfp4_cast.yaml` | NVFP4 KV cache with constant amax (skips KV calibration); supported on Blackwell+ GPUs | +| `mamba_moe_disabled_quantizers.yaml` | Shared Mamba-MoE quantizer exclusions | | `w8a8_fp8_fp8.yaml` | FP8 weight + activation quantizer entries (W8A8); supported on Hopper+ GPUs | | `w4a4_nvfp4_nvfp4.yaml` | NVFP4 weight + activation quantizer entries (W4A4); supported on Blackwell+ GPUs | diff --git a/modelopt_recipes/configs/ptq/units/mamba_moe_disabled_quantizers.yaml b/modelopt_recipes/configs/ptq/units/mamba_moe_disabled_quantizers.yaml new file mode 100644 index 0000000000..b079c54f89 --- /dev/null +++ b/modelopt_recipes/configs/ptq/units/mamba_moe_disabled_quantizers.yaml @@ -0,0 +1,34 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Mamba-MoE exclusions shared by the aggressive and conservative presets. 
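Every preset in this series composes the same way: a deny-all unit first, numerics-bearing entries in the middle, shared exclusions last, with each `$import` splicing its entries in place. A conceptual sketch with placeholder values standing in for the imported units (all three placeholders are assumptions, not the units' literal contents):

    base_disable_all = [{"quantizer_name": "*", "enable": False}]                     # assumed contents
    default_disabled_quantizers = [{"quantizer_name": "*lm_head*", "enable": False}]  # assumed contents
    nvfp4_bs32 = {"num_bits": "e2m1", "block_sizes": {-1: 32}}                        # assumed contents

    quant_cfg = [
        *base_disable_all,
        {"quantizer_name": "*weight_quantizer", "cfg": nvfp4_bs32},
        {"quantizer_name": "*input_quantizer", "cfg": {"num_bits": "e4m3"}},
        *default_disabled_quantizers,
    ]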
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizerCfgListConfig + - quantizer_name: '*fc1_latent_proj*' + enable: false + - quantizer_name: '*fc2_latent_proj*' + enable: false + - quantizer_name: '*q_proj*' + enable: false + - quantizer_name: '*k_proj*' + enable: false + - quantizer_name: '*v_proj*' + enable: false + - quantizer_name: '*o_proj*' + enable: false + - quantizer_name: '*self_attention.linear_qkv*' + enable: false + - quantizer_name: '*self_attention.linear_proj*' + enable: false diff --git a/tests/unit/recipe/test_loader.py b/tests/unit/recipe/test_loader.py index e8d0d33b6c..bf6d5ef5ac 100644 --- a/tests/unit/recipe/test_loader.py +++ b/tests/unit/recipe/test_loader.py @@ -16,6 +16,7 @@ """Unit tests for modelopt.recipe.loader and modelopt.recipe.loader.load_config.""" import re +from importlib.resources import files import pytest @@ -1114,20 +1115,20 @@ def test_import_cross_file_same_name_no_conflict(tmp_path): # --------------------------------------------------------------------------- -_BUILTIN_CONFIG_SNIPPETS = [ - "configs/numerics/fp8", - "configs/numerics/nvfp4", - "configs/numerics/nvfp4_static", - "configs/ptq/units/base_disable_all", - "configs/ptq/units/default_disabled_quantizers", - "configs/ptq/units/kv_fp8", - "configs/ptq/units/kv_fp8_cast", - "configs/ptq/units/kv_nvfp4_cast", - "configs/ptq/units/w4a4_nvfp4_nvfp4", - "configs/ptq/units/w8a8_fp8_fp8", - "configs/ptq/presets/kv/fp8", - "configs/ptq/presets/model/fp8", -] +def _iter_builtin_config_snippets(root): + """Yield built-in config YAML files that declare a modelopt schema.""" + for child in sorted(root.iterdir(), key=lambda path: path.name): + if child.is_dir(): + yield from _iter_builtin_config_snippets(child) + elif child.name.endswith((".yaml", ".yml")) and "modelopt-schema:" in child.read_text( + encoding="utf-8" + ): + yield child + + +_BUILTIN_CONFIG_SNIPPETS = list( + _iter_builtin_config_snippets(files("modelopt_recipes").joinpath("configs")) +) @pytest.mark.parametrize("config_path", _BUILTIN_CONFIG_SNIPPETS) @@ -1206,6 +1207,41 @@ def test_modelopt_schema_comment_validates_after_import_resolution(tmp_path): } +def test_import_dict_snippet_in_union_typed_list_field(tmp_path): + """A bare import can append into QuantizerCfgEntry.cfg's list branch.""" + (tmp_path / "int4.yaml").write_text( + "# modelopt-schema: modelopt.torch.quantization.config.QuantizerAttributeConfig\n" + "num_bits: 4\n" + "block_sizes:\n" + " -1: 128\n" + " type: static\n" + ) + (tmp_path / "fp8.yaml").write_text( + "# modelopt-schema: modelopt.torch.quantization.config.QuantizerAttributeConfig\n" + "num_bits: e4m3\n" + ) + config_file = tmp_path / "config.yaml" + config_file.write_text( + f"# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig\n" + f"imports:\n" + f" int4: {tmp_path / 'int4.yaml'}\n" + f" fp8: {tmp_path / 'fp8.yaml'}\n" + f"algorithm: awq_lite\n" + f"quant_cfg:\n" + f" - quantizer_name: '*weight_quantizer'\n" + f" cfg:\n" + f" - $import: int4\n" + f" - $import: fp8\n" + ) + + data = load_config(config_file) + + assert _cfg_to_dict(data["quant_cfg"][0]["cfg"]) == [ + {"num_bits": 4, "block_sizes": {-1: 128, "type": "static"}}, + {"num_bits": (4, 3)}, + ] + + # --------------------------------------------------------------------------- # Coverage: _load_raw_config edge cases # --------------------------------------------------------------------------- From 2bc2d2cda369ad7b5282084c144fb4e7396a8d24 Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Thu, 7 May 2026 15:26:31 -0700 
Subject: [PATCH 16/23] feat(opt): accept dict snippet imports in union-typed list fields

_list_element_schema now skips NoneType union members and dedupes equal
element schemas, so Optional[list[T]] still resolves to T, and
_resolve_list_import accepts an imported dict snippet whenever the list
element schema is (or unwraps to) dict. Sharpen the numerics YAML
descriptions to name the exact element formats, and stop shadowing the
`format` builtin in run_auto_quantize.py while type-hinting
SUPPORT_QUANT_FORMAT.

Signed-off-by: Shengliang Xu
---
 examples/llm_autodeploy/run_auto_quantize.py | 5 ++--
 modelopt/torch/opt/config_loader.py | 28 +++++++++++---------
 modelopt_recipes/configs/numerics/int4.yaml | 2 +-
 modelopt_recipes/configs/numerics/int8.yaml | 2 +-
 modelopt_recipes/configs/numerics/mxfp4.yaml | 2 +-
 modelopt_recipes/configs/numerics/mxfp6.yaml | 2 +-
 modelopt_recipes/configs/numerics/mxfp8.yaml | 2 +-
 tests/unit/recipe/test_loader.py | 27 ++++++++++++++++++-
 8 files changed, 50 insertions(+), 20 deletions(-)

diff --git a/examples/llm_autodeploy/run_auto_quantize.py b/examples/llm_autodeploy/run_auto_quantize.py
index ebd7c1090b..f899300192 100644
--- a/examples/llm_autodeploy/run_auto_quantize.py
+++ b/examples/llm_autodeploy/run_auto_quantize.py
@@ -21,10 +21,11 @@

 import modelopt.torch.opt as mto
 import modelopt.torch.quantization as mtq
+from modelopt.torch.quantization.config import QuantizeConfig
 from modelopt.torch.utils import create_forward_loop
 from modelopt.torch.utils.dataset_utils import get_dataset_dataloader

-SUPPORT_QUANT_FORMAT = {
+SUPPORT_QUANT_FORMAT: dict[str, QuantizeConfig] = {
     "fp8": mtq.FP8_DEFAULT_CFG,
     "nvfp4": mtq.NVFP4_DEFAULT_CFG,
 }
@@ -87,7 +88,7 @@ def loss_func(output, data):
         data_loader=calib_dataloader,
         forward_step=lambda model, batch: model(**batch),
         loss_func=loss_func,
-        quantization_formats=[SUPPORT_QUANT_FORMAT[format] for format in qformat_list],
+        quantization_formats=[SUPPORT_QUANT_FORMAT[quant_format] for quant_format in qformat_list],
         num_calib_steps=len(calib_dataloader),
         num_score_steps=min(
             len(calib_dataloader), 128 // batch_size
diff --git a/modelopt/torch/opt/config_loader.py b/modelopt/torch/opt/config_loader.py
index 1bf518c208..80864523e5 100644
--- a/modelopt/torch/opt/config_loader.py
+++ b/modelopt/torch/opt/config_loader.py
@@ -338,18 +338,16 @@ def _list_element_schema(schema_type: Any | None) -> Any | None:
     schema_type = _unwrap_schema_type(schema_type)
     origin = get_origin(schema_type)
     if origin in (UnionType, Union):
-        element_schemas = [
-            element_schema
-            for arg in get_args(schema_type)
-            if (element_schema := _list_element_schema(arg)) is not None
-        ]
-        if len(element_schemas) == 1:
-            return element_schemas[0]
-        if element_schemas and all(
-            _schema_equal(element_schemas[0], item) for item in element_schemas[1:]
-        ):
-            return element_schemas[0]
-        return None
+        element_schemas = []
+        for arg in get_args(schema_type):
+            if arg is NoneType:
+                continue
+            element_schema = _list_element_schema(arg)
+            if element_schema is None:
+                continue
+            if not any(_schema_equal(element_schema, seen) for seen in element_schemas):
+                element_schemas.append(element_schema)
+        return element_schemas[0] if len(element_schemas) == 1 else None
     if origin is not list:
         return None
     args = get_args(schema_type)
@@ -524,6 +522,12 @@ def _resolve_list_import(
     if _schema_equal(imported.schema_type, element_schema):
         return [imported.data]

+    element_schema_unwrapped = _unwrap_schema_type(element_schema)
+    if isinstance(imported.data, dict) and (
+        element_schema_unwrapped is dict or get_origin(element_schema_unwrapped) is dict
+    ):
+        return [imported.data]
+
     raise ValueError(
         f"$import {ref_name!r} in list at {context} has schema "
         f"{_schema_label(imported.schema_type, imported.schema)!r}; expected either "
diff --git a/modelopt_recipes/configs/numerics/int4.yaml b/modelopt_recipes/configs/numerics/int4.yaml
index a21f7827cf..3e229d977c 100644
--- a/modelopt_recipes/configs/numerics/int4.yaml
+++
b/modelopt_recipes/configs/numerics/int4.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# INT4 static blockwise quantizer attributes. +# INT4 static block quantizer attributes. # modelopt-schema: modelopt.torch.quantization.config.QuantizerAttributeConfig num_bits: 4 diff --git a/modelopt_recipes/configs/numerics/int8.yaml b/modelopt_recipes/configs/numerics/int8.yaml index 9e00034903..25f8e9970e 100644 --- a/modelopt_recipes/configs/numerics/int8.yaml +++ b/modelopt_recipes/configs/numerics/int8.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# INT8 per-channel quantizer attributes. +# INT8 per-channel weight quantizer attributes. # modelopt-schema: modelopt.torch.quantization.config.QuantizerAttributeConfig num_bits: 8 diff --git a/modelopt_recipes/configs/numerics/mxfp4.yaml b/modelopt_recipes/configs/numerics/mxfp4.yaml index f5fe94cf4a..cb347fb3e6 100644 --- a/modelopt_recipes/configs/numerics/mxfp4.yaml +++ b/modelopt_recipes/configs/numerics/mxfp4.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# MXFP4 dynamic block quantizer attributes with E8M0 scales. +# MXFP4 E2M1 dynamic block quantizer attributes with E8M0 scales. # modelopt-schema: modelopt.torch.quantization.config.QuantizerAttributeConfig num_bits: e2m1 diff --git a/modelopt_recipes/configs/numerics/mxfp6.yaml b/modelopt_recipes/configs/numerics/mxfp6.yaml index b114c141c9..655f48215f 100644 --- a/modelopt_recipes/configs/numerics/mxfp6.yaml +++ b/modelopt_recipes/configs/numerics/mxfp6.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# MXFP6 dynamic block quantizer attributes with E8M0 scales. +# MXFP6 E3M2 dynamic block quantizer attributes with E8M0 scales. # modelopt-schema: modelopt.torch.quantization.config.QuantizerAttributeConfig num_bits: e3m2 diff --git a/modelopt_recipes/configs/numerics/mxfp8.yaml b/modelopt_recipes/configs/numerics/mxfp8.yaml index 44f42ab719..5ce6197fc1 100644 --- a/modelopt_recipes/configs/numerics/mxfp8.yaml +++ b/modelopt_recipes/configs/numerics/mxfp8.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# MXFP8 dynamic block quantizer attributes with E8M0 scales. +# MXFP8 E4M3 dynamic block quantizer attributes with E8M0 scales. 
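Stepping back to the config_loader change in this patch: `_list_element_schema` now skips `NoneType` union members and dedupes equal element schemas, so an optional list still yields its element type. Assumed behavior of the (private) helper, mirroring the diff; the single-list branch returning the element type is also an assumption:

    # Sketch; _list_element_schema is internal and subject to change.
    from modelopt.torch.opt.config_loader import _list_element_schema

    assert _list_element_schema(list[int]) is int
    assert _list_element_schema(list[int] | None) is int        # NoneType member skipped
    assert _list_element_schema(list[int] | list[str]) is None  # genuinely ambiguous union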
# modelopt-schema: modelopt.torch.quantization.config.QuantizerAttributeConfig num_bits: e4m3 diff --git a/tests/unit/recipe/test_loader.py b/tests/unit/recipe/test_loader.py index bf6d5ef5ac..d5e04864c6 100644 --- a/tests/unit/recipe/test_loader.py +++ b/tests/unit/recipe/test_loader.py @@ -1207,7 +1207,7 @@ def test_modelopt_schema_comment_validates_after_import_resolution(tmp_path): } -def test_import_dict_snippet_in_union_typed_list_field(tmp_path): +def test_import_dict_snippet_imports_in_union_typed_list_field(tmp_path): """A bare import can append into QuantizerCfgEntry.cfg's list branch.""" (tmp_path / "int4.yaml").write_text( "# modelopt-schema: modelopt.torch.quantization.config.QuantizerAttributeConfig\n" @@ -1242,6 +1242,31 @@ def test_import_dict_snippet_in_union_typed_list_field(tmp_path): ] +def test_import_dict_snippet_in_union_typed_list_field_with_inline_item(tmp_path): + """A dict snippet can be imported as one item inside QuantizerCfgEntry.cfg list.""" + _write_quantizer_attribute( + tmp_path / "int4.yaml", + "num_bits: 4\nblock_sizes:\n -1: 128\n type: static\n", + ) + config_file = tmp_path / "config.yaml" + config_file.write_text( + f"# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig\n" + f"imports:\n" + f" int4: {tmp_path / 'int4.yaml'}\n" + f"algorithm: awq_lite\n" + f"quant_cfg:\n" + f" - quantizer_name: '*weight_quantizer'\n" + f" cfg:\n" + f" - $import: int4\n" + f" - num_bits: e4m3\n" + ) + data = load_config(config_file) + assert _cfg_to_dict(data["quant_cfg"][0]["cfg"]) == [ + {"num_bits": 4, "block_sizes": {-1: 128, "type": "static"}}, + {"num_bits": (4, 3)}, + ] + + # --------------------------------------------------------------------------- # Coverage: _load_raw_config edge cases # --------------------------------------------------------------------------- From 054efe1b43d840e64c6397516e5810f88f07a7a3 Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Thu, 7 May 2026 17:17:09 -0700 Subject: [PATCH 17/23] Remove quantize config loader wrapper Signed-off-by: Shengliang Xu --- modelopt/torch/quantization/config.py | 195 ++++++++++++++++---------- 1 file changed, 122 insertions(+), 73 deletions(-) diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py index de27167865..64d44c5cf7 100644 --- a/modelopt/torch/quantization/config.py +++ b/modelopt/torch/quantization/config.py @@ -1198,16 +1198,6 @@ class _QuantizeExportConfig(ModeloptBaseConfig): """An empty config.""" -def _load_quantize_config(config_path: str) -> QuantizeConfig: - """Load a schema-backed QuantizeConfig YAML.""" - config = load_config(config_path, schema_type=QuantizeConfig) - if isinstance(config, QuantizeConfig): - return config - if isinstance(config, Mapping): - return QuantizeConfig.model_validate(config) - raise TypeError(f"{config_path} must declare QuantizeConfig.") - - def _load_quantizer_attribute_dict(config_path: str) -> dict[str, Any]: """Load a schema-backed QuantizerAttributeConfig YAML as a public dict.""" config = load_config(config_path, schema_type=QuantizerAttributeConfig) @@ -1262,99 +1252,158 @@ def _load_quantizer_cfg_dict_list(config_path: str) -> list[dict[str, Any]]: _nvfp4_cfg_bs32: dict[str, Any] = _load_quantizer_attribute_dict("configs/numerics/nvfp4_bs32") -INT8_DEFAULT_CFG: QuantizeConfig = _load_quantize_config("configs/ptq/presets/model/int8") -INT8_SMOOTHQUANT_CFG: QuantizeConfig = _load_quantize_config( - "configs/ptq/presets/model/int8_smoothquant" +INT8_DEFAULT_CFG: QuantizeConfig = load_config( + 
"configs/ptq/presets/model/int8", + schema_type=QuantizeConfig, +) +INT8_SMOOTHQUANT_CFG: QuantizeConfig = load_config( + "configs/ptq/presets/model/int8_smoothquant", + schema_type=QuantizeConfig, +) +INT8_WEIGHT_ONLY_CFG: QuantizeConfig = load_config( + "configs/ptq/presets/model/int8_weight_only", + schema_type=QuantizeConfig, +) +FP8_DEFAULT_CFG: QuantizeConfig = load_config( + "configs/ptq/presets/model/fp8", + schema_type=QuantizeConfig, +) +MAMBA_MOE_FP8_AGGRESSIVE_CFG: QuantizeConfig = load_config( + "configs/ptq/presets/model/mamba_moe_fp8_aggressive", + schema_type=QuantizeConfig, ) -INT8_WEIGHT_ONLY_CFG: QuantizeConfig = _load_quantize_config( - "configs/ptq/presets/model/int8_weight_only" +MAMBA_MOE_FP8_CONSERVATIVE_CFG: QuantizeConfig = load_config( + "configs/ptq/presets/model/mamba_moe_fp8_conservative", + schema_type=QuantizeConfig, ) -FP8_DEFAULT_CFG: QuantizeConfig = _load_quantize_config("configs/ptq/presets/model/fp8") -MAMBA_MOE_FP8_AGGRESSIVE_CFG: QuantizeConfig = _load_quantize_config( - "configs/ptq/presets/model/mamba_moe_fp8_aggressive" +FP8_PER_CHANNEL_PER_TOKEN_CFG: QuantizeConfig = load_config( + "configs/ptq/presets/model/fp8_per_channel_per_token", + schema_type=QuantizeConfig, ) -MAMBA_MOE_FP8_CONSERVATIVE_CFG: QuantizeConfig = _load_quantize_config( - "configs/ptq/presets/model/mamba_moe_fp8_conservative" +FP8_2D_BLOCKWISE_WEIGHT_ONLY_CFG: QuantizeConfig = load_config( + "configs/ptq/presets/model/fp8_2d_blockwise_weight_only", + schema_type=QuantizeConfig, ) -FP8_PER_CHANNEL_PER_TOKEN_CFG: QuantizeConfig = _load_quantize_config( - "configs/ptq/presets/model/fp8_per_channel_per_token" +INT4_BLOCKWISE_WEIGHT_ONLY_CFG: QuantizeConfig = load_config( + "configs/ptq/presets/model/int4_blockwise_weight_only", + schema_type=QuantizeConfig, ) -FP8_2D_BLOCKWISE_WEIGHT_ONLY_CFG: QuantizeConfig = _load_quantize_config( - "configs/ptq/presets/model/fp8_2d_blockwise_weight_only" +INT4_AWQ_CFG: QuantizeConfig = load_config( + "configs/ptq/presets/model/int4_awq", + schema_type=QuantizeConfig, ) -INT4_BLOCKWISE_WEIGHT_ONLY_CFG: QuantizeConfig = _load_quantize_config( - "configs/ptq/presets/model/int4_blockwise_weight_only" +W4A8_AWQ_BETA_CFG: QuantizeConfig = load_config( + "configs/ptq/presets/model/w4a8_awq_beta", + schema_type=QuantizeConfig, ) -INT4_AWQ_CFG: QuantizeConfig = _load_quantize_config("configs/ptq/presets/model/int4_awq") -W4A8_AWQ_BETA_CFG: QuantizeConfig = _load_quantize_config( - "configs/ptq/presets/model/w4a8_awq_beta" +MXFP8_DEFAULT_CFG: QuantizeConfig = load_config( + "configs/ptq/presets/model/mxfp8", + schema_type=QuantizeConfig, ) -MXFP8_DEFAULT_CFG: QuantizeConfig = _load_quantize_config("configs/ptq/presets/model/mxfp8") -MXFP6_DEFAULT_CFG: QuantizeConfig = _load_quantize_config("configs/ptq/presets/model/mxfp6") -MXFP4_DEFAULT_CFG: QuantizeConfig = _load_quantize_config("configs/ptq/presets/model/mxfp4") -W4A8_MXFP4_FP8_CFG: QuantizeConfig = _load_quantize_config( - "configs/ptq/presets/model/w4a8_mxfp4_fp8" +MXFP6_DEFAULT_CFG: QuantizeConfig = load_config( + "configs/ptq/presets/model/mxfp6", + schema_type=QuantizeConfig, +) +MXFP4_DEFAULT_CFG: QuantizeConfig = load_config( + "configs/ptq/presets/model/mxfp4", + schema_type=QuantizeConfig, +) +W4A8_MXFP4_FP8_CFG: QuantizeConfig = load_config( + "configs/ptq/presets/model/w4a8_mxfp4_fp8", + schema_type=QuantizeConfig, +) +MXINT8_DEFAULT_CFG: QuantizeConfig = load_config( + "configs/ptq/presets/model/mxint8", + schema_type=QuantizeConfig, ) -MXINT8_DEFAULT_CFG: QuantizeConfig = 
_load_quantize_config("configs/ptq/presets/model/mxint8") # KV-cache configs are designed to be merged with a primary quantization config (e.g. # FP8_DEFAULT_CFG) that already contains _base_disable_all. They intentionally omit both # _base_disable_all and "algorithm" because these are provided by the primary config. -FP8_KV_CFG: QuantizeConfig = _load_quantize_config("configs/ptq/presets/kv/fp8") -FP8_AFFINE_KV_CFG: QuantizeConfig = _load_quantize_config("configs/ptq/presets/kv/fp8_affine") +FP8_KV_CFG: QuantizeConfig = load_config( + "configs/ptq/presets/kv/fp8", + schema_type=QuantizeConfig, +) +FP8_AFFINE_KV_CFG: QuantizeConfig = load_config( + "configs/ptq/presets/kv/fp8_affine", + schema_type=QuantizeConfig, +) -NVFP4_DEFAULT_CFG: QuantizeConfig = _load_quantize_config("configs/ptq/presets/model/nvfp4") -NVFP4_W4A4_WEIGHT_MSE_FP8_SWEEP_CFG: QuantizeConfig = _load_quantize_config( - "configs/ptq/presets/model/nvfp4_w4a4_weight_mse_fp8_sweep" +NVFP4_DEFAULT_CFG: QuantizeConfig = load_config( + "configs/ptq/presets/model/nvfp4", + schema_type=QuantizeConfig, +) +NVFP4_W4A4_WEIGHT_MSE_FP8_SWEEP_CFG: QuantizeConfig = load_config( + "configs/ptq/presets/model/nvfp4_w4a4_weight_mse_fp8_sweep", + schema_type=QuantizeConfig, +) +NVFP4_W4A4_WEIGHT_LOCAL_HESSIAN_CFG: QuantizeConfig = load_config( + "configs/ptq/presets/model/nvfp4_w4a4_weight_local_hessian", + schema_type=QuantizeConfig, ) -NVFP4_W4A4_WEIGHT_LOCAL_HESSIAN_CFG: QuantizeConfig = _load_quantize_config( - "configs/ptq/presets/model/nvfp4_w4a4_weight_local_hessian" +MAMBA_MOE_NVFP4_AGGRESSIVE_CFG: QuantizeConfig = load_config( + "configs/ptq/presets/model/mamba_moe_nvfp4_aggressive", + schema_type=QuantizeConfig, ) -MAMBA_MOE_NVFP4_AGGRESSIVE_CFG: QuantizeConfig = _load_quantize_config( - "configs/ptq/presets/model/mamba_moe_nvfp4_aggressive" +MAMBA_MOE_NVFP4_CONSERVATIVE_CFG: QuantizeConfig = load_config( + "configs/ptq/presets/model/mamba_moe_nvfp4_conservative", + schema_type=QuantizeConfig, ) -MAMBA_MOE_NVFP4_CONSERVATIVE_CFG: QuantizeConfig = _load_quantize_config( - "configs/ptq/presets/model/mamba_moe_nvfp4_conservative" +NVFP4_AWQ_LITE_CFG: QuantizeConfig = load_config( + "configs/ptq/presets/model/nvfp4_awq_lite", + schema_type=QuantizeConfig, ) -NVFP4_AWQ_LITE_CFG: QuantizeConfig = _load_quantize_config( - "configs/ptq/presets/model/nvfp4_awq_lite" +NVFP4_AWQ_CLIP_CFG: QuantizeConfig = load_config( + "configs/ptq/presets/model/nvfp4_awq_clip", + schema_type=QuantizeConfig, ) -NVFP4_AWQ_CLIP_CFG: QuantizeConfig = _load_quantize_config( - "configs/ptq/presets/model/nvfp4_awq_clip" +NVFP4_AWQ_FULL_CFG: QuantizeConfig = load_config( + "configs/ptq/presets/model/nvfp4_awq_full", + schema_type=QuantizeConfig, ) -NVFP4_AWQ_FULL_CFG: QuantizeConfig = _load_quantize_config( - "configs/ptq/presets/model/nvfp4_awq_full" +NVFP4_AFFINE_KV_CFG: QuantizeConfig = load_config( + "configs/ptq/presets/kv/nvfp4_affine", + schema_type=QuantizeConfig, ) -NVFP4_AFFINE_KV_CFG: QuantizeConfig = _load_quantize_config( - "configs/ptq/presets/kv/nvfp4_affine" +NVFP4_KV_CFG: QuantizeConfig = load_config( + "configs/ptq/presets/kv/nvfp4", + schema_type=QuantizeConfig, ) -NVFP4_KV_CFG: QuantizeConfig = _load_quantize_config("configs/ptq/presets/kv/nvfp4") -NVFP4_FP8_MHA_CONFIG: QuantizeConfig = _load_quantize_config( - "configs/ptq/presets/model/nvfp4_fp8_mha" +NVFP4_FP8_MHA_CONFIG: QuantizeConfig = load_config( + "configs/ptq/presets/model/nvfp4_fp8_mha", + schema_type=QuantizeConfig, ) -NVFP4_KV_ROTATE_CFG: QuantizeConfig = _load_quantize_config( - 
"configs/ptq/presets/kv/nvfp4_rotate" +NVFP4_KV_ROTATE_CFG: QuantizeConfig = load_config( + "configs/ptq/presets/kv/nvfp4_rotate", + schema_type=QuantizeConfig, ) -NVFP4_SVDQUANT_DEFAULT_CFG: QuantizeConfig = _load_quantize_config( - "configs/ptq/presets/model/nvfp4_svdquant" +NVFP4_SVDQUANT_DEFAULT_CFG: QuantizeConfig = load_config( + "configs/ptq/presets/model/nvfp4_svdquant", + schema_type=QuantizeConfig, ) -W4A8_NVFP4_FP8_CFG: QuantizeConfig = _load_quantize_config( - "configs/ptq/presets/model/w4a8_nvfp4_fp8" +W4A8_NVFP4_FP8_CFG: QuantizeConfig = load_config( + "configs/ptq/presets/model/w4a8_nvfp4_fp8", + schema_type=QuantizeConfig, ) -MXFP4_MLP_WEIGHT_ONLY_CFG: QuantizeConfig = _load_quantize_config( - "configs/ptq/presets/model/mxfp4_mlp_weight_only" +MXFP4_MLP_WEIGHT_ONLY_CFG: QuantizeConfig = load_config( + "configs/ptq/presets/model/mxfp4_mlp_weight_only", + schema_type=QuantizeConfig, ) -NVFP4_MLP_WEIGHT_ONLY_CFG: QuantizeConfig = _load_quantize_config( - "configs/ptq/presets/model/nvfp4_mlp_weight_only" +NVFP4_MLP_WEIGHT_ONLY_CFG: QuantizeConfig = load_config( + "configs/ptq/presets/model/nvfp4_mlp_weight_only", + schema_type=QuantizeConfig, ) -NVFP4_EXPERTS_ONLY_CFG: QuantizeConfig = _load_quantize_config( - "configs/ptq/presets/model/nvfp4_experts_only" +NVFP4_EXPERTS_ONLY_CFG: QuantizeConfig = load_config( + "configs/ptq/presets/model/nvfp4_experts_only", + schema_type=QuantizeConfig, ) -NVFP4_MLP_ONLY_CFG: QuantizeConfig = _load_quantize_config( - "configs/ptq/presets/model/nvfp4_mlp_only" +NVFP4_MLP_ONLY_CFG: QuantizeConfig = load_config( + "configs/ptq/presets/model/nvfp4_mlp_only", + schema_type=QuantizeConfig, ) -NVFP4_OMLP_ONLY_CFG: QuantizeConfig = _load_quantize_config( - "configs/ptq/presets/model/nvfp4_omlp_only" +NVFP4_OMLP_ONLY_CFG: QuantizeConfig = load_config( + "configs/ptq/presets/model/nvfp4_omlp_only", + schema_type=QuantizeConfig, ) # DO NOT ADD NEW CONFIGS HERE. If you want to add a new general recipe, add it to From 233db2a25db9f92e44b4cb5725b930d80af40f30 Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Thu, 7 May 2026 17:39:55 -0700 Subject: [PATCH 18/23] Add KV quantization config units Signed-off-by: Shengliang Xu --- .../configs/ptq/presets/kv/fp8_affine.yaml | 11 +++---- .../configs/ptq/presets/kv/nvfp4.yaml | 6 ++-- .../configs/ptq/presets/kv/nvfp4_affine.yaml | 10 ++---- .../configs/ptq/presets/kv/nvfp4_rotate.yaml | 14 ++------ modelopt_recipes/configs/ptq/units/README.md | 4 +++ .../configs/ptq/units/kv_fp8_affine.yaml | 30 +++++++++++++++++ .../configs/ptq/units/kv_nvfp4.yaml | 24 ++++++++++++++ .../configs/ptq/units/kv_nvfp4_affine.yaml | 30 +++++++++++++++++ .../configs/ptq/units/kv_nvfp4_rotate.yaml | 32 +++++++++++++++++++ 9 files changed, 130 insertions(+), 31 deletions(-) create mode 100644 modelopt_recipes/configs/ptq/units/kv_fp8_affine.yaml create mode 100644 modelopt_recipes/configs/ptq/units/kv_nvfp4.yaml create mode 100644 modelopt_recipes/configs/ptq/units/kv_nvfp4_affine.yaml create mode 100644 modelopt_recipes/configs/ptq/units/kv_nvfp4_rotate.yaml diff --git a/modelopt_recipes/configs/ptq/presets/kv/fp8_affine.yaml b/modelopt_recipes/configs/ptq/presets/kv/fp8_affine.yaml index b0d95fbf00..6c17aa0f3e 100644 --- a/modelopt_recipes/configs/ptq/presets/kv/fp8_affine.yaml +++ b/modelopt_recipes/configs/ptq/presets/kv/fp8_affine.yaml @@ -16,11 +16,8 @@ # FP8 E4M3 affine KV cache quantization fragment. 
# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + kv_fp8_affine: configs/ptq/units/kv_fp8_affine + quant_cfg: - - quantizer_name: '*[kv]_bmm_quantizer' - cfg: - num_bits: e4m3 - bias: - -2: - -4: - type: static + - $import: kv_fp8_affine diff --git a/modelopt_recipes/configs/ptq/presets/kv/nvfp4.yaml b/modelopt_recipes/configs/ptq/presets/kv/nvfp4.yaml index 5b61d3ba9e..480ddebd61 100644 --- a/modelopt_recipes/configs/ptq/presets/kv/nvfp4.yaml +++ b/modelopt_recipes/configs/ptq/presets/kv/nvfp4.yaml @@ -17,9 +17,7 @@ # modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig imports: - nvfp4: configs/numerics/nvfp4 + kv_nvfp4: configs/ptq/units/kv_nvfp4 quant_cfg: - - quantizer_name: '*[kv]_bmm_quantizer' - cfg: - $import: nvfp4 + - $import: kv_nvfp4 diff --git a/modelopt_recipes/configs/ptq/presets/kv/nvfp4_affine.yaml b/modelopt_recipes/configs/ptq/presets/kv/nvfp4_affine.yaml index 4e306994d5..074ce9b6c3 100644 --- a/modelopt_recipes/configs/ptq/presets/kv/nvfp4_affine.yaml +++ b/modelopt_recipes/configs/ptq/presets/kv/nvfp4_affine.yaml @@ -17,13 +17,7 @@ # modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig imports: - nvfp4: configs/numerics/nvfp4 + kv_nvfp4_affine: configs/ptq/units/kv_nvfp4_affine quant_cfg: - - quantizer_name: '*[kv]_bmm_quantizer' - cfg: - $import: nvfp4 - bias: - -2: - -4: - type: static + - $import: kv_nvfp4_affine diff --git a/modelopt_recipes/configs/ptq/presets/kv/nvfp4_rotate.yaml b/modelopt_recipes/configs/ptq/presets/kv/nvfp4_rotate.yaml index 462f62f54d..684c9677e2 100644 --- a/modelopt_recipes/configs/ptq/presets/kv/nvfp4_rotate.yaml +++ b/modelopt_recipes/configs/ptq/presets/kv/nvfp4_rotate.yaml @@ -17,18 +17,8 @@ # modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig imports: - nvfp4: configs/numerics/nvfp4 + kv_nvfp4_rotate: configs/ptq/units/kv_nvfp4_rotate algorithm: max quant_cfg: - - quantizer_name: '*q_bmm_quantizer' - cfg: - rotate: true - enable: false - - quantizer_name: '*k_bmm_quantizer' - cfg: - $import: nvfp4 - rotate: true - - quantizer_name: '*v_bmm_quantizer' - cfg: - $import: nvfp4 + - $import: kv_nvfp4_rotate diff --git a/modelopt_recipes/configs/ptq/units/README.md b/modelopt_recipes/configs/ptq/units/README.md index 26d1aa568d..91e3dab973 100644 --- a/modelopt_recipes/configs/ptq/units/README.md +++ b/modelopt_recipes/configs/ptq/units/README.md @@ -19,8 +19,12 @@ recipes (under `general/` or `models/`) or presets (under `presets/`). | `base_disable_all.yaml` | Deny-all entry: disables all quantizers as the first step | | `default_disabled_quantizers.yaml` | Standard exclusions (LM head, routers, BatchNorm, etc.) 
| | `kv_fp8.yaml` | FP8 E4M3 KV cache quantizer entry; supported on Hopper+ GPUs | +| `kv_fp8_affine.yaml` | FP8 E4M3 affine KV cache quantizer entries; supported on Hopper+ GPUs | | `kv_fp8_cast.yaml` | FP8 E4M3 KV cache with constant amax (skips KV calibration); supported on Hopper+ GPUs | +| `kv_nvfp4.yaml` | NVFP4 KV cache quantizer entry; supported on Blackwell+ GPUs | +| `kv_nvfp4_affine.yaml` | NVFP4 affine KV cache quantizer entries; supported on Blackwell+ GPUs | | `kv_nvfp4_cast.yaml` | NVFP4 KV cache with constant amax (skips KV calibration); supported on Blackwell+ GPUs | +| `kv_nvfp4_rotate.yaml` | NVFP4 rotated KV cache quantizer entries; supported on Blackwell+ GPUs | | `mamba_moe_disabled_quantizers.yaml` | Shared Mamba-MoE quantizer exclusions | | `w8a8_fp8_fp8.yaml` | FP8 weight + activation quantizer entries (W8A8); supported on Hopper+ GPUs | | `w4a4_nvfp4_nvfp4.yaml` | NVFP4 weight + activation quantizer entries (W4A4); supported on Blackwell+ GPUs | diff --git a/modelopt_recipes/configs/ptq/units/kv_fp8_affine.yaml b/modelopt_recipes/configs/ptq/units/kv_fp8_affine.yaml new file mode 100644 index 0000000000..9832ba6b09 --- /dev/null +++ b/modelopt_recipes/configs/ptq/units/kv_fp8_affine.yaml @@ -0,0 +1,30 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# FP8 E4M3 affine KV cache quantization. + +# modelopt-schema: modelopt.torch.quantization.config.QuantizerCfgListConfig +imports: + kv_fp8: configs/ptq/units/kv_fp8 +--- + - $import: kv_fp8 + - quantizer_name: '*[kv]_bmm_quantizer' + cfg: + num_bits: e4m3 + axis: + bias: + -2: + -4: + type: static diff --git a/modelopt_recipes/configs/ptq/units/kv_nvfp4.yaml b/modelopt_recipes/configs/ptq/units/kv_nvfp4.yaml new file mode 100644 index 0000000000..4c34a783a8 --- /dev/null +++ b/modelopt_recipes/configs/ptq/units/kv_nvfp4.yaml @@ -0,0 +1,24 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NVFP4 KV cache quantization. 
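The kv_fp8_affine unit above shows the two-document unit layout (an `imports` header, then the payload after `---`) and chains a `$import` of the plain kv_fp8 entry ahead of the affine entry. Its assumed resolved shape; kv_fp8's contents are not shown in this series and are a placeholder here:

    resolved = [
        {"quantizer_name": "*[kv]_bmm_quantizer", "cfg": {"num_bits": (4, 3)}},  # from kv_fp8, assumed
        {
            "quantizer_name": "*[kv]_bmm_quantizer",
            "cfg": {
                "num_bits": (4, 3),
                "axis": None,
                "bias": {-2: None, -4: None, "type": "static"},
            },
        },
    ]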
+ +# modelopt-schema: modelopt.torch.quantization.config.QuantizerCfgListConfig +imports: + nvfp4: configs/numerics/nvfp4 +--- + - quantizer_name: '*[kv]_bmm_quantizer' + cfg: + $import: nvfp4 diff --git a/modelopt_recipes/configs/ptq/units/kv_nvfp4_affine.yaml b/modelopt_recipes/configs/ptq/units/kv_nvfp4_affine.yaml new file mode 100644 index 0000000000..e3988eecab --- /dev/null +++ b/modelopt_recipes/configs/ptq/units/kv_nvfp4_affine.yaml @@ -0,0 +1,30 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NVFP4 affine KV cache quantization. + +# modelopt-schema: modelopt.torch.quantization.config.QuantizerCfgListConfig +imports: + kv_nvfp4: configs/ptq/units/kv_nvfp4 + nvfp4: configs/numerics/nvfp4 +--- + - $import: kv_nvfp4 + - quantizer_name: '*[kv]_bmm_quantizer' + cfg: + $import: nvfp4 + bias: + -2: + -4: + type: static diff --git a/modelopt_recipes/configs/ptq/units/kv_nvfp4_rotate.yaml b/modelopt_recipes/configs/ptq/units/kv_nvfp4_rotate.yaml new file mode 100644 index 0000000000..50e8efa468 --- /dev/null +++ b/modelopt_recipes/configs/ptq/units/kv_nvfp4_rotate.yaml @@ -0,0 +1,32 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NVFP4 KV cache quantization with rotation. + +# modelopt-schema: modelopt.torch.quantization.config.QuantizerCfgListConfig +imports: + nvfp4: configs/numerics/nvfp4 +--- + - quantizer_name: '*q_bmm_quantizer' + cfg: + rotate: true + enable: false + - quantizer_name: '*k_bmm_quantizer' + cfg: + $import: nvfp4 + rotate: true + - quantizer_name: '*v_bmm_quantizer' + cfg: + $import: nvfp4 From e41a20e17789f03bc21502721a88e43648da1e05 Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Sat, 9 May 2026 10:56:48 -0700 Subject: [PATCH 19/23] Remove stale FP8 config comments Signed-off-by: Shengliang Xu --- modelopt_recipes/configs/numerics/fp8.yaml | 2 -- modelopt_recipes/configs/ptq/presets/kv/fp8.yaml | 1 - modelopt_recipes/configs/ptq/presets/model/fp8.yaml | 1 - 3 files changed, 4 deletions(-) diff --git a/modelopt_recipes/configs/numerics/fp8.yaml b/modelopt_recipes/configs/numerics/fp8.yaml index ab1da6fad5..e3b49218f6 100644 --- a/modelopt_recipes/configs/numerics/fp8.yaml +++ b/modelopt_recipes/configs/numerics/fp8.yaml @@ -14,8 +14,6 @@ # limitations under the License. 
# FP8 E4M3 quantizer attributes (per-tensor; used for weight/activation/KV). -# ``axis: null`` is explicit to match the hardcoded ``FP8_DEFAULT_CFG`` shape — -# downstream code that keys on ``"axis" in cfg`` sees the same dict layout. # modelopt-schema: modelopt.torch.quantization.config.QuantizerAttributeConfig num_bits: e4m3 diff --git a/modelopt_recipes/configs/ptq/presets/kv/fp8.yaml b/modelopt_recipes/configs/ptq/presets/kv/fp8.yaml index 7e97f0bc77..e68e9333cd 100644 --- a/modelopt_recipes/configs/ptq/presets/kv/fp8.yaml +++ b/modelopt_recipes/configs/ptq/presets/kv/fp8.yaml @@ -14,7 +14,6 @@ # limitations under the License. # FP8 E4M3 KV cache quantization preset. -# Equivalent to the hardcoded FP8_KV_CFG in config.py. # This is a partial config (no algorithm, no base_disable_all) — designed # to be merged with a primary model quantization config. diff --git a/modelopt_recipes/configs/ptq/presets/model/fp8.yaml b/modelopt_recipes/configs/ptq/presets/model/fp8.yaml index af80b57fe4..a1a8718982 100644 --- a/modelopt_recipes/configs/ptq/presets/model/fp8.yaml +++ b/modelopt_recipes/configs/ptq/presets/model/fp8.yaml @@ -14,7 +14,6 @@ # limitations under the License. # FP8 per-tensor weight and activation (W8A8), max calibration. -# Equivalent to the hardcoded FP8_DEFAULT_CFG in config.py. # modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig imports: From b1c9664e55e696ec1af7c4133689d22ab34ca365 Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Sat, 9 May 2026 12:21:17 -0700 Subject: [PATCH 20/23] update int4 int8 Signed-off-by: Shengliang Xu --- modelopt_recipes/configs/numerics/fp8.yaml | 2 +- .../configs/numerics/{int4.yaml => int4_per_block.yaml} | 2 +- .../configs/numerics/{int8.yaml => int8_per_channel.yaml} | 2 +- modelopt_recipes/configs/numerics/nvfp4_static.yaml | 2 +- modelopt_recipes/configs/ptq/presets/model/int4_awq.yaml | 4 ++-- modelopt_recipes/configs/ptq/presets/model/int8.yaml | 4 ++-- .../configs/ptq/presets/model/int8_smoothquant.yaml | 4 ++-- .../configs/ptq/presets/model/int8_weight_only.yaml | 4 ++-- modelopt_recipes/configs/ptq/presets/model/w4a8_awq_beta.yaml | 4 ++-- 9 files changed, 14 insertions(+), 14 deletions(-) rename modelopt_recipes/configs/numerics/{int4.yaml => int4_per_block.yaml} (94%) rename modelopt_recipes/configs/numerics/{int8.yaml => int8_per_channel.yaml} (94%) diff --git a/modelopt_recipes/configs/numerics/fp8.yaml b/modelopt_recipes/configs/numerics/fp8.yaml index e3b49218f6..ad85a6320e 100644 --- a/modelopt_recipes/configs/numerics/fp8.yaml +++ b/modelopt_recipes/configs/numerics/fp8.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# FP8 E4M3 quantizer attributes (per-tensor; used for weight/activation/KV). +# FP8 E4M3 per-tensor quantizer attributes. # modelopt-schema: modelopt.torch.quantization.config.QuantizerAttributeConfig num_bits: e4m3 diff --git a/modelopt_recipes/configs/numerics/int4.yaml b/modelopt_recipes/configs/numerics/int4_per_block.yaml similarity index 94% rename from modelopt_recipes/configs/numerics/int4.yaml rename to modelopt_recipes/configs/numerics/int4_per_block.yaml index 3e229d977c..1f64bdb155 100644 --- a/modelopt_recipes/configs/numerics/int4.yaml +++ b/modelopt_recipes/configs/numerics/int4_per_block.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# INT4 static block quantizer attributes. +# INT4 static per-block quantizer attributes. 
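Renames like int4 -> int4_per_block are only safe if no packaged config still imports the old snippet path. A quick scan in the spirit of the loader tests earlier in this series; the needle string and the empty-result expectation are assumptions:

    from importlib.resources import files

    def find_stale(node, needle=": configs/numerics/int4\n"):
        for child in sorted(node.iterdir(), key=lambda p: p.name):
            if child.is_dir():
                yield from find_stale(child, needle)
            elif child.name.endswith((".yaml", ".yml")) and needle in child.read_text(encoding="utf-8"):
                yield child

    print(list(find_stale(files("modelopt_recipes").joinpath("configs"))))  # expect []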
# modelopt-schema: modelopt.torch.quantization.config.QuantizerAttributeConfig num_bits: 4 diff --git a/modelopt_recipes/configs/numerics/int8.yaml b/modelopt_recipes/configs/numerics/int8_per_channel.yaml similarity index 94% rename from modelopt_recipes/configs/numerics/int8.yaml rename to modelopt_recipes/configs/numerics/int8_per_channel.yaml index 25f8e9970e..9e00034903 100644 --- a/modelopt_recipes/configs/numerics/int8.yaml +++ b/modelopt_recipes/configs/numerics/int8_per_channel.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# INT8 per-channel weight quantizer attributes. +# INT8 per-channel quantizer attributes. # modelopt-schema: modelopt.torch.quantization.config.QuantizerAttributeConfig num_bits: 8 diff --git a/modelopt_recipes/configs/numerics/nvfp4_static.yaml b/modelopt_recipes/configs/numerics/nvfp4_static.yaml index 32bd247b79..5f908c21e5 100644 --- a/modelopt_recipes/configs/numerics/nvfp4_static.yaml +++ b/modelopt_recipes/configs/numerics/nvfp4_static.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# NVFP4 E2M1 blockwise quantizer attributes with FP8 E4M3 scales (used for NVFP4 weights since weight scales can be static). +# NVFP4 E2M1 blockwise quantizer attributes with static FP8 E4M3 scales. # modelopt-schema: modelopt.torch.quantization.config.QuantizerAttributeConfig num_bits: e2m1 diff --git a/modelopt_recipes/configs/ptq/presets/model/int4_awq.yaml b/modelopt_recipes/configs/ptq/presets/model/int4_awq.yaml index 60c792801e..70084426e6 100644 --- a/modelopt_recipes/configs/ptq/presets/model/int4_awq.yaml +++ b/modelopt_recipes/configs/ptq/presets/model/int4_awq.yaml @@ -19,7 +19,7 @@ imports: base_disable_all: configs/ptq/units/base_disable_all default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers - int4: configs/numerics/int4 + int4_per_block: configs/numerics/int4_per_block algorithm: method: awq_lite @@ -28,7 +28,7 @@ quant_cfg: - $import: base_disable_all - quantizer_name: '*weight_quantizer' cfg: - $import: int4 + $import: int4_per_block - quantizer_name: '*input_quantizer' enable: false - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/int8.yaml b/modelopt_recipes/configs/ptq/presets/model/int8.yaml index fc45fab956..88910a2d09 100644 --- a/modelopt_recipes/configs/ptq/presets/model/int8.yaml +++ b/modelopt_recipes/configs/ptq/presets/model/int8.yaml @@ -19,14 +19,14 @@ imports: base_disable_all: configs/ptq/units/base_disable_all default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers - int8: configs/numerics/int8 + int8_per_channel: configs/numerics/int8_per_channel algorithm: max quant_cfg: - $import: base_disable_all - quantizer_name: '*weight_quantizer' cfg: - $import: int8 + $import: int8_per_channel - quantizer_name: '*input_quantizer' cfg: num_bits: 8 diff --git a/modelopt_recipes/configs/ptq/presets/model/int8_smoothquant.yaml b/modelopt_recipes/configs/ptq/presets/model/int8_smoothquant.yaml index 274a607bb8..1bf2dee3da 100644 --- a/modelopt_recipes/configs/ptq/presets/model/int8_smoothquant.yaml +++ b/modelopt_recipes/configs/ptq/presets/model/int8_smoothquant.yaml @@ -19,14 +19,14 @@ imports: base_disable_all: configs/ptq/units/base_disable_all default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers - int8: configs/numerics/int8 + int8_per_channel: configs/numerics/int8_per_channel 
algorithm: smoothquant quant_cfg: - $import: base_disable_all - quantizer_name: '*weight_quantizer' cfg: - $import: int8 + $import: int8_per_channel - quantizer_name: '*input_quantizer' cfg: num_bits: 8 diff --git a/modelopt_recipes/configs/ptq/presets/model/int8_weight_only.yaml b/modelopt_recipes/configs/ptq/presets/model/int8_weight_only.yaml index 002876cc9c..5a96a3e884 100644 --- a/modelopt_recipes/configs/ptq/presets/model/int8_weight_only.yaml +++ b/modelopt_recipes/configs/ptq/presets/model/int8_weight_only.yaml @@ -19,14 +19,14 @@ imports: base_disable_all: configs/ptq/units/base_disable_all default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers - int8: configs/numerics/int8 + int8_per_channel: configs/numerics/int8_per_channel algorithm: max quant_cfg: - $import: base_disable_all - quantizer_name: '*weight_quantizer' cfg: - $import: int8 + $import: int8_per_channel - quantizer_name: '*input_quantizer' enable: false - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/w4a8_awq_beta.yaml b/modelopt_recipes/configs/ptq/presets/model/w4a8_awq_beta.yaml index 1b43258320..5bac7f27fa 100644 --- a/modelopt_recipes/configs/ptq/presets/model/w4a8_awq_beta.yaml +++ b/modelopt_recipes/configs/ptq/presets/model/w4a8_awq_beta.yaml @@ -20,14 +20,14 @@ imports: base_disable_all: configs/ptq/units/base_disable_all default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers fp8: configs/numerics/fp8 - int4: configs/numerics/int4 + int4_per_block: configs/numerics/int4_per_block algorithm: awq_lite quant_cfg: - $import: base_disable_all - quantizer_name: '*weight_quantizer' cfg: - - $import: int4 + - $import: int4_per_block - $import: fp8 - quantizer_name: '*input_quantizer' cfg: From 19a6f9e9ecdd9b5ac707748d51f5c0582390428e Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Sat, 9 May 2026 12:41:29 -0700 Subject: [PATCH 21/23] update descriptions Signed-off-by: Shengliang Xu --- modelopt_recipes/configs/numerics/fp8.yaml | 2 +- modelopt_recipes/configs/numerics/int4_per_block.yaml | 2 +- modelopt_recipes/configs/numerics/int8_per_channel.yaml | 2 +- modelopt_recipes/configs/numerics/mxfp4.yaml | 2 +- modelopt_recipes/configs/numerics/mxfp6.yaml | 2 +- modelopt_recipes/configs/numerics/mxfp8.yaml | 2 +- modelopt_recipes/configs/numerics/mxint8.yaml | 2 +- modelopt_recipes/configs/numerics/nvfp4.yaml | 2 +- modelopt_recipes/configs/numerics/nvfp4_bs32.yaml | 2 +- modelopt_recipes/configs/numerics/nvfp4_static.yaml | 2 +- modelopt_recipes/configs/ptq/presets/kv/fp8.yaml | 5 ++--- modelopt_recipes/configs/ptq/presets/kv/fp8_affine.yaml | 2 +- modelopt_recipes/configs/ptq/presets/kv/nvfp4.yaml | 2 +- modelopt_recipes/configs/ptq/presets/kv/nvfp4_affine.yaml | 2 +- modelopt_recipes/configs/ptq/presets/kv/nvfp4_rotate.yaml | 2 +- modelopt_recipes/configs/ptq/presets/model/fp8.yaml | 2 +- .../ptq/presets/model/fp8_2d_blockwise_weight_only.yaml | 2 +- .../ptq/presets/model/fp8_per_channel_per_token.yaml | 2 +- modelopt_recipes/configs/ptq/presets/model/int4_awq.yaml | 2 +- .../ptq/presets/model/int4_blockwise_weight_only.yaml | 2 +- modelopt_recipes/configs/ptq/presets/model/int8.yaml | 2 +- .../configs/ptq/presets/model/int8_smoothquant.yaml | 2 +- .../configs/ptq/presets/model/int8_weight_only.yaml | 2 +- .../ptq/presets/model/mamba_moe_fp8_aggressive.yaml | 2 +- .../ptq/presets/model/mamba_moe_fp8_conservative.yaml | 2 +- .../ptq/presets/model/mamba_moe_nvfp4_aggressive.yaml | 2 +- 
.../ptq/presets/model/mamba_moe_nvfp4_conservative.yaml | 2 +- modelopt_recipes/configs/ptq/presets/model/mxfp4.yaml | 2 +- .../configs/ptq/presets/model/mxfp4_mlp_weight_only.yaml | 2 +- modelopt_recipes/configs/ptq/presets/model/mxfp6.yaml | 2 +- modelopt_recipes/configs/ptq/presets/model/mxfp8.yaml | 2 +- modelopt_recipes/configs/ptq/presets/model/mxint8.yaml | 2 +- modelopt_recipes/configs/ptq/presets/model/nvfp4.yaml | 2 +- .../configs/ptq/presets/model/nvfp4_awq_clip.yaml | 2 +- .../configs/ptq/presets/model/nvfp4_awq_full.yaml | 2 +- .../configs/ptq/presets/model/nvfp4_awq_lite.yaml | 2 +- .../configs/ptq/presets/model/nvfp4_experts_only.yaml | 2 +- .../configs/ptq/presets/model/nvfp4_fp8_mha.yaml | 2 +- .../configs/ptq/presets/model/nvfp4_mlp_only.yaml | 2 +- .../configs/ptq/presets/model/nvfp4_mlp_weight_only.yaml | 2 +- .../configs/ptq/presets/model/nvfp4_omlp_only.yaml | 2 +- .../configs/ptq/presets/model/nvfp4_svdquant.yaml | 2 +- .../presets/model/nvfp4_w4a4_weight_local_hessian.yaml | 2 +- .../presets/model/nvfp4_w4a4_weight_mse_fp8_sweep.yaml | 2 +- .../configs/ptq/presets/model/w4a8_awq_beta.yaml | 2 +- .../configs/ptq/presets/model/w4a8_mxfp4_fp8.yaml | 2 +- .../configs/ptq/presets/model/w4a8_nvfp4_fp8.yaml | 2 +- modelopt_recipes/configs/ptq/units/base_disable_all.yaml | 2 +- .../configs/ptq/units/default_disabled_quantizers.yaml | 2 +- modelopt_recipes/configs/ptq/units/kv_fp8.yaml | 2 +- modelopt_recipes/configs/ptq/units/kv_fp8_affine.yaml | 2 +- modelopt_recipes/configs/ptq/units/kv_fp8_cast.yaml | 2 +- modelopt_recipes/configs/ptq/units/kv_nvfp4.yaml | 2 +- modelopt_recipes/configs/ptq/units/kv_nvfp4_affine.yaml | 2 +- modelopt_recipes/configs/ptq/units/kv_nvfp4_cast.yaml | 2 +- modelopt_recipes/configs/ptq/units/kv_nvfp4_rotate.yaml | 2 +- .../configs/ptq/units/mamba_moe_disabled_quantizers.yaml | 2 +- modelopt_recipes/configs/ptq/units/w4a4_nvfp4_nvfp4.yaml | 2 +- modelopt_recipes/configs/ptq/units/w8a8_fp8_fp8.yaml | 2 +- modelopt_recipes/general/ptq/fp8_default-kv_fp8.yaml | 5 ++++- modelopt_recipes/general/ptq/fp8_default-kv_fp8_cast.yaml | 6 ++++-- modelopt_recipes/general/ptq/nvfp4_default-kv_fp8.yaml | 6 +++++- .../general/ptq/nvfp4_default-kv_fp8_cast.yaml | 6 ++++-- .../general/ptq/nvfp4_default-kv_none-gptq.yaml | 6 +++++- .../general/ptq/nvfp4_default-kv_nvfp4_cast.yaml | 8 ++++---- .../general/ptq/nvfp4_experts_only-kv_fp8.yaml | 6 +++++- .../general/ptq/nvfp4_experts_only_mse-kv_fp8_cast.yaml | 6 +++++- modelopt_recipes/general/ptq/nvfp4_mlp_only-kv_fp8.yaml | 6 +++++- .../general/ptq/nvfp4_mlp_only_mse-kv_fp8_cast.yaml | 6 +++++- modelopt_recipes/general/ptq/nvfp4_omlp_only-kv_fp8.yaml | 6 +++++- modelopt_recipes/general/speculative_decoding/dflash.yaml | 2 +- modelopt_recipes/general/speculative_decoding/eagle3.yaml | 2 +- modelopt_recipes/models/Step3.5-Flash/nvfp4-mlp-only.yaml | 6 +++++- 73 files changed, 118 insertions(+), 80 deletions(-) diff --git a/modelopt_recipes/configs/numerics/fp8.yaml b/modelopt_recipes/configs/numerics/fp8.yaml index ad85a6320e..7761dd106c 100644 --- a/modelopt_recipes/configs/numerics/fp8.yaml +++ b/modelopt_recipes/configs/numerics/fp8.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# FP8 E4M3 per-tensor quantizer attributes. +# Per-tensor FP8 E4M3 quantizer attributes. 
# modelopt-schema: modelopt.torch.quantization.config.QuantizerAttributeConfig num_bits: e4m3 diff --git a/modelopt_recipes/configs/numerics/int4_per_block.yaml b/modelopt_recipes/configs/numerics/int4_per_block.yaml index 1f64bdb155..35d9f53a17 100644 --- a/modelopt_recipes/configs/numerics/int4_per_block.yaml +++ b/modelopt_recipes/configs/numerics/int4_per_block.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# INT4 static per-block quantizer attributes. +# Static INT4 quantizer attributes with 128-value blocks on the last dimension. # modelopt-schema: modelopt.torch.quantization.config.QuantizerAttributeConfig num_bits: 4 diff --git a/modelopt_recipes/configs/numerics/int8_per_channel.yaml b/modelopt_recipes/configs/numerics/int8_per_channel.yaml index 9e00034903..31c10635fc 100644 --- a/modelopt_recipes/configs/numerics/int8_per_channel.yaml +++ b/modelopt_recipes/configs/numerics/int8_per_channel.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# INT8 per-channel quantizer attributes. +# Per-channel INT8 quantizer attributes with axis 0. # modelopt-schema: modelopt.torch.quantization.config.QuantizerAttributeConfig num_bits: 8 diff --git a/modelopt_recipes/configs/numerics/mxfp4.yaml b/modelopt_recipes/configs/numerics/mxfp4.yaml index cb347fb3e6..f32fde304f 100644 --- a/modelopt_recipes/configs/numerics/mxfp4.yaml +++ b/modelopt_recipes/configs/numerics/mxfp4.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# MXFP4 E2M1 dynamic block quantizer attributes with E8M0 scales. +# Dynamic MXFP4 E2M1 block quantizer attributes with E8M0 scales. # modelopt-schema: modelopt.torch.quantization.config.QuantizerAttributeConfig num_bits: e2m1 diff --git a/modelopt_recipes/configs/numerics/mxfp6.yaml b/modelopt_recipes/configs/numerics/mxfp6.yaml index 655f48215f..f8849edd29 100644 --- a/modelopt_recipes/configs/numerics/mxfp6.yaml +++ b/modelopt_recipes/configs/numerics/mxfp6.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# MXFP6 E3M2 dynamic block quantizer attributes with E8M0 scales. +# Dynamic MXFP6 E3M2 block quantizer attributes with E8M0 scales. # modelopt-schema: modelopt.torch.quantization.config.QuantizerAttributeConfig num_bits: e3m2 diff --git a/modelopt_recipes/configs/numerics/mxfp8.yaml b/modelopt_recipes/configs/numerics/mxfp8.yaml index 5ce6197fc1..46cb3d9f7c 100644 --- a/modelopt_recipes/configs/numerics/mxfp8.yaml +++ b/modelopt_recipes/configs/numerics/mxfp8.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# MXFP8 E4M3 dynamic block quantizer attributes with E8M0 scales. +# Dynamic MXFP8 E4M3 block quantizer attributes with E8M0 scales. # modelopt-schema: modelopt.torch.quantization.config.QuantizerAttributeConfig num_bits: e4m3 diff --git a/modelopt_recipes/configs/numerics/mxint8.yaml b/modelopt_recipes/configs/numerics/mxint8.yaml index cbf8eaa161..388b251de6 100644 --- a/modelopt_recipes/configs/numerics/mxint8.yaml +++ b/modelopt_recipes/configs/numerics/mxint8.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# MXINT8 dynamic block quantizer attributes with E8M0 scales. 
+# Dynamic MXINT8 block quantizer attributes with E8M0 scales. # modelopt-schema: modelopt.torch.quantization.config.QuantizerAttributeConfig num_bits: 8 diff --git a/modelopt_recipes/configs/numerics/nvfp4.yaml b/modelopt_recipes/configs/numerics/nvfp4.yaml index 68629c009f..88598e36e8 100644 --- a/modelopt_recipes/configs/numerics/nvfp4.yaml +++ b/modelopt_recipes/configs/numerics/nvfp4.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# NVFP4 E2M1 blockwise quantizer attributes with FP8 E4M3 scales (dynamic calibration, the default). +# Dynamic NVFP4 E2M1 block quantizer attributes with FP8 E4M3 scales. # modelopt-schema: modelopt.torch.quantization.config.QuantizerAttributeConfig num_bits: e2m1 diff --git a/modelopt_recipes/configs/numerics/nvfp4_bs32.yaml b/modelopt_recipes/configs/numerics/nvfp4_bs32.yaml index 57392366cd..a84b63a91d 100644 --- a/modelopt_recipes/configs/numerics/nvfp4_bs32.yaml +++ b/modelopt_recipes/configs/numerics/nvfp4_bs32.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# NVFP4 E2M1 blockwise quantizer attributes with FP8 E4M3 scales and block size 32. +# Dynamic NVFP4 E2M1 block quantizer attributes with FP8 E4M3 scales and block size 32. # modelopt-schema: modelopt.torch.quantization.config.QuantizerAttributeConfig num_bits: e2m1 diff --git a/modelopt_recipes/configs/numerics/nvfp4_static.yaml b/modelopt_recipes/configs/numerics/nvfp4_static.yaml index 5f908c21e5..9f6ac62e11 100644 --- a/modelopt_recipes/configs/numerics/nvfp4_static.yaml +++ b/modelopt_recipes/configs/numerics/nvfp4_static.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# NVFP4 E2M1 blockwise quantizer attributes with static FP8 E4M3 scales. +# Static NVFP4 E2M1 block quantizer attributes with FP8 E4M3 scales. # modelopt-schema: modelopt.torch.quantization.config.QuantizerAttributeConfig num_bits: e2m1 diff --git a/modelopt_recipes/configs/ptq/presets/kv/fp8.yaml b/modelopt_recipes/configs/ptq/presets/kv/fp8.yaml index e68e9333cd..21894ef9c0 100644 --- a/modelopt_recipes/configs/ptq/presets/kv/fp8.yaml +++ b/modelopt_recipes/configs/ptq/presets/kv/fp8.yaml @@ -13,9 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -# FP8 E4M3 KV cache quantization preset. -# This is a partial config (no algorithm, no base_disable_all) — designed -# to be merged with a primary model quantization config. +# Partial QuantizeConfig that enables FP8 E4M3 KV-cache quantizers. +# Merge this fragment with a primary model quantization preset. # modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig imports: diff --git a/modelopt_recipes/configs/ptq/presets/kv/fp8_affine.yaml b/modelopt_recipes/configs/ptq/presets/kv/fp8_affine.yaml index 6c17aa0f3e..4540df34ea 100644 --- a/modelopt_recipes/configs/ptq/presets/kv/fp8_affine.yaml +++ b/modelopt_recipes/configs/ptq/presets/kv/fp8_affine.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# FP8 E4M3 affine KV cache quantization fragment. +# Partial QuantizeConfig that enables affine FP8 E4M3 KV-cache quantizers. 
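A composed recipe merges one of these KV fragments onto a model preset purely
through imports. A minimal sketch of the quantize section of such a recipe,
assuming the unit import keys used elsewhere in this series (the entry order
is illustrative, not copied from any file in this patch):

  quantize:
    algorithm: max
    quant_cfg:
      - $import: base_disable_all             # deny-all baseline
      - $import: w8a8_fp8_fp8                 # enable FP8 weight/input quantizers
      - $import: kv_fp8                       # enable FP8 KV-cache quantizers
      - $import: default_disabled_quantizers  # standard exclusions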
# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig imports: diff --git a/modelopt_recipes/configs/ptq/presets/kv/nvfp4.yaml b/modelopt_recipes/configs/ptq/presets/kv/nvfp4.yaml index 480ddebd61..6d759e2c11 100644 --- a/modelopt_recipes/configs/ptq/presets/kv/nvfp4.yaml +++ b/modelopt_recipes/configs/ptq/presets/kv/nvfp4.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# NVFP4 KV cache quantization fragment. +# Partial QuantizeConfig that enables NVFP4 KV-cache quantizers. # modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig imports: diff --git a/modelopt_recipes/configs/ptq/presets/kv/nvfp4_affine.yaml b/modelopt_recipes/configs/ptq/presets/kv/nvfp4_affine.yaml index 074ce9b6c3..1f2a871010 100644 --- a/modelopt_recipes/configs/ptq/presets/kv/nvfp4_affine.yaml +++ b/modelopt_recipes/configs/ptq/presets/kv/nvfp4_affine.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# NVFP4 affine KV cache quantization fragment. +# Partial QuantizeConfig that enables affine NVFP4 KV-cache quantizers. # modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig imports: diff --git a/modelopt_recipes/configs/ptq/presets/kv/nvfp4_rotate.yaml b/modelopt_recipes/configs/ptq/presets/kv/nvfp4_rotate.yaml index 684c9677e2..2451ee1a35 100644 --- a/modelopt_recipes/configs/ptq/presets/kv/nvfp4_rotate.yaml +++ b/modelopt_recipes/configs/ptq/presets/kv/nvfp4_rotate.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# NVFP4 KV cache rotate preset. +# Partial QuantizeConfig that enables rotated NVFP4 KV-cache quantizers. # modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig imports: diff --git a/modelopt_recipes/configs/ptq/presets/model/fp8.yaml b/modelopt_recipes/configs/ptq/presets/model/fp8.yaml index a1a8718982..423904a6e1 100644 --- a/modelopt_recipes/configs/ptq/presets/model/fp8.yaml +++ b/modelopt_recipes/configs/ptq/presets/model/fp8.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# FP8 per-tensor weight and activation (W8A8), max calibration. +# QuantizeConfig preset for W8A8 FP8 E4M3 with per-tensor weights and inputs. # modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig imports: diff --git a/modelopt_recipes/configs/ptq/presets/model/fp8_2d_blockwise_weight_only.yaml b/modelopt_recipes/configs/ptq/presets/model/fp8_2d_blockwise_weight_only.yaml index d39e57451a..136f956288 100644 --- a/modelopt_recipes/configs/ptq/presets/model/fp8_2d_blockwise_weight_only.yaml +++ b/modelopt_recipes/configs/ptq/presets/model/fp8_2d_blockwise_weight_only.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# FP8 2D blockwise fake quantization config for DeepSeek models. +# QuantizeConfig preset for FP8 E4M3 2D blockwise weight-only quantization. 
# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig imports: diff --git a/modelopt_recipes/configs/ptq/presets/model/fp8_per_channel_per_token.yaml b/modelopt_recipes/configs/ptq/presets/model/fp8_per_channel_per_token.yaml index 4df9cb1c82..8c3f1d78cc 100644 --- a/modelopt_recipes/configs/ptq/presets/model/fp8_per_channel_per_token.yaml +++ b/modelopt_recipes/configs/ptq/presets/model/fp8_per_channel_per_token.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# FP8 per-channel weights and per-token dynamic activations. +# QuantizeConfig preset for FP8 E4M3 per-channel weights and per-token dynamic inputs. # modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig imports: diff --git a/modelopt_recipes/configs/ptq/presets/model/int4_awq.yaml b/modelopt_recipes/configs/ptq/presets/model/int4_awq.yaml index 70084426e6..828aef7d06 100644 --- a/modelopt_recipes/configs/ptq/presets/model/int4_awq.yaml +++ b/modelopt_recipes/configs/ptq/presets/model/int4_awq.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# INT4 AWQ weight-only preset. +# QuantizeConfig preset for AWQ-lite INT4 weight-only quantization. # modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig imports: diff --git a/modelopt_recipes/configs/ptq/presets/model/int4_blockwise_weight_only.yaml b/modelopt_recipes/configs/ptq/presets/model/int4_blockwise_weight_only.yaml index 2ade851eee..beb3f20718 100644 --- a/modelopt_recipes/configs/ptq/presets/model/int4_blockwise_weight_only.yaml +++ b/modelopt_recipes/configs/ptq/presets/model/int4_blockwise_weight_only.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# INT4 blockwise weight-only preset. +# QuantizeConfig preset for INT4 blockwise weight-only quantization. # modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig imports: diff --git a/modelopt_recipes/configs/ptq/presets/model/int8.yaml b/modelopt_recipes/configs/ptq/presets/model/int8.yaml index 88910a2d09..7610d74a0d 100644 --- a/modelopt_recipes/configs/ptq/presets/model/int8.yaml +++ b/modelopt_recipes/configs/ptq/presets/model/int8.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# INT8 per-channel weights and per-tensor activations, max calibration. +# QuantizeConfig preset for INT8 per-channel weights and per-tensor inputs. # modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig imports: diff --git a/modelopt_recipes/configs/ptq/presets/model/int8_smoothquant.yaml b/modelopt_recipes/configs/ptq/presets/model/int8_smoothquant.yaml index 1bf2dee3da..e560a62391 100644 --- a/modelopt_recipes/configs/ptq/presets/model/int8_smoothquant.yaml +++ b/modelopt_recipes/configs/ptq/presets/model/int8_smoothquant.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# INT8 SmoothQuant preset. +# QuantizeConfig preset for SmoothQuant INT8 per-channel weights and per-tensor inputs. 
# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig imports: diff --git a/modelopt_recipes/configs/ptq/presets/model/int8_weight_only.yaml b/modelopt_recipes/configs/ptq/presets/model/int8_weight_only.yaml index 5a96a3e884..cc475ab610 100644 --- a/modelopt_recipes/configs/ptq/presets/model/int8_weight_only.yaml +++ b/modelopt_recipes/configs/ptq/presets/model/int8_weight_only.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# INT8 weight-only preset. +# QuantizeConfig preset for INT8 per-channel weight-only quantization. # modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig imports: diff --git a/modelopt_recipes/configs/ptq/presets/model/mamba_moe_fp8_aggressive.yaml b/modelopt_recipes/configs/ptq/presets/model/mamba_moe_fp8_aggressive.yaml index 0b9f5b1b20..24fb95897a 100644 --- a/modelopt_recipes/configs/ptq/presets/model/mamba_moe_fp8_aggressive.yaml +++ b/modelopt_recipes/configs/ptq/presets/model/mamba_moe_fp8_aggressive.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Mamba-MoE FP8 aggressive preset. +# QuantizeConfig preset for FP8 W8A8 Mamba-MoE quantization with shared exclusions. # modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig imports: diff --git a/modelopt_recipes/configs/ptq/presets/model/mamba_moe_fp8_conservative.yaml b/modelopt_recipes/configs/ptq/presets/model/mamba_moe_fp8_conservative.yaml index be44d6dec8..b943b31dcd 100644 --- a/modelopt_recipes/configs/ptq/presets/model/mamba_moe_fp8_conservative.yaml +++ b/modelopt_recipes/configs/ptq/presets/model/mamba_moe_fp8_conservative.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Mamba-MoE FP8 conservative preset. +# QuantizeConfig preset for FP8 W8A8 Mamba-MoE quantization with mixer projections disabled. # modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig imports: diff --git a/modelopt_recipes/configs/ptq/presets/model/mamba_moe_nvfp4_aggressive.yaml b/modelopt_recipes/configs/ptq/presets/model/mamba_moe_nvfp4_aggressive.yaml index 79d929a101..6346548eb8 100644 --- a/modelopt_recipes/configs/ptq/presets/model/mamba_moe_nvfp4_aggressive.yaml +++ b/modelopt_recipes/configs/ptq/presets/model/mamba_moe_nvfp4_aggressive.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Mamba-MoE NVFP4 aggressive preset. +# QuantizeConfig preset for NVFP4 W4A4 Mamba-MoE quantization with shared exclusions. # modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig imports: diff --git a/modelopt_recipes/configs/ptq/presets/model/mamba_moe_nvfp4_conservative.yaml b/modelopt_recipes/configs/ptq/presets/model/mamba_moe_nvfp4_conservative.yaml index fb525f402b..f94a4b1fc4 100644 --- a/modelopt_recipes/configs/ptq/presets/model/mamba_moe_nvfp4_conservative.yaml +++ b/modelopt_recipes/configs/ptq/presets/model/mamba_moe_nvfp4_conservative.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Mamba-MoE NVFP4 conservative preset. +# QuantizeConfig preset for NVFP4 W4A4 Mamba-MoE quantization with mixer projections disabled. 
# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig imports: diff --git a/modelopt_recipes/configs/ptq/presets/model/mxfp4.yaml b/modelopt_recipes/configs/ptq/presets/model/mxfp4.yaml index e9f3724297..982e22144e 100644 --- a/modelopt_recipes/configs/ptq/presets/model/mxfp4.yaml +++ b/modelopt_recipes/configs/ptq/presets/model/mxfp4.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# MXFP4 dynamic block quantization preset. +# QuantizeConfig preset for dynamic MXFP4 block quantization on weights and inputs. # modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig imports: diff --git a/modelopt_recipes/configs/ptq/presets/model/mxfp4_mlp_weight_only.yaml b/modelopt_recipes/configs/ptq/presets/model/mxfp4_mlp_weight_only.yaml index e97d4c7d54..8d03600e87 100644 --- a/modelopt_recipes/configs/ptq/presets/model/mxfp4_mlp_weight_only.yaml +++ b/modelopt_recipes/configs/ptq/presets/model/mxfp4_mlp_weight_only.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# MXFP4 MLP weight-only preset. +# QuantizeConfig preset for dynamic MXFP4 block weight-only quantization on MLP/MoE layers. # modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig imports: diff --git a/modelopt_recipes/configs/ptq/presets/model/mxfp6.yaml b/modelopt_recipes/configs/ptq/presets/model/mxfp6.yaml index c131388a8e..e8d590f384 100644 --- a/modelopt_recipes/configs/ptq/presets/model/mxfp6.yaml +++ b/modelopt_recipes/configs/ptq/presets/model/mxfp6.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# MXFP6 dynamic block quantization preset. +# QuantizeConfig preset for dynamic MXFP6 block quantization on weights and inputs. # modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig imports: diff --git a/modelopt_recipes/configs/ptq/presets/model/mxfp8.yaml b/modelopt_recipes/configs/ptq/presets/model/mxfp8.yaml index 17d4e64945..7cf2832311 100644 --- a/modelopt_recipes/configs/ptq/presets/model/mxfp8.yaml +++ b/modelopt_recipes/configs/ptq/presets/model/mxfp8.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# MXFP8 dynamic block quantization preset. +# QuantizeConfig preset for dynamic MXFP8 block quantization on weights and inputs. # modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig imports: diff --git a/modelopt_recipes/configs/ptq/presets/model/mxint8.yaml b/modelopt_recipes/configs/ptq/presets/model/mxint8.yaml index 2d0bb9959d..e6ef1ca3d0 100644 --- a/modelopt_recipes/configs/ptq/presets/model/mxint8.yaml +++ b/modelopt_recipes/configs/ptq/presets/model/mxint8.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# MXINT8 dynamic block quantization preset. +# QuantizeConfig preset for dynamic MXINT8 block quantization on weights and inputs. 
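All four MX presets (mxfp4, mxfp6, mxfp8, mxint8) import numerics snippets
that share the OCP MX shape: 32-element blocks on the last axis, each scaled
by a shared exponent-only E8M0 factor. A rough sketch of such a snippet,
assuming the same block_sizes encoding the NVFP4 configs use (the real
mxint8 numerics body is not shown in this patch):

  # modelopt-schema: modelopt.torch.quantization.config.QuantizerAttributeConfig
  num_bits: 8
  block_sizes:
    -1: 32
    type: dynamic
    scale_bits: [8, 0]  # E8M0: power-of-two scales, no mantissa bits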
# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig imports: diff --git a/modelopt_recipes/configs/ptq/presets/model/nvfp4.yaml b/modelopt_recipes/configs/ptq/presets/model/nvfp4.yaml index 733d504807..f569f50143 100644 --- a/modelopt_recipes/configs/ptq/presets/model/nvfp4.yaml +++ b/modelopt_recipes/configs/ptq/presets/model/nvfp4.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# NVFP4 default W4A4 preset. +# QuantizeConfig preset for dynamic NVFP4 W4A4 quantization on weights and inputs. # modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig imports: diff --git a/modelopt_recipes/configs/ptq/presets/model/nvfp4_awq_clip.yaml b/modelopt_recipes/configs/ptq/presets/model/nvfp4_awq_clip.yaml index b0af7b5f9c..d3cce28419 100644 --- a/modelopt_recipes/configs/ptq/presets/model/nvfp4_awq_clip.yaml +++ b/modelopt_recipes/configs/ptq/presets/model/nvfp4_awq_clip.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# NVFP4 AWQ clip preset. +# QuantizeConfig preset for NVFP4 W4A4 quantization with AWQ clip calibration. # modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig imports: diff --git a/modelopt_recipes/configs/ptq/presets/model/nvfp4_awq_full.yaml b/modelopt_recipes/configs/ptq/presets/model/nvfp4_awq_full.yaml index f3be6c9ef7..38934b9c05 100644 --- a/modelopt_recipes/configs/ptq/presets/model/nvfp4_awq_full.yaml +++ b/modelopt_recipes/configs/ptq/presets/model/nvfp4_awq_full.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# NVFP4 AWQ full preset. +# QuantizeConfig preset for NVFP4 W4A4 quantization with full AWQ calibration. # modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig imports: diff --git a/modelopt_recipes/configs/ptq/presets/model/nvfp4_awq_lite.yaml b/modelopt_recipes/configs/ptq/presets/model/nvfp4_awq_lite.yaml index b1915216ec..e69daf39e5 100644 --- a/modelopt_recipes/configs/ptq/presets/model/nvfp4_awq_lite.yaml +++ b/modelopt_recipes/configs/ptq/presets/model/nvfp4_awq_lite.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# NVFP4 AWQ lite preset. +# QuantizeConfig preset for NVFP4 W4A4 quantization with AWQ-lite calibration. # modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig imports: diff --git a/modelopt_recipes/configs/ptq/presets/model/nvfp4_experts_only.yaml b/modelopt_recipes/configs/ptq/presets/model/nvfp4_experts_only.yaml index fe1e10c374..e2b4a956b6 100644 --- a/modelopt_recipes/configs/ptq/presets/model/nvfp4_experts_only.yaml +++ b/modelopt_recipes/configs/ptq/presets/model/nvfp4_experts_only.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# NVFP4 experts-only preset. +# QuantizeConfig preset for dynamic NVFP4 W4A4 quantization on expert layers only. 
# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig imports: diff --git a/modelopt_recipes/configs/ptq/presets/model/nvfp4_fp8_mha.yaml b/modelopt_recipes/configs/ptq/presets/model/nvfp4_fp8_mha.yaml index 831391416a..fadb7b9bbc 100644 --- a/modelopt_recipes/configs/ptq/presets/model/nvfp4_fp8_mha.yaml +++ b/modelopt_recipes/configs/ptq/presets/model/nvfp4_fp8_mha.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Diffusers NVFP4 preset with FP8 MHA quantizers. +# QuantizeConfig preset for Diffusers NVFP4 with FP8 attention quantizers. # modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig imports: diff --git a/modelopt_recipes/configs/ptq/presets/model/nvfp4_mlp_only.yaml b/modelopt_recipes/configs/ptq/presets/model/nvfp4_mlp_only.yaml index 46947a228d..dbe32c0b3a 100644 --- a/modelopt_recipes/configs/ptq/presets/model/nvfp4_mlp_only.yaml +++ b/modelopt_recipes/configs/ptq/presets/model/nvfp4_mlp_only.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# NVFP4 MLP-only preset. +# QuantizeConfig preset for dynamic NVFP4 W4A4 quantization on MLP/MoE layers only. # modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig imports: diff --git a/modelopt_recipes/configs/ptq/presets/model/nvfp4_mlp_weight_only.yaml b/modelopt_recipes/configs/ptq/presets/model/nvfp4_mlp_weight_only.yaml index 01aa438e2b..952ea3a90d 100644 --- a/modelopt_recipes/configs/ptq/presets/model/nvfp4_mlp_weight_only.yaml +++ b/modelopt_recipes/configs/ptq/presets/model/nvfp4_mlp_weight_only.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# NVFP4 MLP weight-only preset with block size 32. +# QuantizeConfig preset for NVFP4 block-size-32 weight-only quantization on MLP/MoE layers. # modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig imports: diff --git a/modelopt_recipes/configs/ptq/presets/model/nvfp4_omlp_only.yaml b/modelopt_recipes/configs/ptq/presets/model/nvfp4_omlp_only.yaml index eee8c5cdb4..1b7e1cbd7c 100644 --- a/modelopt_recipes/configs/ptq/presets/model/nvfp4_omlp_only.yaml +++ b/modelopt_recipes/configs/ptq/presets/model/nvfp4_omlp_only.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# NVFP4 output-projection and MLP preset. +# QuantizeConfig preset for dynamic NVFP4 W4A4 quantization on output projections and MLP/MoE layers. # modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig imports: diff --git a/modelopt_recipes/configs/ptq/presets/model/nvfp4_svdquant.yaml b/modelopt_recipes/configs/ptq/presets/model/nvfp4_svdquant.yaml index 85f9aec7a5..8101d66621 100644 --- a/modelopt_recipes/configs/ptq/presets/model/nvfp4_svdquant.yaml +++ b/modelopt_recipes/configs/ptq/presets/model/nvfp4_svdquant.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# NVFP4 SVDQuant preset. +# QuantizeConfig preset for NVFP4 W4A4 quantization with SVDQuant low-rank calibration. 
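Note that the algorithm field accepts either a bare method name
(algorithm: max) or a mapping with method options. The SVDQuant preset
presumably uses the mapping form; the Diffusers dict removed later in this
series spells out the equivalent options:

  algorithm:
    method: svdquant
    lowrank: 32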
# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig imports: diff --git a/modelopt_recipes/configs/ptq/presets/model/nvfp4_w4a4_weight_local_hessian.yaml b/modelopt_recipes/configs/ptq/presets/model/nvfp4_w4a4_weight_local_hessian.yaml index c31e1619da..ac6a3094b7 100644 --- a/modelopt_recipes/configs/ptq/presets/model/nvfp4_w4a4_weight_local_hessian.yaml +++ b/modelopt_recipes/configs/ptq/presets/model/nvfp4_w4a4_weight_local_hessian.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# NVFP4 W4A4 with static weight scales optimized by local Hessian calibration. +# QuantizeConfig preset for NVFP4 W4A4 with static weight scales from local-Hessian calibration. # modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig imports: diff --git a/modelopt_recipes/configs/ptq/presets/model/nvfp4_w4a4_weight_mse_fp8_sweep.yaml b/modelopt_recipes/configs/ptq/presets/model/nvfp4_w4a4_weight_mse_fp8_sweep.yaml index 0c4b23ad53..3ae22dbc3a 100644 --- a/modelopt_recipes/configs/ptq/presets/model/nvfp4_w4a4_weight_mse_fp8_sweep.yaml +++ b/modelopt_recipes/configs/ptq/presets/model/nvfp4_w4a4_weight_mse_fp8_sweep.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# NVFP4 W4A4 with static weight scales optimized by MSE FP8 scale sweep. +# QuantizeConfig preset for NVFP4 W4A4 with static weight scales from MSE FP8-scale sweep. # modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig imports: diff --git a/modelopt_recipes/configs/ptq/presets/model/w4a8_awq_beta.yaml b/modelopt_recipes/configs/ptq/presets/model/w4a8_awq_beta.yaml index 5bac7f27fa..12073e1460 100644 --- a/modelopt_recipes/configs/ptq/presets/model/w4a8_awq_beta.yaml +++ b/modelopt_recipes/configs/ptq/presets/model/w4a8_awq_beta.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# W4A8 AWQ beta: INT4 blockwise weights followed by FP8 weights, FP8 activations. +# QuantizeConfig preset for W4A8 AWQ-lite with INT4 block weights and FP8 inputs. # modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig imports: diff --git a/modelopt_recipes/configs/ptq/presets/model/w4a8_mxfp4_fp8.yaml b/modelopt_recipes/configs/ptq/presets/model/w4a8_mxfp4_fp8.yaml index d4e11d34ad..428cb659da 100644 --- a/modelopt_recipes/configs/ptq/presets/model/w4a8_mxfp4_fp8.yaml +++ b/modelopt_recipes/configs/ptq/presets/model/w4a8_mxfp4_fp8.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# MXFP4 weights with FP8 activations. +# QuantizeConfig preset for W4A8 with MXFP4 block weights and FP8 inputs. # modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig imports: diff --git a/modelopt_recipes/configs/ptq/presets/model/w4a8_nvfp4_fp8.yaml b/modelopt_recipes/configs/ptq/presets/model/w4a8_nvfp4_fp8.yaml index 97053fe7c1..86b335cbc1 100644 --- a/modelopt_recipes/configs/ptq/presets/model/w4a8_nvfp4_fp8.yaml +++ b/modelopt_recipes/configs/ptq/presets/model/w4a8_nvfp4_fp8.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# NVFP4 block-size-32 weights with FP8 activations. +# QuantizeConfig preset for W4A8 with NVFP4 block-size-32 weights and FP8 inputs. 
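One detail the shortened w4a8_awq_beta description drops: the preset stacks
two quantizers on the same weights by giving cfg a list value, as its
quant_cfg hunk earlier in this series shows:

  - quantizer_name: '*weight_quantizer'
    cfg:
      - $import: int4_per_block
      - $import: fp8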
# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig imports: diff --git a/modelopt_recipes/configs/ptq/units/base_disable_all.yaml b/modelopt_recipes/configs/ptq/units/base_disable_all.yaml index 9a520ee207..ee96d00411 100644 --- a/modelopt_recipes/configs/ptq/units/base_disable_all.yaml +++ b/modelopt_recipes/configs/ptq/units/base_disable_all.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Disable all quantizers by default (deny-all-then-configure pattern). +# QuantizerCfgList snippet that disables every quantizer before selective re-enabling. # modelopt-schema: modelopt.torch.quantization.config.QuantizerCfgEntry quantizer_name: '*' diff --git a/modelopt_recipes/configs/ptq/units/default_disabled_quantizers.yaml b/modelopt_recipes/configs/ptq/units/default_disabled_quantizers.yaml index 1508f94277..86d5a64c67 100644 --- a/modelopt_recipes/configs/ptq/units/default_disabled_quantizers.yaml +++ b/modelopt_recipes/configs/ptq/units/default_disabled_quantizers.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Standard quantizer exclusions: layers that should not be quantized. +# QuantizerCfgList snippet for standard module patterns that should remain unquantized. # modelopt-schema: modelopt.torch.quantization.config.QuantizerCfgListConfig - quantizer_name: '*block_sparse_moe.gate*' diff --git a/modelopt_recipes/configs/ptq/units/kv_fp8.yaml b/modelopt_recipes/configs/ptq/units/kv_fp8.yaml index 646be96709..86156e5e95 100644 --- a/modelopt_recipes/configs/ptq/units/kv_fp8.yaml +++ b/modelopt_recipes/configs/ptq/units/kv_fp8.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# FP8 E4M3 KV cache quantization. +# QuantizerCfgList snippet that enables FP8 E4M3 KV-cache quantizers. # # This snippet uses multi-document YAML (separated by ---) because it is a # list-valued snippet that also needs to $import another snippet. YAML only diff --git a/modelopt_recipes/configs/ptq/units/kv_fp8_affine.yaml b/modelopt_recipes/configs/ptq/units/kv_fp8_affine.yaml index 9832ba6b09..5458e5511c 100644 --- a/modelopt_recipes/configs/ptq/units/kv_fp8_affine.yaml +++ b/modelopt_recipes/configs/ptq/units/kv_fp8_affine.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# FP8 E4M3 affine KV cache quantization. +# QuantizerCfgList snippet that enables affine FP8 E4M3 KV-cache quantizers. # modelopt-schema: modelopt.torch.quantization.config.QuantizerCfgListConfig imports: diff --git a/modelopt_recipes/configs/ptq/units/kv_fp8_cast.yaml b/modelopt_recipes/configs/ptq/units/kv_fp8_cast.yaml index 64cfbd47bc..606c969ab3 100644 --- a/modelopt_recipes/configs/ptq/units/kv_fp8_cast.yaml +++ b/modelopt_recipes/configs/ptq/units/kv_fp8_cast.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# FP8 E4M3 KV cache quantization with constant amax. +# QuantizerCfgList snippet that enables FP8 E4M3 KV-cache quantizers with constant amax. 
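The multi-document note in kv_fp8.yaml is worth unpacking: a YAML document
holds exactly one top-level node, so a snippet whose payload is a list cannot
also carry a top-level imports mapping in the same document; the imports live
in a leading document of their own, separated by ---. A sketch of that shape
(the quantizer pattern and import key are illustrative; the full kv_fp8.yaml
body is not shown here):

  # modelopt-schema: modelopt.torch.quantization.config.QuantizerCfgListConfig
  imports:
    fp8: configs/numerics/fp8
  ---
  - quantizer_name: '*[kv]_bmm_quantizer'
    cfg:
      $import: fp8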
# modelopt-schema: modelopt.torch.quantization.config.QuantizerCfgListConfig imports: diff --git a/modelopt_recipes/configs/ptq/units/kv_nvfp4.yaml b/modelopt_recipes/configs/ptq/units/kv_nvfp4.yaml index 4c34a783a8..a95b854a0a 100644 --- a/modelopt_recipes/configs/ptq/units/kv_nvfp4.yaml +++ b/modelopt_recipes/configs/ptq/units/kv_nvfp4.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# NVFP4 KV cache quantization. +# QuantizerCfgList snippet that enables NVFP4 KV-cache quantizers. # modelopt-schema: modelopt.torch.quantization.config.QuantizerCfgListConfig imports: diff --git a/modelopt_recipes/configs/ptq/units/kv_nvfp4_affine.yaml b/modelopt_recipes/configs/ptq/units/kv_nvfp4_affine.yaml index e3988eecab..2122e8b343 100644 --- a/modelopt_recipes/configs/ptq/units/kv_nvfp4_affine.yaml +++ b/modelopt_recipes/configs/ptq/units/kv_nvfp4_affine.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# NVFP4 affine KV cache quantization. +# QuantizerCfgList snippet that enables affine NVFP4 KV-cache quantizers. # modelopt-schema: modelopt.torch.quantization.config.QuantizerCfgListConfig imports: diff --git a/modelopt_recipes/configs/ptq/units/kv_nvfp4_cast.yaml b/modelopt_recipes/configs/ptq/units/kv_nvfp4_cast.yaml index 3fc5d597aa..b5658c2ff1 100644 --- a/modelopt_recipes/configs/ptq/units/kv_nvfp4_cast.yaml +++ b/modelopt_recipes/configs/ptq/units/kv_nvfp4_cast.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# NVFP4 KV cache quantization with constant amax. +# QuantizerCfgList snippet that enables NVFP4 KV-cache quantizers with constant amax. # # The deployment kernel upcasts NVFP4 KV values to FP8 before attention, so the # scale must land in the FP8 range. diff --git a/modelopt_recipes/configs/ptq/units/kv_nvfp4_rotate.yaml b/modelopt_recipes/configs/ptq/units/kv_nvfp4_rotate.yaml index 50e8efa468..b117edbf1b 100644 --- a/modelopt_recipes/configs/ptq/units/kv_nvfp4_rotate.yaml +++ b/modelopt_recipes/configs/ptq/units/kv_nvfp4_rotate.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# NVFP4 KV cache quantization with rotation. +# QuantizerCfgList snippet that enables rotated NVFP4 KV-cache quantizers. # modelopt-schema: modelopt.torch.quantization.config.QuantizerCfgListConfig imports: diff --git a/modelopt_recipes/configs/ptq/units/mamba_moe_disabled_quantizers.yaml b/modelopt_recipes/configs/ptq/units/mamba_moe_disabled_quantizers.yaml index b079c54f89..c9b87f8d21 100644 --- a/modelopt_recipes/configs/ptq/units/mamba_moe_disabled_quantizers.yaml +++ b/modelopt_recipes/configs/ptq/units/mamba_moe_disabled_quantizers.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Mamba-MoE exclusions shared by the aggressive and conservative presets. +# QuantizerCfgList snippet with Mamba/MoE-specific exclusion patterns. 
# modelopt-schema: modelopt.torch.quantization.config.QuantizerCfgListConfig - quantizer_name: '*fc1_latent_proj*' diff --git a/modelopt_recipes/configs/ptq/units/w4a4_nvfp4_nvfp4.yaml b/modelopt_recipes/configs/ptq/units/w4a4_nvfp4_nvfp4.yaml index 033cdf7669..010d81ab62 100644 --- a/modelopt_recipes/configs/ptq/units/w4a4_nvfp4_nvfp4.yaml +++ b/modelopt_recipes/configs/ptq/units/w4a4_nvfp4_nvfp4.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# W4A4 NVFP4: NVFP4 E2M1 dynamic weight and activation quantizers. +# QuantizerCfgList snippet that enables dynamic NVFP4 on weight and input quantizers. # modelopt-schema: modelopt.torch.quantization.config.QuantizerCfgListConfig imports: diff --git a/modelopt_recipes/configs/ptq/units/w8a8_fp8_fp8.yaml b/modelopt_recipes/configs/ptq/units/w8a8_fp8_fp8.yaml index 07db59ff3b..068f38d149 100644 --- a/modelopt_recipes/configs/ptq/units/w8a8_fp8_fp8.yaml +++ b/modelopt_recipes/configs/ptq/units/w8a8_fp8_fp8.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# W8A8 FP8: FP8 E4M3 weight and activation quantizers. +# QuantizerCfgList snippet that enables per-tensor FP8 E4M3 on weight and input quantizers. # modelopt-schema: modelopt.torch.quantization.config.QuantizerCfgListConfig imports: diff --git a/modelopt_recipes/general/ptq/fp8_default-kv_fp8.yaml b/modelopt_recipes/general/ptq/fp8_default-kv_fp8.yaml index 4c6ba99e11..ea2ac56729 100644 --- a/modelopt_recipes/general/ptq/fp8_default-kv_fp8.yaml +++ b/modelopt_recipes/general/ptq/fp8_default-kv_fp8.yaml @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Composed PTQ recipe for W8A8 FP8 E4M3 model quantization with FP8 KV-cache quantization. + imports: base_disable_all: configs/ptq/units/base_disable_all default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers @@ -21,7 +23,8 @@ imports: metadata: recipe_type: ptq - description: FP8 per-tensor weight and activation (W8A8), FP8 KV cache, max calibration. + description: >- + Composes W8A8 FP8 E4M3 model quantization with FP8 KV-cache quantization; uses max calibration. quantize: algorithm: max quant_cfg: diff --git a/modelopt_recipes/general/ptq/fp8_default-kv_fp8_cast.yaml b/modelopt_recipes/general/ptq/fp8_default-kv_fp8_cast.yaml index f99a716ced..4e24bf5327 100644 --- a/modelopt_recipes/general/ptq/fp8_default-kv_fp8_cast.yaml +++ b/modelopt_recipes/general/ptq/fp8_default-kv_fp8_cast.yaml @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Composed PTQ recipe for W8A8 FP8 E4M3 model quantization with FP8 KV-cache cast mode. + imports: base_disable_all: configs/ptq/units/base_disable_all default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers @@ -22,8 +24,8 @@ imports: metadata: recipe_type: ptq description: >- - FP8 per-tensor weight and activation (W8A8), FP8 KV cache with constant amax - (skips KV calibration; amax hardcoded to FP8 E4M3 max 448.0), max calibration. + Composes W8A8 FP8 E4M3 model quantization with FP8 KV-cache cast mode using constant amax; uses + max calibration. 
quantize: algorithm: max quant_cfg: diff --git a/modelopt_recipes/general/ptq/nvfp4_default-kv_fp8.yaml b/modelopt_recipes/general/ptq/nvfp4_default-kv_fp8.yaml index 63b6d673b9..6a65efef57 100644 --- a/modelopt_recipes/general/ptq/nvfp4_default-kv_fp8.yaml +++ b/modelopt_recipes/general/ptq/nvfp4_default-kv_fp8.yaml @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Composed PTQ recipe for dynamic NVFP4 W4A4 model quantization with FP8 KV-cache quantization. + imports: base_disable_all: configs/ptq/units/base_disable_all default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers @@ -21,7 +23,9 @@ imports: metadata: recipe_type: ptq - description: NVFP4 W4A4, FP8 KV cache, max calibration. + description: >- + Composes dynamic NVFP4 W4A4 model quantization with FP8 KV-cache quantization; uses max + calibration. quantize: algorithm: max quant_cfg: diff --git a/modelopt_recipes/general/ptq/nvfp4_default-kv_fp8_cast.yaml b/modelopt_recipes/general/ptq/nvfp4_default-kv_fp8_cast.yaml index 1504f33d3c..312cdd16c8 100644 --- a/modelopt_recipes/general/ptq/nvfp4_default-kv_fp8_cast.yaml +++ b/modelopt_recipes/general/ptq/nvfp4_default-kv_fp8_cast.yaml @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Composed PTQ recipe for dynamic NVFP4 W4A4 model quantization with FP8 KV-cache cast mode. + imports: base_disable_all: configs/ptq/units/base_disable_all default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers @@ -22,8 +24,8 @@ imports: metadata: recipe_type: ptq description: >- - NVFP4 W4A4, FP8 KV cache with constant amax (skips KV calibration; amax - hardcoded to FP8 E4M3 max 448.0), max calibration. + Composes dynamic NVFP4 W4A4 model quantization with FP8 KV-cache cast mode using constant amax; + uses max calibration. quantize: algorithm: max quant_cfg: diff --git a/modelopt_recipes/general/ptq/nvfp4_default-kv_none-gptq.yaml b/modelopt_recipes/general/ptq/nvfp4_default-kv_none-gptq.yaml index 6aabb04a15..6dee51857c 100644 --- a/modelopt_recipes/general/ptq/nvfp4_default-kv_none-gptq.yaml +++ b/modelopt_recipes/general/ptq/nvfp4_default-kv_none-gptq.yaml @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Composed PTQ recipe for NVFP4 W4A4 model quantization with KV quantizers disabled and GPTQ calibration. + imports: base_disable_all: configs/ptq/units/base_disable_all default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers @@ -21,7 +23,9 @@ imports: metadata: recipe_type: ptq - description: NVFP4 weight and activation (W4A4), gptq layerwise calibration. + description: >- + Applies NVFP4 W4A4 with static weight scales, dynamic inputs, KV quantizers disabled, and GPTQ + layerwise calibration. quantize: algorithm: method: gptq diff --git a/modelopt_recipes/general/ptq/nvfp4_default-kv_nvfp4_cast.yaml b/modelopt_recipes/general/ptq/nvfp4_default-kv_nvfp4_cast.yaml index d9991e0b9c..0acdf6050d 100644 --- a/modelopt_recipes/general/ptq/nvfp4_default-kv_nvfp4_cast.yaml +++ b/modelopt_recipes/general/ptq/nvfp4_default-kv_nvfp4_cast.yaml @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Composed PTQ recipe for dynamic NVFP4 W4A4 model quantization with NVFP4 KV-cache cast mode. 
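The 448.0 constant quoted by the old descriptions is simply the largest
finite FP8 E4M3 value: with a 4-bit exponent (bias 7) and a 3-bit mantissa,
the top non-NaN code is 1.75 x 2^8 = 448, so pinning amax to 448.0 spans the
full E4M3 range while skipping KV calibration entirely.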
+ imports: base_disable_all: configs/ptq/units/base_disable_all default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers @@ -22,10 +24,8 @@ imports: metadata: recipe_type: ptq description: >- - NVFP4 W4A4, NVFP4 KV cache with constant amax (skips KV calibration; amax - hardcoded to FP8 E4M3 max 448.0 — the deployment kernel upcasts NVFP4 KV - values to FP8 before attention, so the scale must land in the FP8 range), - max calibration. + Composes dynamic NVFP4 W4A4 model quantization with NVFP4 KV-cache cast mode using constant + amax; uses max calibration. quantize: algorithm: max quant_cfg: diff --git a/modelopt_recipes/general/ptq/nvfp4_experts_only-kv_fp8.yaml b/modelopt_recipes/general/ptq/nvfp4_experts_only-kv_fp8.yaml index 547cf31286..08864c8a50 100644 --- a/modelopt_recipes/general/ptq/nvfp4_experts_only-kv_fp8.yaml +++ b/modelopt_recipes/general/ptq/nvfp4_experts_only-kv_fp8.yaml @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Composed PTQ recipe for expert-only dynamic NVFP4 quantization with FP8 KV-cache quantization. + imports: base_disable_all: configs/ptq/units/base_disable_all default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers @@ -21,7 +23,9 @@ imports: metadata: recipe_type: ptq - description: NVFP4 static weight and dynamic activation for expert layers only (W4A4), FP8 KV cache, max calibration. + description: >- + Applies dynamic NVFP4 only to expert-layer weight and input quantizers, plus FP8 KV-cache + quantization; uses max calibration. quantize: algorithm: method: max diff --git a/modelopt_recipes/general/ptq/nvfp4_experts_only_mse-kv_fp8_cast.yaml b/modelopt_recipes/general/ptq/nvfp4_experts_only_mse-kv_fp8_cast.yaml index 5db1666402..5bf9a36dc3 100644 --- a/modelopt_recipes/general/ptq/nvfp4_experts_only_mse-kv_fp8_cast.yaml +++ b/modelopt_recipes/general/ptq/nvfp4_experts_only_mse-kv_fp8_cast.yaml @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Composed PTQ recipe for expert-only NVFP4 quantization with MSE weight calibration and FP8 KV-cache cast mode. + imports: base_disable_all: configs/ptq/units/base_disable_all default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers @@ -22,7 +24,9 @@ imports: metadata: recipe_type: ptq - description: NVFP4 static weight (MSE FP8-scale sweep) and dynamic activation for expert layers only (W4A4), FP8 KV cache with constant amax. + description: >- + Applies static NVFP4 weight scales from MSE FP8-scale sweep and dynamic NVFP4 inputs to expert + layers only, plus FP8 KV-cache cast mode. quantize: algorithm: method: mse diff --git a/modelopt_recipes/general/ptq/nvfp4_mlp_only-kv_fp8.yaml b/modelopt_recipes/general/ptq/nvfp4_mlp_only-kv_fp8.yaml index 60cba464e0..a4cf71a1db 100644 --- a/modelopt_recipes/general/ptq/nvfp4_mlp_only-kv_fp8.yaml +++ b/modelopt_recipes/general/ptq/nvfp4_mlp_only-kv_fp8.yaml @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Composed PTQ recipe for MLP/MoE-only dynamic NVFP4 quantization with FP8 KV-cache quantization. + imports: base_disable_all: configs/ptq/units/base_disable_all default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers @@ -21,7 +23,9 @@ imports: metadata: recipe_type: ptq - description: NVFP4 static weight and dynamic activation for all linear layers (W4A4), FP8 KV cache, max calibration. 
+ description: >- + Applies dynamic NVFP4 only to MLP/MoE weight and input quantizers, plus FP8 KV-cache + quantization; uses max calibration. quantize: algorithm: max quant_cfg: diff --git a/modelopt_recipes/general/ptq/nvfp4_mlp_only_mse-kv_fp8_cast.yaml b/modelopt_recipes/general/ptq/nvfp4_mlp_only_mse-kv_fp8_cast.yaml index 875fb47c9b..2ea2c0ab13 100644 --- a/modelopt_recipes/general/ptq/nvfp4_mlp_only_mse-kv_fp8_cast.yaml +++ b/modelopt_recipes/general/ptq/nvfp4_mlp_only_mse-kv_fp8_cast.yaml @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Composed PTQ recipe for MLP/MoE-only NVFP4 quantization with MSE weight calibration and FP8 KV-cache cast mode. + imports: base_disable_all: configs/ptq/units/base_disable_all default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers @@ -22,7 +24,9 @@ imports: metadata: recipe_type: ptq - description: NVFP4 static weight (MSE FP8-scale sweep) and dynamic activation for MLP/MoE linear layers (W4A4), FP8 KV cache with constant amax. + description: >- + Applies static NVFP4 weight scales from MSE FP8-scale sweep and dynamic NVFP4 inputs to MLP/MoE + layers, plus FP8 KV-cache cast mode. quantize: algorithm: method: mse diff --git a/modelopt_recipes/general/ptq/nvfp4_omlp_only-kv_fp8.yaml b/modelopt_recipes/general/ptq/nvfp4_omlp_only-kv_fp8.yaml index 13c7cac079..5348e8c712 100644 --- a/modelopt_recipes/general/ptq/nvfp4_omlp_only-kv_fp8.yaml +++ b/modelopt_recipes/general/ptq/nvfp4_omlp_only-kv_fp8.yaml @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Composed PTQ recipe for output-projection and MLP/MoE dynamic NVFP4 quantization with FP8 KV-cache quantization. + imports: base_disable_all: configs/ptq/units/base_disable_all default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers @@ -21,7 +23,9 @@ imports: metadata: recipe_type: ptq - description: NVFP4 static weight and dynamic activation for all linear layers including output projections, FP8 KV cache, max calibration. + description: >- + Applies dynamic NVFP4 to output-projection and MLP/MoE weight and input quantizers, plus + FP8 KV-cache quantization; uses max calibration. quantize: algorithm: max quant_cfg: diff --git a/modelopt_recipes/general/speculative_decoding/dflash.yaml b/modelopt_recipes/general/speculative_decoding/dflash.yaml index 3d43e0fe1d..d6458a9b26 100644 --- a/modelopt_recipes/general/speculative_decoding/dflash.yaml +++ b/modelopt_recipes/general/speculative_decoding/dflash.yaml @@ -1,4 +1,4 @@ -# Base config for DFlash training. Override fields via OmegaConf dotlist on the CLI. +# DFlash speculative-decoding training recipe. Override fields via OmegaConf dotlist on the CLI. # maps to ModelArguments (main.py) model: diff --git a/modelopt_recipes/general/speculative_decoding/eagle3.yaml b/modelopt_recipes/general/speculative_decoding/eagle3.yaml index a1b7ff7770..fb9484a909 100644 --- a/modelopt_recipes/general/speculative_decoding/eagle3.yaml +++ b/modelopt_recipes/general/speculative_decoding/eagle3.yaml @@ -1,4 +1,4 @@ -# Base config for EAGLE3 training. Override fields via OmegaConf dotlist on the CLI. +# EAGLE3 speculative-decoding training recipe. Override fields via OmegaConf dotlist on the CLI. 
# maps to ModelArguments (main.py) model: diff --git a/modelopt_recipes/models/Step3.5-Flash/nvfp4-mlp-only.yaml b/modelopt_recipes/models/Step3.5-Flash/nvfp4-mlp-only.yaml index c00aff7d44..17eb0d7a71 100644 --- a/modelopt_recipes/models/Step3.5-Flash/nvfp4-mlp-only.yaml +++ b/modelopt_recipes/models/Step3.5-Flash/nvfp4-mlp-only.yaml @@ -13,9 +13,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Model-specific PTQ recipe for Step3.5-Flash NVFP4 MLP/MoE quantization with FP8 KV cache. + metadata: recipe_type: ptq - description: NVFP4 static weight and dynamic activation for MoE/MLP projections (W4A4), FP8 KV cache, max calibration. + description: >- + Step3.5-Flash PTQ recipe that enables dynamic NVFP4 on MoE/MLP weight and input quantizers, + enables FP8 KV-cache quantizers, and leaves other quantizers disabled. quantize: algorithm: max quant_cfg: From caae1fec0fe32c4da5e8f5c6afe53706e70b157d Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Fri, 15 May 2026 08:12:15 -0700 Subject: [PATCH 22/23] Move Diffusers quant configs to YAML --- examples/diffusers/quantization/config.py | 89 +++---------------- examples/diffusers/quantization/quantize.py | 26 ++++-- .../configs/ptq/presets/README.md | 4 + .../configs/ptq/presets/diffusers/fp8.yaml | 36 ++++++++ .../configs/ptq/presets/diffusers/int8.yaml | 34 +++++++ .../configs/ptq/presets/diffusers/nvfp4.yaml | 37 ++++++++ .../ptq/presets/diffusers/nvfp4_fp8_mha.yaml | 45 ++++++++++ .../models/Step3.5-Flash/nvfp4-mlp-only.yaml | 5 -- 8 files changed, 187 insertions(+), 89 deletions(-) create mode 100644 modelopt_recipes/configs/ptq/presets/diffusers/fp8.yaml create mode 100644 modelopt_recipes/configs/ptq/presets/diffusers/int8.yaml create mode 100644 modelopt_recipes/configs/ptq/presets/diffusers/nvfp4.yaml create mode 100644 modelopt_recipes/configs/ptq/presets/diffusers/nvfp4_fp8_mha.yaml diff --git a/examples/diffusers/quantization/config.py b/examples/diffusers/quantization/config.py index e15b8c7ba3..7b472565a6 100644 --- a/examples/diffusers/quantization/config.py +++ b/examples/diffusers/quantization/config.py @@ -16,82 +16,21 @@ import torch.nn as nn from calib.plugin_calib import PercentileCalibrator -FP8_DEFAULT_CONFIG = { - "quant_cfg": [ - {"quantizer_name": "*", "enable": False}, - {"quantizer_name": "*weight_quantizer", "cfg": {"num_bits": (4, 3), "axis": None}}, - {"quantizer_name": "*input_quantizer", "cfg": {"num_bits": (4, 3), "axis": None}}, - {"quantizer_name": "*output_quantizer", "enable": False}, - {"quantizer_name": "*softmax_quantizer", "cfg": {"num_bits": (4, 3), "axis": None}}, - ], - "algorithm": "max", -} +from modelopt.torch.opt.config_loader import load_config +from modelopt.torch.quantization.config import QuantizeConfig -INT8_DEFAULT_CONFIG = { - "quant_cfg": [ - {"quantizer_name": "*", "enable": False}, - {"quantizer_name": "*weight_quantizer", "cfg": {"num_bits": 8, "axis": 0}}, - {"quantizer_name": "*input_quantizer", "cfg": {"num_bits": 8, "axis": None}}, - {"quantizer_name": "*output_quantizer", "enable": False}, - ], - "algorithm": "max", -} - -NVFP4_DEFAULT_CONFIG = { - "quant_cfg": [ - {"quantizer_name": "*", "enable": False}, - { - "quantizer_name": "*weight_quantizer", - "cfg": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "axis": None, - }, - "enable": True, - }, - { - "quantizer_name": "*input_quantizer", - "cfg": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", 
"scale_bits": (4, 3)}, - "axis": None, - }, - "enable": True, - }, - {"quantizer_name": "*output_quantizer", "enable": False}, - {"quantizer_name": "*softmax_quantizer", "cfg": {"num_bits": (4, 3), "axis": None}}, - ], - "algorithm": "max", -} - -NVFP4_FP8_MHA_CONFIG = { - "quant_cfg": [ - {"quantizer_name": "*", "enable": False}, - { - "quantizer_name": "**weight_quantizer", - "cfg": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "axis": None, - }, - "enable": True, - }, - { - "quantizer_name": "**input_quantizer", - "cfg": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "axis": None, - }, - "enable": True, - }, - {"quantizer_name": "*output_quantizer", "enable": False}, - {"quantizer_name": "*[qkv]_bmm_quantizer", "cfg": {"num_bits": (4, 3), "axis": None}}, - {"quantizer_name": "*softmax_quantizer", "cfg": {"num_bits": (4, 3), "axis": None}}, - {"quantizer_name": "*bmm2_output_quantizer", "cfg": {"num_bits": (4, 3), "axis": None}}, - ], - "algorithm": {"method": "svdquant", "lowrank": 32}, -} +FP8_DEFAULT_CONFIG = load_config( + "configs/ptq/presets/diffusers/fp8", schema_type=QuantizeConfig +).model_dump(exclude_unset=True) +INT8_DEFAULT_CONFIG = load_config( + "configs/ptq/presets/diffusers/int8", schema_type=QuantizeConfig +).model_dump(exclude_unset=True) +NVFP4_DEFAULT_CONFIG = load_config( + "configs/ptq/presets/diffusers/nvfp4", schema_type=QuantizeConfig +).model_dump(exclude_unset=True) +NVFP4_FP8_MHA_CONFIG = load_config( + "configs/ptq/presets/diffusers/nvfp4_fp8_mha", schema_type=QuantizeConfig +).model_dump(exclude_unset=True) def set_quant_config_attr(quant_config, trt_high_precision_dtype, quant_algo, **kwargs): diff --git a/examples/diffusers/quantization/quantize.py b/examples/diffusers/quantization/quantize.py index 2a3c947a2d..ea580f6380 100644 --- a/examples/diffusers/quantization/quantize.py +++ b/examples/diffusers/quantization/quantize.py @@ -14,6 +14,7 @@ # limitations under the License. import argparse +import copy import logging import sys import time as time @@ -114,19 +115,13 @@ def get_quant_config(self, n_steps: int, backbone: torch.nn.Module) -> Any: """ self.logger.info(f"Building quantization config for {self.config.format.value}") + apply_int8_percentile_calibrator = False if self.config.format == QuantFormat.INT8: if self.config.algo == QuantAlgo.SMOOTHQUANT: base_cfg = mtq.INT8_SMOOTHQUANT_CFG else: base_cfg = INT8_DEFAULT_CONFIG - if self.config.collect_method != CollectMethod.DEFAULT: - reset_set_int8_config( - base_cfg, - self.config.percentile, - n_steps, - collect_method=self.config.collect_method.value, - backbone=backbone, - ) + apply_int8_percentile_calibrator = self.config.collect_method != CollectMethod.DEFAULT elif self.config.format == QuantFormat.FP8: base_cfg = FP8_DEFAULT_CONFIG elif self.config.format == QuantFormat.FP4: @@ -137,7 +132,20 @@ def get_quant_config(self, n_steps: int, backbone: torch.nn.Module) -> Any: else: raise NotImplementedError(f"Unknown format {self.config.format}") - # Build a fresh config dict so we never mutate the global constants. + # Build a fresh config dict so runtime overrides never mutate the global constants. 
+ base_cfg = copy.deepcopy(base_cfg) + if hasattr(base_cfg, "model_dump"): + base_cfg = base_cfg.model_dump(exclude_unset=True) + + if apply_int8_percentile_calibrator: + reset_set_int8_config( + base_cfg, + self.config.percentile, + n_steps, + collect_method=self.config.collect_method.value, + backbone=backbone, + ) + quant_cfg_list = list(base_cfg["quant_cfg"]) if self.config.format == QuantFormat.FP4: diff --git a/modelopt_recipes/configs/ptq/presets/README.md b/modelopt_recipes/configs/ptq/presets/README.md index b07f989ffe..9ef015cc86 100644 --- a/modelopt_recipes/configs/ptq/presets/README.md +++ b/modelopt_recipes/configs/ptq/presets/README.md @@ -25,6 +25,10 @@ own imports have been resolved. be merged on top of a `model/` preset via `$import` to produce a complete config. Example: `kv/fp8.yaml` (the YAML source of `FP8_KV_CFG`). +- **`diffusers/`** — Diffusers-specific full quantization presets. These + files are complete configs used by the Diffusers examples, including + attention and softmax quantizer choices that differ from the generic + `model/` presets. **Note:** The main purpose of these presets is to support the existing `hf_ptq.py` script's `--qformat` / `--kv_cache_qformat` flags and other diff --git a/modelopt_recipes/configs/ptq/presets/diffusers/fp8.yaml b/modelopt_recipes/configs/ptq/presets/diffusers/fp8.yaml new file mode 100644 index 0000000000..7cb89a7bfe --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/diffusers/fp8.yaml @@ -0,0 +1,36 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Diffusers FP8 preset with per-tensor E4M3 weights, inputs, and softmax quantizers. + +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + fp8: configs/numerics/fp8 + +algorithm: max +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*weight_quantizer' + cfg: + $import: fp8 + - quantizer_name: '*input_quantizer' + cfg: + $import: fp8 + - quantizer_name: '*output_quantizer' + enable: false + - quantizer_name: '*softmax_quantizer' + cfg: + $import: fp8 diff --git a/modelopt_recipes/configs/ptq/presets/diffusers/int8.yaml b/modelopt_recipes/configs/ptq/presets/diffusers/int8.yaml new file mode 100644 index 0000000000..5f3546618c --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/diffusers/int8.yaml @@ -0,0 +1,34 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Diffusers INT8 preset with per-channel weights and per-tensor inputs. + +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + int8_per_channel: configs/numerics/int8_per_channel + +algorithm: max +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*weight_quantizer' + cfg: + $import: int8_per_channel + - quantizer_name: '*input_quantizer' + cfg: + num_bits: 8 + axis: + - quantizer_name: '*output_quantizer' + enable: false diff --git a/modelopt_recipes/configs/ptq/presets/diffusers/nvfp4.yaml b/modelopt_recipes/configs/ptq/presets/diffusers/nvfp4.yaml new file mode 100644 index 0000000000..691defb2ae --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/diffusers/nvfp4.yaml @@ -0,0 +1,37 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Diffusers NVFP4 preset with dynamic E2M1 block quantization and FP8 softmax. + +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + fp8: configs/numerics/fp8 + nvfp4: configs/numerics/nvfp4 + +algorithm: max +quant_cfg: + - $import: base_disable_all + - quantizer_name: '*weight_quantizer' + cfg: + $import: nvfp4 + - quantizer_name: '*input_quantizer' + cfg: + $import: nvfp4 + - quantizer_name: '*output_quantizer' + enable: false + - quantizer_name: '*softmax_quantizer' + cfg: + $import: fp8 diff --git a/modelopt_recipes/configs/ptq/presets/diffusers/nvfp4_fp8_mha.yaml b/modelopt_recipes/configs/ptq/presets/diffusers/nvfp4_fp8_mha.yaml new file mode 100644 index 0000000000..aa081f1e29 --- /dev/null +++ b/modelopt_recipes/configs/ptq/presets/diffusers/nvfp4_fp8_mha.yaml @@ -0,0 +1,45 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Diffusers Flux preset with dynamic NVFP4 weights/inputs and FP8 attention quantizers. + +# modelopt-schema: modelopt.torch.quantization.config.QuantizeConfig +imports: + base_disable_all: configs/ptq/units/base_disable_all + fp8: configs/numerics/fp8 + nvfp4: configs/numerics/nvfp4 + +algorithm: + method: svdquant + lowrank: 32 +quant_cfg: + - $import: base_disable_all + - quantizer_name: '**weight_quantizer' + cfg: + $import: nvfp4 + - quantizer_name: '**input_quantizer' + cfg: + $import: nvfp4 + - quantizer_name: '*output_quantizer' + enable: false + - quantizer_name: '*[qkv]_bmm_quantizer' + cfg: + $import: fp8 + - quantizer_name: '*softmax_quantizer' + cfg: + $import: fp8 + - quantizer_name: '*bmm2_output_quantizer' + cfg: + $import: fp8 diff --git a/modelopt_recipes/models/Step3.5-Flash/nvfp4-mlp-only.yaml b/modelopt_recipes/models/Step3.5-Flash/nvfp4-mlp-only.yaml index 17eb0d7a71..1f541f978f 100644 --- a/modelopt_recipes/models/Step3.5-Flash/nvfp4-mlp-only.yaml +++ b/modelopt_recipes/models/Step3.5-Flash/nvfp4-mlp-only.yaml @@ -26,7 +26,6 @@ quantize: - quantizer_name: '*' enable: false - quantizer_name: '*moe*weight_quantizer' - enable: true cfg: block_sizes: -1: 16 @@ -34,7 +33,6 @@ quantize: scale_bits: e4m3 num_bits: e2m1 - quantizer_name: '*moe*input_quantizer' - enable: true cfg: block_sizes: -1: 16 @@ -42,7 +40,6 @@ quantize: scale_bits: e4m3 num_bits: e2m1 - quantizer_name: '*mlp*weight_quantizer' - enable: true cfg: block_sizes: -1: 16 @@ -50,7 +47,6 @@ quantize: scale_bits: e4m3 num_bits: e2m1 - quantizer_name: '*mlp*input_quantizer' - enable: true cfg: block_sizes: -1: 16 @@ -58,7 +54,6 @@ quantize: scale_bits: e4m3 num_bits: e2m1 - quantizer_name: '*[kv]_bmm_quantizer' - enable: true cfg: num_bits: e4m3 - quantizer_name: '*share_expert*' From aeaae95b433ae2b22080361dde68aed1bddb7d9c Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Fri, 15 May 2026 09:00:40 -0700 Subject: [PATCH 23/23] Use numerics imports in YAML configs --- modelopt_recipes/configs/numerics/int8.yaml | 20 +++++++++++++ .../configs/ptq/presets/diffusers/int8.yaml | 4 +-- .../model/fp8_2d_blockwise_weight_only.yaml | 3 +- .../model/fp8_per_channel_per_token.yaml | 5 ++-- .../model/int4_blockwise_weight_only.yaml | 5 ++-- .../configs/ptq/presets/model/int8.yaml | 4 +-- .../ptq/presets/model/int8_smoothquant.yaml | 4 +-- .../ptq/presets/model/nvfp4_fp8_mha.yaml | 11 +++---- .../ptq/presets/model/w4a8_nvfp4_fp8.yaml | 3 +- .../configs/ptq/units/kv_fp8_affine.yaml | 4 +-- .../models/Step3.5-Flash/nvfp4-mlp-only.yaml | 30 ++++++------------- 11 files changed, 52 insertions(+), 41 deletions(-) create mode 100644 modelopt_recipes/configs/numerics/int8.yaml diff --git a/modelopt_recipes/configs/numerics/int8.yaml b/modelopt_recipes/configs/numerics/int8.yaml new file mode 100644 index 0000000000..41e8835c37 --- /dev/null +++ b/modelopt_recipes/configs/numerics/int8.yaml @@ -0,0 +1,20 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# Per-tensor INT8 quantizer attributes. + +# modelopt-schema: modelopt.torch.quantization.config.QuantizerAttributeConfig +num_bits: 8 +axis: diff --git a/modelopt_recipes/configs/ptq/presets/diffusers/int8.yaml b/modelopt_recipes/configs/ptq/presets/diffusers/int8.yaml index 5f3546618c..be12d71745 100644 --- a/modelopt_recipes/configs/ptq/presets/diffusers/int8.yaml +++ b/modelopt_recipes/configs/ptq/presets/diffusers/int8.yaml @@ -19,6 +19,7 @@ imports: base_disable_all: configs/ptq/units/base_disable_all int8_per_channel: configs/numerics/int8_per_channel + int8: configs/numerics/int8 algorithm: max quant_cfg: @@ -28,7 +29,6 @@ quant_cfg: $import: int8_per_channel - quantizer_name: '*input_quantizer' cfg: - num_bits: 8 - axis: + $import: int8 - quantizer_name: '*output_quantizer' enable: false diff --git a/modelopt_recipes/configs/ptq/presets/model/fp8_2d_blockwise_weight_only.yaml b/modelopt_recipes/configs/ptq/presets/model/fp8_2d_blockwise_weight_only.yaml index 136f956288..a8d6bbb03f 100644 --- a/modelopt_recipes/configs/ptq/presets/model/fp8_2d_blockwise_weight_only.yaml +++ b/modelopt_recipes/configs/ptq/presets/model/fp8_2d_blockwise_weight_only.yaml @@ -19,13 +19,14 @@ imports: base_disable_all: configs/ptq/units/base_disable_all default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + fp8: configs/numerics/fp8 algorithm: max quant_cfg: - $import: base_disable_all - quantizer_name: '*weight_quantizer' cfg: - num_bits: e4m3 + $import: fp8 block_sizes: -1: 128 -2: 128 diff --git a/modelopt_recipes/configs/ptq/presets/model/fp8_per_channel_per_token.yaml b/modelopt_recipes/configs/ptq/presets/model/fp8_per_channel_per_token.yaml index 8c3f1d78cc..98a42f4959 100644 --- a/modelopt_recipes/configs/ptq/presets/model/fp8_per_channel_per_token.yaml +++ b/modelopt_recipes/configs/ptq/presets/model/fp8_per_channel_per_token.yaml @@ -19,17 +19,18 @@ imports: base_disable_all: configs/ptq/units/base_disable_all default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + fp8: configs/numerics/fp8 algorithm: max quant_cfg: - $import: base_disable_all - quantizer_name: '*weight_quantizer' cfg: - num_bits: e4m3 + $import: fp8 axis: 0 - quantizer_name: '*input_quantizer' cfg: - num_bits: e4m3 + $import: fp8 type: dynamic block_sizes: -1: diff --git a/modelopt_recipes/configs/ptq/presets/model/int4_blockwise_weight_only.yaml b/modelopt_recipes/configs/ptq/presets/model/int4_blockwise_weight_only.yaml index beb3f20718..f55351812c 100644 --- a/modelopt_recipes/configs/ptq/presets/model/int4_blockwise_weight_only.yaml +++ b/modelopt_recipes/configs/ptq/presets/model/int4_blockwise_weight_only.yaml @@ -19,15 +19,14 @@ imports: base_disable_all: configs/ptq/units/base_disable_all default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + int4_per_block: configs/numerics/int4_per_block algorithm: max quant_cfg: - $import: base_disable_all - quantizer_name: '*weight_quantizer' cfg: - num_bits: 4 - block_sizes: - -1: 128 + $import: int4_per_block - quantizer_name: '*input_quantizer' enable: false - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/int8.yaml b/modelopt_recipes/configs/ptq/presets/model/int8.yaml index 7610d74a0d..1bfc7b95f0 100644 --- a/modelopt_recipes/configs/ptq/presets/model/int8.yaml +++ b/modelopt_recipes/configs/ptq/presets/model/int8.yaml @@ -20,6 +20,7 @@ imports: 
base_disable_all: configs/ptq/units/base_disable_all default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers int8_per_channel: configs/numerics/int8_per_channel + int8: configs/numerics/int8 algorithm: max quant_cfg: @@ -29,6 +30,5 @@ quant_cfg: $import: int8_per_channel - quantizer_name: '*input_quantizer' cfg: - num_bits: 8 - axis: + $import: int8 - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/int8_smoothquant.yaml b/modelopt_recipes/configs/ptq/presets/model/int8_smoothquant.yaml index e560a62391..d75522bfce 100644 --- a/modelopt_recipes/configs/ptq/presets/model/int8_smoothquant.yaml +++ b/modelopt_recipes/configs/ptq/presets/model/int8_smoothquant.yaml @@ -20,6 +20,7 @@ imports: base_disable_all: configs/ptq/units/base_disable_all default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers int8_per_channel: configs/numerics/int8_per_channel + int8: configs/numerics/int8 algorithm: smoothquant quant_cfg: @@ -29,6 +30,5 @@ quant_cfg: $import: int8_per_channel - quantizer_name: '*input_quantizer' cfg: - num_bits: 8 - axis: + $import: int8 - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/presets/model/nvfp4_fp8_mha.yaml b/modelopt_recipes/configs/ptq/presets/model/nvfp4_fp8_mha.yaml index fadb7b9bbc..f862d637a4 100644 --- a/modelopt_recipes/configs/ptq/presets/model/nvfp4_fp8_mha.yaml +++ b/modelopt_recipes/configs/ptq/presets/model/nvfp4_fp8_mha.yaml @@ -19,6 +19,7 @@ imports: base_disable_all: configs/ptq/units/base_disable_all nvfp4: configs/numerics/nvfp4 + fp8: configs/numerics/fp8 algorithm: max quant_cfg: @@ -33,16 +34,16 @@ quant_cfg: enable: false - quantizer_name: '*q_bmm_quantizer' cfg: - num_bits: e4m3 + $import: fp8 - quantizer_name: '*k_bmm_quantizer' cfg: - num_bits: e4m3 + $import: fp8 - quantizer_name: '*v_bmm_quantizer' cfg: - num_bits: e4m3 + $import: fp8 - quantizer_name: '*softmax_quantizer' cfg: - num_bits: e4m3 + $import: fp8 - quantizer_name: 'transformer_blocks*bmm2_output_quantizer' cfg: - num_bits: e4m3 + $import: fp8 diff --git a/modelopt_recipes/configs/ptq/presets/model/w4a8_nvfp4_fp8.yaml b/modelopt_recipes/configs/ptq/presets/model/w4a8_nvfp4_fp8.yaml index 86b335cbc1..9b7e541abc 100644 --- a/modelopt_recipes/configs/ptq/presets/model/w4a8_nvfp4_fp8.yaml +++ b/modelopt_recipes/configs/ptq/presets/model/w4a8_nvfp4_fp8.yaml @@ -20,6 +20,7 @@ imports: base_disable_all: configs/ptq/units/base_disable_all default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers nvfp4_bs32: configs/numerics/nvfp4_bs32 + fp8: configs/numerics/fp8 algorithm: max quant_cfg: @@ -29,5 +30,5 @@ quant_cfg: $import: nvfp4_bs32 - quantizer_name: '*input_quantizer' cfg: - num_bits: e4m3 + $import: fp8 - $import: default_disabled_quantizers diff --git a/modelopt_recipes/configs/ptq/units/kv_fp8_affine.yaml b/modelopt_recipes/configs/ptq/units/kv_fp8_affine.yaml index 5458e5511c..5276aff2d4 100644 --- a/modelopt_recipes/configs/ptq/units/kv_fp8_affine.yaml +++ b/modelopt_recipes/configs/ptq/units/kv_fp8_affine.yaml @@ -18,12 +18,12 @@ # modelopt-schema: modelopt.torch.quantization.config.QuantizerCfgListConfig imports: kv_fp8: configs/ptq/units/kv_fp8 + fp8: configs/numerics/fp8 --- - $import: kv_fp8 - quantizer_name: '*[kv]_bmm_quantizer' cfg: - num_bits: e4m3 - axis: + $import: fp8 bias: -2: -4: diff --git a/modelopt_recipes/models/Step3.5-Flash/nvfp4-mlp-only.yaml b/modelopt_recipes/models/Step3.5-Flash/nvfp4-mlp-only.yaml index 1f541f978f..d0adbe0047 100644 
--- a/modelopt_recipes/models/Step3.5-Flash/nvfp4-mlp-only.yaml +++ b/modelopt_recipes/models/Step3.5-Flash/nvfp4-mlp-only.yaml @@ -15,6 +15,10 @@ # Model-specific PTQ recipe for Step3.5-Flash NVFP4 MLP/MoE quantization with FP8 KV cache. +imports: + fp8: configs/numerics/fp8 + nvfp4: configs/numerics/nvfp4 + metadata: recipe_type: ptq description: >- @@ -27,35 +31,19 @@ quantize: enable: false - quantizer_name: '*moe*weight_quantizer' cfg: - block_sizes: - -1: 16 - type: dynamic - scale_bits: e4m3 - num_bits: e2m1 + $import: nvfp4 - quantizer_name: '*moe*input_quantizer' cfg: - block_sizes: - -1: 16 - type: dynamic - scale_bits: e4m3 - num_bits: e2m1 + $import: nvfp4 - quantizer_name: '*mlp*weight_quantizer' cfg: - block_sizes: - -1: 16 - type: dynamic - scale_bits: e4m3 - num_bits: e2m1 + $import: nvfp4 - quantizer_name: '*mlp*input_quantizer' cfg: - block_sizes: - -1: 16 - type: dynamic - scale_bits: e4m3 - num_bits: e2m1 + $import: nvfp4 - quantizer_name: '*[kv]_bmm_quantizer' cfg: - num_bits: e4m3 + $import: fp8 - quantizer_name: '*share_expert*' enable: false - quantizer_name: '*moe.gate.*'
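
For reference, a sketch of the shared `configs/numerics/nvfp4` snippet that the
hunks above import. That file is not part of this series, so its exact contents
are an assumption here, reconstructed from the inline quantizer attributes the
hunks remove and patterned after `configs/numerics/int8.yaml`:

    # Assumed contents of modelopt_recipes/configs/numerics/nvfp4.yaml
    # (reconstruction, not authoritative): dynamic NVFP4 block quantization.
    # modelopt-schema: modelopt.torch.quantization.config.QuantizerAttributeConfig
    num_bits: e2m1
    block_sizes:
      -1: 16
      type: dynamic
      scale_bits: e4m3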