From 6e456b0d44b92c1b6c42fbe2cebfe7cd3abc30f9 Mon Sep 17 00:00:00 2001 From: Chenhan Yu Date: Sat, 18 Apr 2026 15:27:31 -0700 Subject: [PATCH 01/12] fix: PTQ 1GPU, export PP divisibility, hidden states conversations key - megatron_lm_ptq.yaml: Qwen3-8B to single GPU for L40 clusters - quantize.sh: auto-find largest PP dividing model num_hidden_layers for export (Qwen3-8B has 36 layers, not divisible by 8) - compute_hidden_states_trtllm.py: use messages with conversations fallback (matching the HF version) Co-Authored-By: Claude Opus 4.6 (1M context) Signed-off-by: Chenhan Yu --- .../compute_hidden_states_trtllm.py | 2 +- .../common/megatron_lm/quantize/quantize.sh | 17 ++++++++++++++--- .../examples/Qwen/Qwen3-8B/megatron_lm_ptq.yaml | 16 ++++++++-------- 3 files changed, 23 insertions(+), 12 deletions(-) diff --git a/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_trtllm.py b/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_trtllm.py index 0bf68e430f..06531a1677 100644 --- a/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_trtllm.py +++ b/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_trtllm.py @@ -256,7 +256,7 @@ async def submit_generates(): for entry in dataset: conversation_id = entry.get("conversation_id", entry.get("uuid")) - conversations = entry["conversations"] + conversations = entry.get("messages") or entry.get("conversations") if not conversations or not isinstance(conversations, list): num_invalid += 1 continue diff --git a/tools/launcher/common/megatron_lm/quantize/quantize.sh b/tools/launcher/common/megatron_lm/quantize/quantize.sh index 1bb0d60e80..407a674378 100755 --- a/tools/launcher/common/megatron_lm/quantize/quantize.sh +++ b/tools/launcher/common/megatron_lm/quantize/quantize.sh @@ -41,11 +41,22 @@ TP=${TP:-1} PP=${PP:-1} EP=${EP:-1} ETP=${ETP:-1} ${QUANTIZE_EXE} ${MLM_MODEL_CF export MLM_EXTRA_ARGS="--mmlu-dataset 
${MMLU_DATASET:-/hf-local/cais/mmlu} --fraction 0.01 --lower-bound ${MMLU_LOWER_BOUND:-0.38} --disable-tqdm" TP=${TP:-1} PP=${PP:-1} EP=${EP:-1} ETP=${ETP:-1} MLM_MODEL_CKPT=${MLM_MODEL_SAVE} ${MMLU_EXE} ${MLM_MODEL_CFG} -# Export quantized checkpoint to HF format (PP=all GPUs) +# Export quantized checkpoint to HF format +# Use largest PP <= total GPUs that divides the model's num_hidden_layers TOTAL_GPUS=$(python3 -c "import torch; print(torch.cuda.device_count())" 2>/dev/null || echo ${NUM_GPUS:-1}) -echo "=== Exporting ${MLM_MODEL_CFG} ${QUANT_CFG} (PP=${TOTAL_GPUS}) ===" +EXPORT_PP=$(python3 -c " +import json, os +cfg = os.path.join('${HF_MODEL_CKPT}', 'config.json') +n_layers = json.load(open(cfg)).get('num_hidden_layers', 1) if os.path.exists(cfg) else 1 +gpus = ${TOTAL_GPUS} +pp = gpus +while pp > 1 and n_layers % pp != 0: + pp -= 1 +print(pp) +" 2>/dev/null || echo ${TOTAL_GPUS}) +echo "=== Exporting ${MLM_MODEL_CFG} ${QUANT_CFG} (PP=${EXPORT_PP}, ${TOTAL_GPUS} GPUs) ===" export MLM_EXTRA_ARGS= -TP=1 PP=${TOTAL_GPUS} EP=1 ETP=1 MLM_MODEL_CKPT=${MLM_MODEL_SAVE} ${EXPORT_EXE} ${MLM_MODEL_CFG} +TP=1 PP=${EXPORT_PP} EP=1 ETP=1 MLM_MODEL_CKPT=${MLM_MODEL_SAVE} ${EXPORT_EXE} ${MLM_MODEL_CFG} ls ${EXPORT_DIR} cat ${EXPORT_DIR}/hf_quant_config.json diff --git a/tools/launcher/examples/Qwen/Qwen3-8B/megatron_lm_ptq.yaml b/tools/launcher/examples/Qwen/Qwen3-8B/megatron_lm_ptq.yaml index 33b9da18e6..ff55a92e39 100644 --- a/tools/launcher/examples/Qwen/Qwen3-8B/megatron_lm_ptq.yaml +++ b/tools/launcher/examples/Qwen/Qwen3-8B/megatron_lm_ptq.yaml @@ -24,7 +24,7 @@ pipeline: config: model: Qwen/Qwen3-8B quant_cfg: NVFP4_DEFAULT_CFG - tp: 8 + tp: 1 calib_dataset: abisee/cnn_dailymail calib_size: 32 mmlu_dataset: cais/mmlu @@ -33,15 +33,15 @@ pipeline: slurm_config: _factory_: "slurm_factory" nodes: 1 - ntasks_per_node: 8 - gpus_per_node: 8 + ntasks_per_node: 1 + gpus_per_node: 1 task_1: _target_: common.megatron_lm.quantize.task.MegatronLMQuantizeTask config: model: 
Qwen/Qwen3-8B quant_cfg: FP8_DEFAULT_CFG - tp: 8 + tp: 1 calib_dataset: abisee/cnn_dailymail calib_size: 32 mmlu_dataset: cais/mmlu @@ -50,18 +50,18 @@ pipeline: slurm_config: _factory_: "slurm_factory" nodes: 1 - ntasks_per_node: 8 - gpus_per_node: 8 + ntasks_per_node: 1 + gpus_per_node: 1 # Step 3: TRT-LLM eval MMLU on all exported checkpoints task_2: script: common/tensorrt_llm/eval.sh environment: - HF_MODEL_CKPT: /scratchspace/export - - TP: "8" + - TP: "1" - EP: "1" slurm_config: _factory_: "slurm_factory" nodes: 1 ntasks_per_node: 1 - gpus_per_node: 8 + gpus_per_node: 1 From 48d0a378e5c68039d9853b6d261c1ddc2c4de8f5 Mon Sep 17 00:00:00 2001 From: Hung-Yueh Chiang Date: Wed, 22 Apr 2026 05:39:47 +0000 Subject: [PATCH 02/12] add nvfp4-w4a16 support Signed-off-by: Hung-Yueh Chiang --- examples/llm_ptq/hf_ptq.py | 27 +++++++++++++++++++ .../llm_ptq/scripts/huggingface_example.sh | 14 ++++++++-- modelopt/torch/export/convert_hf_config.py | 13 +++++++++ modelopt/torch/export/model_config.py | 1 + modelopt/torch/export/quant_utils.py | 13 +++++++++ modelopt/torch/export/unified_export_hf.py | 3 +++ modelopt/torch/quantization/config.py | 2 ++ 7 files changed, 71 insertions(+), 2 deletions(-) diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index 831d230a67..e44e6bcf9a 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -119,6 +119,7 @@ def _set_kv_cache_constant_amax(quant_cfg: list) -> None: "nvfp4_svdquant": mtq.NVFP4_SVDQUANT_DEFAULT_CFG, "mxfp8": mtq.MXFP8_DEFAULT_CFG, "nvfp4_local_hessian": mtq.NVFP4_W4A4_WEIGHT_LOCAL_HESSIAN_CFG, + "nvfp4_w4a16": mtq.NVFP4_W4A16_CFG, } KV_QUANT_CFG_CHOICES = { @@ -781,6 +782,12 @@ def export_quantized( extra_state_dict=mtp_state_dict, ) + if args.qformat == "nvfp4_w4a16": + warnings.warn( + "TensorRT-LLM and SGLang do not support this format. " + "To serve on vLLM, convert the NVFP4 W4A16 checkpoint to compressed-tensors format." 
+ ) + # Restore default padding and export the tokenizer as well. if tokenizer is not None: tokenizer.padding_side = default_padding_side @@ -1106,6 +1113,15 @@ def quantize_main( quant_cfg["quant_cfg"].append({"quantizer_name": pattern, "enable": False}) print(f"Excluding MTP layer from quantization: {pattern}") + # Apply user-requested per-module exclusions (--exclude_modules). + if args.exclude_modules: + quant_cfg = copy.deepcopy(quant_cfg) + for mod in args.exclude_modules: + quant_cfg["quant_cfg"].append( + {"quantizer_name": f"*{mod}*_quantizer", "enable": False} + ) + print(f"Excluding module from quantization: {mod}") + # Use constant amax for KV quantizers when a cast format is selected. if args.kv_cache_qformat in _KV_CAST_FORMATS: quant_cfg = copy.deepcopy(quant_cfg) @@ -1268,6 +1284,17 @@ def parse_args() -> argparse.Namespace: default=False, action="store_true", ) + parser.add_argument( + "--exclude_modules", + nargs="+", + default=[], + metavar="MODULE", + help=( + "Module name patterns to exclude from quantization " + "(e.g. lm_head backbone.layers.0.mixer). " + "Appends a disable rule for each pattern's weight and input quantizers." 
+ ), + ) parser.add_argument( "--gpu_max_mem_percentage", help=( diff --git a/examples/llm_ptq/scripts/huggingface_example.sh b/examples/llm_ptq/scripts/huggingface_example.sh index d9c4ff8a7a..a24c99cd43 100755 --- a/examples/llm_ptq/scripts/huggingface_example.sh +++ b/examples/llm_ptq/scripts/huggingface_example.sh @@ -53,9 +53,9 @@ esac IFS="," for qformat in $QFORMAT; do case $qformat in - fp8 | fp8_pc_pt | fp8_pb_wo | int8_wo | int8_sq | int4_awq | w4a8_awq | fp16 | bf16 | nvfp4 | nvfp4_awq | nvfp4_mse | w4a8_nvfp4_fp8 | w4a8_mxfp4_fp8 | nvfp4_experts_only | nvfp4_mlp_only | nvfp4_omlp_only | nvfp4_svdquant | mxfp8 | nvfp4_local_hessian) ;; + fp8 | fp8_pc_pt | fp8_pb_wo | int8_wo | int8_sq | int4_awq | w4a8_awq | fp16 | bf16 | nvfp4 | nvfp4_awq | nvfp4_mse | w4a8_nvfp4_fp8 | w4a8_mxfp4_fp8 | nvfp4_experts_only | nvfp4_mlp_only | nvfp4_omlp_only | nvfp4_svdquant | mxfp8 | nvfp4_local_hessian | nvfp4_w4a16) ;; *) - echo "Unknown quant argument: Expected one of: [fp8, fp8_pc_pt, fp8_pb_wo, int8_wo, int8_sq, int4_awq, w4a8_awq, fp16, bf16, nvfp4, nvfp4_awq, nvfp4_mse, w4a8_nvfp4_fp8, w4a8_mxfp4_fp8, nvfp4_experts_only, nvfp4_mlp_only, nvfp4_omlp_only, nvfp4_svdquant, mxfp8, nvfp4_local_hessian]" >&2 + echo "Unknown quant argument: Expected one of: [fp8, fp8_pc_pt, fp8_pb_wo, int8_wo, int8_sq, int4_awq, w4a8_awq, fp16, bf16, nvfp4, nvfp4_awq, nvfp4_mse, w4a8_nvfp4_fp8, w4a8_mxfp4_fp8, nvfp4_experts_only, nvfp4_mlp_only, nvfp4_omlp_only, nvfp4_svdquant, mxfp8, nvfp4_local_hessian, nvfp4_w4a16]" >&2 exit 1 ;; esac @@ -127,6 +127,10 @@ if $TRUST_REMOTE_CODE; then PTQ_ARGS+=" --trust_remote_code " fi +if [ -n "${EXCLUDE_MODULES:-}" ]; then + PTQ_ARGS+=" --exclude_modules ${EXCLUDE_MODULES} " +fi + if $USE_SEQ_DEVICE_MAP; then PTQ_ARGS+=" --use_seq_device_map " fi @@ -199,6 +203,12 @@ if [[ $TASKS =~ "quant" ]] || [[ ! -d "$SAVE_PATH" ]] || [[ ! 
$(ls -A $SAVE_PATH exit 0 fi + if [ "$QFORMAT" = "nvfp4_w4a16" ]; then + echo "nvfp4_w4a16 checkpoint exported to $SAVE_PATH" + echo "To serve on vLLM, convert to compressed-tensors" + exit 0 + fi + if [[ "$QFORMAT" == *"nvfp4"* ]] || [[ "$KV_CACHE_QUANT" == *"nvfp4"* ]]; then cuda_major=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader -i 0 | cut -d. -f1) diff --git a/modelopt/torch/export/convert_hf_config.py b/modelopt/torch/export/convert_hf_config.py index 5f8c3f3b55..93528b86b3 100644 --- a/modelopt/torch/export/convert_hf_config.py +++ b/modelopt/torch/export/convert_hf_config.py @@ -57,6 +57,11 @@ def _quant_algo_to_group_config(quant_algo: str, group_size: int | None = None) return { "weights": {"dynamic": False, "num_bits": 4, "type": "int", "group_size": gs}, } + elif quant_algo == "NVFP4_W4A16": + gs = group_size or 16 + return { + "weights": {"dynamic": False, "num_bits": 4, "type": "float", "group_size": gs}, + } elif quant_algo in ("NVFP4_AWQ", "W4A8_AWQ"): gs = group_size or 128 return { @@ -183,6 +188,14 @@ def convert_hf_quant_config_format(input_config: dict[str, Any]) -> dict[str, An "targets": ["Linear"], } new_config["config_groups"] = {"group_0": config_group_details} + elif quant_algo_value == "NVFP4_W4A16": + # Weight-only FP4 + group_size = original_quantization_details.get("group_size", 16) + config_group_details = { + "weights": {"dynamic": False, "num_bits": 4, "type": "float", "group_size": group_size}, + "targets": ["Linear"], + } + new_config["config_groups"] = {"group_0": config_group_details} elif quant_algo_value == "MIXED_PRECISION": quantized_layers = original_quantization_details.get("quantized_layers", {}) diff --git a/modelopt/torch/export/model_config.py b/modelopt/torch/export/model_config.py index dce39767c7..ba4220a40d 100755 --- a/modelopt/torch/export/model_config.py +++ b/modelopt/torch/export/model_config.py @@ -39,6 +39,7 @@ QUANTIZATION_MXFP8 = "mxfp8" QUANTIZATION_W4A8_MXFP4_FP8 = "w4a8_mxfp4_fp8" 
QUANTIZATION_NVFP4_AWQ = "nvfp4_awq" +QUANTIZATION_NVFP4_W4A16 = "nvfp4_w4a16" # weight-only FP4 QUANTIZATION_FP8_PB_REAL = "fp8_pb_real" QUANTIZATION_FP8_PB_WO = "fp8_pb_wo" QUANTIZATION_FP8_PC_PT = "fp8_pc_pt" diff --git a/modelopt/torch/export/quant_utils.py b/modelopt/torch/export/quant_utils.py index 4ceb51cd2c..0a5a73b2cb 100755 --- a/modelopt/torch/export/quant_utils.py +++ b/modelopt/torch/export/quant_utils.py @@ -65,6 +65,7 @@ QUANTIZATION_NVFP4, QUANTIZATION_NVFP4_AWQ, QUANTIZATION_NVFP4_SVDQUANT, + QUANTIZATION_NVFP4_W4A16, QUANTIZATION_W4A8_AWQ, QUANTIZATION_W4A8_MXFP4_FP8, QUANTIZATION_W4A8_NVFP4_FP8, @@ -358,6 +359,7 @@ def get_weight_scaling_factor(module: nn.Module, weight_name: str = "weight") -> QUANTIZATION_NVFP4, QUANTIZATION_NVFP4_AWQ, QUANTIZATION_NVFP4_SVDQUANT, + QUANTIZATION_NVFP4_W4A16, QUANTIZATION_W4A8_NVFP4_FP8, ]: # Calibrate weight quantizer if amax is not set @@ -402,6 +404,7 @@ def get_weight_scaling_factor_2(module: nn.Module, weight_name: str = "weight") QUANTIZATION_NVFP4, QUANTIZATION_NVFP4_AWQ, QUANTIZATION_NVFP4_SVDQUANT, + QUANTIZATION_NVFP4_W4A16, QUANTIZATION_W4A8_NVFP4_FP8, ]: # Calibrate weight quantizer if amax is not set @@ -636,6 +639,10 @@ def _get_quantization_from_layer(layer, quantizer_attr_names: QuantizerAttrNames return QUANTIZATION_NVFP4_AWQ if getattr(layer, "fused_with_prequant", False): return QUANTIZATION_NVFP4_AWQ + # W4A16 weight-only: input_quantizer absent or disabled + if input_quantizer is None or not input_quantizer.is_enabled: + if scale_bits == (4, 3): + return QUANTIZATION_NVFP4_W4A16 assert input_quantizer is not None, ( f"input_quantizer is None for {quantizer_attr_names}" ) @@ -803,6 +810,11 @@ def process_layer_quant_config(layer_config_dict): "quant_algo": "NVFP4", "group_size": block_size_value, } + elif v == "nvfp4_w4a16": + layer_config = { + "quant_algo": "NVFP4_W4A16", + "group_size": block_size_value, + } elif v == "nvfp4_awq": layer_config = { "quant_algo": "NVFP4_AWQ", @@ -980,6 
+992,7 @@ def to_quantized_weight( if quantization in [ QUANTIZATION_NVFP4, QUANTIZATION_NVFP4_AWQ, + QUANTIZATION_NVFP4_W4A16, QUANTIZATION_W4A8_NVFP4_FP8, QUANTIZATION_NVFP4_SVDQUANT, ]: diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index 22d87e303f..00590c4c46 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -84,6 +84,7 @@ QUANTIZATION_NVFP4, QUANTIZATION_NVFP4_AWQ, QUANTIZATION_NVFP4_SVDQUANT, + QUANTIZATION_NVFP4_W4A16, QUANTIZATION_W4A8_AWQ, QUANTIZATION_W4A8_NVFP4_FP8, ) @@ -520,6 +521,7 @@ def _export_quantized_weight( QUANTIZATION_NVFP4_AWQ, QUANTIZATION_NVFP4_SVDQUANT, QUANTIZATION_NVFP4, + QUANTIZATION_NVFP4_W4A16, QUANTIZATION_W4A8_AWQ, QUANTIZATION_W4A8_NVFP4_FP8, ]: @@ -548,6 +550,7 @@ def _export_quantized_weight( QUANTIZATION_NVFP4, QUANTIZATION_NVFP4_AWQ, QUANTIZATION_NVFP4_SVDQUANT, + QUANTIZATION_NVFP4_W4A16, ]: # Transpose weight from (num_experts, input_dim, output_dim) to (num_experts, output_dim, input_dim) # for NVFP4 quantization functions that expect input_dim as the last dimension for block quantization diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py index 3f24ac09a4..066687da7c 100644 --- a/modelopt/torch/quantization/config.py +++ b/modelopt/torch/quantization/config.py @@ -794,6 +794,7 @@ def _nvfp4_selective_quant_cfg( NVFP4_EXPERTS_ONLY_CFG = _nvfp4_selective_quant_cfg(["*mlp.experts*", "*block_sparse_moe*"]) NVFP4_MLP_ONLY_CFG = _nvfp4_selective_quant_cfg(["*mlp*", "*block_sparse_moe*"]) NVFP4_OMLP_ONLY_CFG = _nvfp4_selective_quant_cfg(["*o_proj*", "*mlp*", "*block_sparse_moe*"]) +NVFP4_W4A16_CFG = _nvfp4_selective_quant_cfg(["*"], weight_only=True) # DO NOT ADD NEW CONFIGS HERE. 
If you want to add a new general recipe, add it to # modelopt_recipes/general/ptq/ as a yaml file @@ -828,6 +829,7 @@ def _nvfp4_selective_quant_cfg( "NVFP4_MLP_ONLY_CFG", "NVFP4_EXPERTS_ONLY_CFG", "NVFP4_OMLP_ONLY_CFG", + "NVFP4_W4A16_CFG", "MAMBA_MOE_NVFP4_CONSERVATIVE_CFG", "MAMBA_MOE_NVFP4_AGGRESSIVE_CFG", "MAMBA_MOE_FP8_CONSERVATIVE_CFG", From 2ee082535f12de0b3e2deed70fd030f663fc345b Mon Sep 17 00:00:00 2001 From: Hung-Yueh Chiang Date: Wed, 22 Apr 2026 06:09:26 +0000 Subject: [PATCH 03/12] update CHANGELOG.rst and add test for nvfp4 w4a16 Signed-off-by: Hung-Yueh Chiang --- CHANGELOG.rst | 1 + .../torch/export/test_unified_hf_export_and_check_safetensors.py | 1 + 2 files changed, 2 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 20a677d0a0..81abb5d800 100755 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -6,6 +6,7 @@ Changelog **New Features** +- Add NVFP4 W4A16 weight-only quantization (``nvfp4_w4a16``): FP4 weights with group_size=16, BF16 activations, no calibration forward pass required. Use ``mtq.NVFP4_W4A16_CFG`` or ``--qformat nvfp4_w4a16`` in ``hf_ptq.py``. Exported checkpoints can be served on vLLM after conversion to compressed-tensors format (see ``examples/llm_ptq/scripts/huggingface_example.sh``). Example with Qwen3-8B: - Support full Transformer Engine spec for Minitron pruning (``mcore_minitron``). Now we no longer need to use custom ModelOpt spec. Note that this does not affect the usage of the pruning workflow but makes pruning slightly faster and may result in slightly different pruned model because of different kernel and numerics. - Add Puzzletron - a new algorithm for heterogeneous pruning of LLM and VLM models. See `examples/puzzletron/README.md `_ for more details. - Added iterator interface using CalibrationDataReader in ONNX quantization workflow. 
diff --git a/tests/gpu/torch/export/test_unified_hf_export_and_check_safetensors.py b/tests/gpu/torch/export/test_unified_hf_export_and_check_safetensors.py index 8bdf3f5e65..638aee0899 100644 --- a/tests/gpu/torch/export/test_unified_hf_export_and_check_safetensors.py +++ b/tests/gpu/torch/export/test_unified_hf_export_and_check_safetensors.py @@ -47,6 +47,7 @@ ("w4a8_awq", "tiny_llama-w4a8-awq", True, False, True, True, False), ("int8_wo", "tiny_llama-int8-wo", False, False, False, False, False), ("nvfp4_svdquant", "tiny_llama-nvfp4-svdquant", True, False, True, True, True), + ("nvfp4_w4a16", "tiny_llama-nvfp4-w4a16", False, False, False, False, False), # MoE models (fused experts: Qwen3 MoE, GPT-OSS) ("nvfp4", "tiny_qwen3_moe-nvfp4", True, False, True, True, False), ("fp8", "tiny_gpt_oss-fp8", True, False, True, True, False), From 0e8815d3f20db7eff92e5e451edbfefe5f7b246a Mon Sep 17 00:00:00 2001 From: Hung-Yueh Chiang Date: Wed, 22 Apr 2026 06:13:25 +0000 Subject: [PATCH 04/12] tiny fix on CHANGELOG.rst Signed-off-by: Hung-Yueh Chiang --- CHANGELOG.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 81abb5d800..3e2563c30c 100755 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -6,7 +6,7 @@ Changelog **New Features** -- Add NVFP4 W4A16 weight-only quantization (``nvfp4_w4a16``): FP4 weights with group_size=16, BF16 activations, no calibration forward pass required. Use ``mtq.NVFP4_W4A16_CFG`` or ``--qformat nvfp4_w4a16`` in ``hf_ptq.py``. Exported checkpoints can be served on vLLM after conversion to compressed-tensors format (see ``examples/llm_ptq/scripts/huggingface_example.sh``). Example with Qwen3-8B: +- Add NVFP4 W4A16 weight-only quantization (``nvfp4_w4a16``): FP4 weights with group_size=16, BF16 activations, no calibration forward pass required. Use ``mtq.NVFP4_W4A16_CFG`` or ``--qformat nvfp4_w4a16`` in ``hf_ptq.py``. 
Exported checkpoints can be served on vLLM after conversion to compressed-tensors format - Support full Transformer Engine spec for Minitron pruning (``mcore_minitron``). Now we no longer need to use custom ModelOpt spec. Note that this does not affect the usage of the pruning workflow but makes pruning slightly faster and may result in slightly different pruned model because of different kernel and numerics. - Add Puzzletron - a new algorithm for heterogeneous pruning of LLM and VLM models. See `examples/puzzletron/README.md `_ for more details. - Added iterator interface using CalibrationDataReader in ONNX quantization workflow. From dc506c61db6e57682108b66b04271b55ad9d5905 Mon Sep 17 00:00:00 2001 From: Hung-Yueh Chiang Date: Wed, 22 Apr 2026 06:14:31 +0000 Subject: [PATCH 05/12] tiny fix on CHANGELOG.rst Signed-off-by: Hung-Yueh Chiang --- CHANGELOG.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 3e2563c30c..d2d3cdeb24 100755 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -6,7 +6,7 @@ Changelog **New Features** -- Add NVFP4 W4A16 weight-only quantization (``nvfp4_w4a16``): FP4 weights with group_size=16, BF16 activations, no calibration forward pass required. Use ``mtq.NVFP4_W4A16_CFG`` or ``--qformat nvfp4_w4a16`` in ``hf_ptq.py``. Exported checkpoints can be served on vLLM after conversion to compressed-tensors format +- Add NVFP4 W4A16 weight-only quantization (``nvfp4_w4a16``): FP4 weights with group_size=16, BF16 activations, no calibration forward pass required. Use ``mtq.NVFP4_W4A16_CFG`` or ``--qformat nvfp4_w4a16`` in ``hf_ptq.py``. Exported checkpoints can be served on vLLM after conversion to compressed-tensors format. - Support full Transformer Engine spec for Minitron pruning (``mcore_minitron``). Now we no longer need to use custom ModelOpt spec. 
Note that this does not affect the usage of the pruning workflow but makes pruning slightly faster and may result in slightly different pruned model because of different kernel and numerics. - Add Puzzletron - a new algorithm for heterogeneous pruning of LLM and VLM models. See `examples/puzzletron/README.md `_ for more details. - Added iterator interface using CalibrationDataReader in ONNX quantization workflow. From d8611e281cd15fc1adc56e8835f1682c677fbd7e Mon Sep 17 00:00:00 2001 From: Hung-Yueh Chiang Date: Wed, 22 Apr 2026 06:27:33 +0000 Subject: [PATCH 06/12] f*{mod}*_quantizer -> f*{mod}*.weight_quantizer and f*{mod}*.input_quantizer Signed-off-by: Hung-Yueh Chiang --- examples/llm_ptq/hf_ptq.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index e44e6bcf9a..f24545929f 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -1118,7 +1118,10 @@ def quantize_main( quant_cfg = copy.deepcopy(quant_cfg) for mod in args.exclude_modules: quant_cfg["quant_cfg"].append( - {"quantizer_name": f"*{mod}*_quantizer", "enable": False} + {"quantizer_name": f"*{mod}*.weight_quantizer", "enable": False} + ) + quant_cfg["quant_cfg"].append( + {"quantizer_name": f"*{mod}*.input_quantizer", "enable": False} ) print(f"Excluding module from quantization: {mod}") From c3b884a0f864a7c81bd0f4fcf4542fca6292406d Mon Sep 17 00:00:00 2001 From: Hung-Yueh Chiang Date: Wed, 29 Apr 2026 16:20:32 +0000 Subject: [PATCH 07/12] rm --exclude_modules Signed-off-by: Hung-Yueh Chiang --- examples/llm_ptq/hf_ptq.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index df0c3596e1..1e65367764 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -1261,17 +1261,6 @@ def parse_args() -> argparse.Namespace: default=False, action="store_true", ) - parser.add_argument( - "--exclude_modules", - nargs="+", - default=[], 
- metavar="MODULE", - help=( - "Module name patterns to exclude from quantization " - "(e.g. lm_head backbone.layers.0.mixer). " - "Appends a disable rule for each pattern's weight and input quantizers." - ), - ) parser.add_argument( "--gpu_max_mem_percentage", help=( From 75ff6b2a965a0c0d4affd027775715a17ee857d0 Mon Sep 17 00:00:00 2001 From: Hung-Yueh Chiang Date: Wed, 29 Apr 2026 16:22:19 +0000 Subject: [PATCH 08/12] rm --exclude_modules Signed-off-by: Hung-Yueh Chiang --- examples/llm_ptq/hf_ptq.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index 1e65367764..80056eea74 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -1080,18 +1080,6 @@ def quantize_main( quant_cfg["quant_cfg"].append({"quantizer_name": pattern, "enable": False}) print(f"Excluding MTP layer from quantization: {pattern}") - # Apply user-requested per-module exclusions (--exclude_modules). - if args.exclude_modules: - quant_cfg = copy.deepcopy(quant_cfg) - for mod in args.exclude_modules: - quant_cfg["quant_cfg"].append( - {"quantizer_name": f"*{mod}*.weight_quantizer", "enable": False} - ) - quant_cfg["quant_cfg"].append( - {"quantizer_name": f"*{mod}*.input_quantizer", "enable": False} - ) - print(f"Excluding module from quantization: {mod}") - # Use constant amax for KV quantizers when a cast format is selected. 
# Recipes are authoritative for KV cache config (including use_constant_amax), # so skip this post-hoc override when --recipe is used; rely on the YAML instead From 0fede961d72aea5aaefdf365a16bea82bdd8c660 Mon Sep 17 00:00:00 2001 From: Hung-Yueh Chiang Date: Tue, 5 May 2026 22:25:49 +0000 Subject: [PATCH 09/12] nvfp4_w4a16 -> w4a16_nvfp4 Signed-off-by: Hung-Yueh Chiang --- CHANGELOG.rst | 2 +- examples/llm_ptq/hf_ptq.py | 4 ++-- examples/llm_ptq/scripts/huggingface_example.sh | 8 ++++---- modelopt/torch/export/convert_hf_config.py | 4 ++-- modelopt/torch/export/model_config.py | 2 +- modelopt/torch/export/quant_utils.py | 14 +++++++------- modelopt/torch/export/unified_export_hf.py | 6 +++--- modelopt/torch/quantization/config.py | 4 ++-- ...test_unified_hf_export_and_check_safetensors.py | 2 +- tools/launcher/core.py | 3 +++ 10 files changed, 26 insertions(+), 23 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 7a5cc9efc1..1264c5dbe2 100755 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -27,7 +27,7 @@ Changelog **New Features** -- Add NVFP4 W4A16 weight-only quantization (``nvfp4_w4a16``): FP4 weights with group_size=16, BF16 activations, no calibration forward pass required. Use ``mtq.NVFP4_W4A16_CFG`` or ``--qformat nvfp4_w4a16`` in ``hf_ptq.py``. Exported checkpoints can be served on vLLM after conversion to compressed-tensors format. +- Add NVFP4 W4A16 weight-only quantization (``w4a16_nvfp4``): FP4 weights with group_size=16, BF16 activations, no calibration forward pass required. Use ``mtq.W4A16_NVFP4_CFG`` or ``--qformat w4a16_nvfp4`` in ``hf_ptq.py``. Exported checkpoints can be served on vLLM after conversion to compressed-tensors format. - Support full Transformer Engine spec for Minitron pruning (``mcore_minitron``). Now we no longer need to use custom ModelOpt spec. 
Note that this does not affect the usage of the pruning workflow but makes pruning slightly faster and may result in slightly different pruned model because of different kernel and numerics. - Add end-to-end tutorial for Minitron pruning + distillation + quantization + evaluation + vLLM deployment for Nemotron-Nano-9B-v2 → Pruned 7B along with data blend preparation steps (and ablation study). See `examples/pruning/minitron/README.md `_ for details. - Add Puzzletron - a new algorithm for heterogeneous pruning of LLM and VLM models. See `examples/puzzletron/README.md `_ for more details. diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index 476e15dc0b..5448684bac 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -113,6 +113,7 @@ def _set_kv_cache_constant_amax(quant_cfg: list) -> None: "fp8_pb_wo": mtq.FP8_2D_BLOCKWISE_WEIGHT_ONLY_CFG, "fp8_pc_pt": mtq.FP8_PER_CHANNEL_PER_TOKEN_CFG, "w4a8_nvfp4_fp8": mtq.W4A8_NVFP4_FP8_CFG, + "w4a16_nvfp4": mtq.W4A16_NVFP4_CFG, "w4a8_mxfp4_fp8": mtq.W4A8_MXFP4_FP8_CFG, "nvfp4_mlp_only": mtq.NVFP4_MLP_ONLY_CFG, "nvfp4_experts_only": mtq.NVFP4_EXPERTS_ONLY_CFG, @@ -120,7 +121,6 @@ def _set_kv_cache_constant_amax(quant_cfg: list) -> None: "nvfp4_svdquant": mtq.NVFP4_SVDQUANT_DEFAULT_CFG, "mxfp8": mtq.MXFP8_DEFAULT_CFG, "nvfp4_local_hessian": mtq.NVFP4_W4A4_WEIGHT_LOCAL_HESSIAN_CFG, - "nvfp4_w4a16": mtq.NVFP4_W4A16_CFG, } KV_QUANT_CFG_CHOICES = { @@ -786,7 +786,7 @@ def export_quantized( extra_state_dict=mtp_state_dict, ) - if args.qformat == "nvfp4_w4a16": + if args.qformat == "w4a16_nvfp4": warnings.warn( "TensorRT-LLM and SGLang do not support this format. " "To serve on vLLM, convert the NVFP4 W4A16 checkpoint to compressed-tensors format." 
diff --git a/examples/llm_ptq/scripts/huggingface_example.sh b/examples/llm_ptq/scripts/huggingface_example.sh index 93e0d72512..2f80146bcb 100755 --- a/examples/llm_ptq/scripts/huggingface_example.sh +++ b/examples/llm_ptq/scripts/huggingface_example.sh @@ -53,9 +53,9 @@ esac IFS="," for qformat in $QFORMAT; do case $qformat in - fp8 | fp8_pc_pt | fp8_pb_wo | int8_wo | int8_sq | int4_awq | w4a8_awq | fp16 | bf16 | nvfp4 | nvfp4_awq | nvfp4_mse | w4a8_nvfp4_fp8 | w4a8_mxfp4_fp8 | nvfp4_experts_only | nvfp4_mlp_only | nvfp4_omlp_only | nvfp4_svdquant | mxfp8 | nvfp4_local_hessian | nvfp4_w4a16) ;; + fp8 | fp8_pc_pt | fp8_pb_wo | int8_wo | int8_sq | int4_awq | w4a8_awq | fp16 | bf16 | nvfp4 | nvfp4_awq | nvfp4_mse | w4a8_nvfp4_fp8 | w4a8_mxfp4_fp8 | nvfp4_experts_only | nvfp4_mlp_only | nvfp4_omlp_only | nvfp4_svdquant | mxfp8 | nvfp4_local_hessian | w4a16_nvfp4) ;; *) - echo "Unknown quant argument: Expected one of: [fp8, fp8_pc_pt, fp8_pb_wo, int8_wo, int8_sq, int4_awq, w4a8_awq, fp16, bf16, nvfp4, nvfp4_awq, nvfp4_mse, w4a8_nvfp4_fp8, w4a8_mxfp4_fp8, nvfp4_experts_only, nvfp4_mlp_only, nvfp4_omlp_only, nvfp4_svdquant, mxfp8, nvfp4_local_hessian, nvfp4_w4a16]" >&2 + echo "Unknown quant argument: Expected one of: [fp8, fp8_pc_pt, fp8_pb_wo, int8_wo, int8_sq, int4_awq, w4a8_awq, fp16, bf16, nvfp4, nvfp4_awq, nvfp4_mse, w4a8_nvfp4_fp8, w4a8_mxfp4_fp8, nvfp4_experts_only, nvfp4_mlp_only, nvfp4_omlp_only, nvfp4_svdquant, mxfp8, nvfp4_local_hessian, w4a16_nvfp4]" >&2 exit 1 ;; esac @@ -207,8 +207,8 @@ if [[ $TASKS =~ "quant" ]] || [[ ! -d "$SAVE_PATH" ]] || [[ ! 
$(ls -A $SAVE_PATH exit 0 fi - if [ "$QFORMAT" = "nvfp4_w4a16" ]; then - echo "nvfp4_w4a16 checkpoint exported to $SAVE_PATH" + if [ "$QFORMAT" = "w4a16_nvfp4" ]; then + echo "w4a16_nvfp4 checkpoint exported to $SAVE_PATH" echo "To serve on vLLM, convert to compressed-tensors" exit 0 fi diff --git a/modelopt/torch/export/convert_hf_config.py b/modelopt/torch/export/convert_hf_config.py index 93528b86b3..06e5923a30 100644 --- a/modelopt/torch/export/convert_hf_config.py +++ b/modelopt/torch/export/convert_hf_config.py @@ -57,7 +57,7 @@ def _quant_algo_to_group_config(quant_algo: str, group_size: int | None = None) return { "weights": {"dynamic": False, "num_bits": 4, "type": "int", "group_size": gs}, } - elif quant_algo == "NVFP4_W4A16": + elif quant_algo == "W4A16_NVFP4": gs = group_size or 16 return { "weights": {"dynamic": False, "num_bits": 4, "type": "float", "group_size": gs}, @@ -188,7 +188,7 @@ def convert_hf_quant_config_format(input_config: dict[str, Any]) -> dict[str, An "targets": ["Linear"], } new_config["config_groups"] = {"group_0": config_group_details} - elif quant_algo_value == "NVFP4_W4A16": + elif quant_algo_value == "W4A16_NVFP4": # Weight-only FP4 group_size = original_quantization_details.get("group_size", 16) config_group_details = { diff --git a/modelopt/torch/export/model_config.py b/modelopt/torch/export/model_config.py index ba4220a40d..308a18daee 100755 --- a/modelopt/torch/export/model_config.py +++ b/modelopt/torch/export/model_config.py @@ -38,8 +38,8 @@ QUANTIZATION_MXFP4 = "mxfp4" QUANTIZATION_MXFP8 = "mxfp8" QUANTIZATION_W4A8_MXFP4_FP8 = "w4a8_mxfp4_fp8" +QUANTIZATION_W4A16_NVFP4 = "w4a16_nvfp4" # weight-only FP4 QUANTIZATION_NVFP4_AWQ = "nvfp4_awq" -QUANTIZATION_NVFP4_W4A16 = "nvfp4_w4a16" # weight-only FP4 QUANTIZATION_FP8_PB_REAL = "fp8_pb_real" QUANTIZATION_FP8_PB_WO = "fp8_pb_wo" QUANTIZATION_FP8_PC_PT = "fp8_pc_pt" diff --git a/modelopt/torch/export/quant_utils.py b/modelopt/torch/export/quant_utils.py index 
66bc1fe3a4..7fde63cc83 100755 --- a/modelopt/torch/export/quant_utils.py +++ b/modelopt/torch/export/quant_utils.py @@ -66,7 +66,7 @@ QUANTIZATION_NVFP4, QUANTIZATION_NVFP4_AWQ, QUANTIZATION_NVFP4_SVDQUANT, - QUANTIZATION_NVFP4_W4A16, + QUANTIZATION_W4A16_NVFP4, QUANTIZATION_W4A8_AWQ, QUANTIZATION_W4A8_MXFP4_FP8, QUANTIZATION_W4A8_NVFP4_FP8, @@ -360,7 +360,7 @@ def get_weight_scaling_factor(module: nn.Module, weight_name: str = "weight") -> QUANTIZATION_NVFP4, QUANTIZATION_NVFP4_AWQ, QUANTIZATION_NVFP4_SVDQUANT, - QUANTIZATION_NVFP4_W4A16, + QUANTIZATION_W4A16_NVFP4, QUANTIZATION_W4A8_NVFP4_FP8, ]: # Calibrate weight quantizer if amax is not set @@ -405,7 +405,7 @@ def get_weight_scaling_factor_2(module: nn.Module, weight_name: str = "weight") QUANTIZATION_NVFP4, QUANTIZATION_NVFP4_AWQ, QUANTIZATION_NVFP4_SVDQUANT, - QUANTIZATION_NVFP4_W4A16, + QUANTIZATION_W4A16_NVFP4, QUANTIZATION_W4A8_NVFP4_FP8, ]: # Calibrate weight quantizer if amax is not set @@ -647,7 +647,7 @@ def _get_quantization_from_layer(layer, quantizer_attr_names: QuantizerAttrNames # W4A16 weight-only: input_quantizer absent or disabled if input_quantizer is None or not input_quantizer.is_enabled: if scale_bits == (4, 3): - return QUANTIZATION_NVFP4_W4A16 + return QUANTIZATION_W4A16_NVFP4 assert input_quantizer is not None, ( f"input_quantizer is None for {quantizer_attr_names}" ) @@ -815,9 +815,9 @@ def process_layer_quant_config(layer_config_dict): "quant_algo": "NVFP4", "group_size": block_size_value, } - elif v == "nvfp4_w4a16": + elif v == "w4a16_nvfp4": layer_config = { - "quant_algo": "NVFP4_W4A16", + "quant_algo": "W4A16_NVFP4", "group_size": block_size_value, } elif v == "nvfp4_awq": @@ -997,7 +997,7 @@ def to_quantized_weight( if quantization in [ QUANTIZATION_NVFP4, QUANTIZATION_NVFP4_AWQ, - QUANTIZATION_NVFP4_W4A16, + QUANTIZATION_W4A16_NVFP4, QUANTIZATION_W4A8_NVFP4_FP8, QUANTIZATION_NVFP4_SVDQUANT, ]: diff --git a/modelopt/torch/export/unified_export_hf.py 
b/modelopt/torch/export/unified_export_hf.py index 95ab8cfdd8..0ab945ed1d 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -80,7 +80,7 @@ QUANTIZATION_NVFP4, QUANTIZATION_NVFP4_AWQ, QUANTIZATION_NVFP4_SVDQUANT, - QUANTIZATION_NVFP4_W4A16, + QUANTIZATION_W4A16_NVFP4, QUANTIZATION_W4A8_AWQ, QUANTIZATION_W4A8_NVFP4_FP8, ) @@ -518,7 +518,7 @@ def _export_quantized_weight( QUANTIZATION_NVFP4_AWQ, QUANTIZATION_NVFP4_SVDQUANT, QUANTIZATION_NVFP4, - QUANTIZATION_NVFP4_W4A16, + QUANTIZATION_W4A16_NVFP4, QUANTIZATION_W4A8_AWQ, QUANTIZATION_W4A8_NVFP4_FP8, ]: @@ -548,7 +548,7 @@ def _export_quantized_weight( QUANTIZATION_NVFP4, QUANTIZATION_NVFP4_AWQ, QUANTIZATION_NVFP4_SVDQUANT, - QUANTIZATION_NVFP4_W4A16, + QUANTIZATION_W4A16_NVFP4, ]: # Transpose weight from (num_experts, input_dim, output_dim) to (num_experts, output_dim, input_dim) # for NVFP4 quantization functions that expect input_dim as the last dimension for block quantization diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py index 0bdf95f441..c96fcde3d8 100644 --- a/modelopt/torch/quantization/config.py +++ b/modelopt/torch/quantization/config.py @@ -773,6 +773,7 @@ def _nvfp4_selective_quant_cfg( ], "algorithm": "max", } +W4A16_NVFP4_CFG = _nvfp4_selective_quant_cfg(["*"], weight_only=True) MXFP4_MLP_WEIGHT_ONLY_CFG = { "quant_cfg": [ @@ -804,7 +805,6 @@ def _nvfp4_selective_quant_cfg( ) NVFP4_MLP_ONLY_CFG = _nvfp4_selective_quant_cfg(["*mlp*", "*block_sparse_moe*", "*.experts.*"]) NVFP4_OMLP_ONLY_CFG = _nvfp4_selective_quant_cfg(["*o_proj*", "*mlp*", "*block_sparse_moe*"]) -NVFP4_W4A16_CFG = _nvfp4_selective_quant_cfg(["*"], weight_only=True) # DO NOT ADD NEW CONFIGS HERE. 
If you want to add a new general recipe, add it to # modelopt_recipes/general/ptq/ as a yaml file @@ -839,7 +839,7 @@ def _nvfp4_selective_quant_cfg( "NVFP4_MLP_ONLY_CFG", "NVFP4_EXPERTS_ONLY_CFG", "NVFP4_OMLP_ONLY_CFG", - "NVFP4_W4A16_CFG", + "W4A16_NVFP4_CFG", "MAMBA_MOE_NVFP4_CONSERVATIVE_CFG", "MAMBA_MOE_NVFP4_AGGRESSIVE_CFG", "MAMBA_MOE_FP8_CONSERVATIVE_CFG", diff --git a/tests/gpu/torch/export/test_unified_hf_export_and_check_safetensors.py b/tests/gpu/torch/export/test_unified_hf_export_and_check_safetensors.py index 638aee0899..6e0c56bfd1 100644 --- a/tests/gpu/torch/export/test_unified_hf_export_and_check_safetensors.py +++ b/tests/gpu/torch/export/test_unified_hf_export_and_check_safetensors.py @@ -47,7 +47,7 @@ ("w4a8_awq", "tiny_llama-w4a8-awq", True, False, True, True, False), ("int8_wo", "tiny_llama-int8-wo", False, False, False, False, False), ("nvfp4_svdquant", "tiny_llama-nvfp4-svdquant", True, False, True, True, True), - ("nvfp4_w4a16", "tiny_llama-nvfp4-w4a16", False, False, False, False, False), + ("w4a16_nvfp4", "tiny_llama-w4a16-nvfp4", False, False, False, False, False), # MoE models (fused experts: Qwen3 MoE, GPT-OSS) ("nvfp4", "tiny_qwen3_moe-nvfp4", True, False, True, True, False), ("fp8", "tiny_gpt_oss-fp8", True, False, True, True, False), diff --git a/tools/launcher/core.py b/tools/launcher/core.py index 8fd4e25ee7..bcade6e750 100644 --- a/tools/launcher/core.py +++ b/tools/launcher/core.py @@ -50,6 +50,9 @@ def get_default_env(experiment_title=None): "HF_HOME": f"/{title}/hf-cache", "HF_TOKEN": os.getenv("HF_TOKEN", ""), "MLM_SKIP_INSTALL": "1", + # DockerExecutor runs as the host UID, which may not be in the container's + # /etc/passwd; setting USER prevents getpass.getuser() from calling pwd.getpwuid(). 
+ "USER": os.getenv("USER", "docker"), } return slurm_env, local_env From 2cc323083cae6c92ea32d2dab38d841960a6081a Mon Sep 17 00:00:00 2001 From: Hung-Yueh Chiang Date: Wed, 6 May 2026 01:17:27 +0000 Subject: [PATCH 10/12] fix vllm comments Signed-off-by: Hung-Yueh Chiang --- CHANGELOG.rst | 2 +- examples/llm_ptq/hf_ptq.py | 2 +- modelopt/torch/export/quant_utils.py | 2 +- modelopt/torch/export/unified_export_hf.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 1264c5dbe2..69334f9a11 100755 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -27,7 +27,7 @@ Changelog **New Features** -- Add NVFP4 W4A16 weight-only quantization (``w4a16_nvfp4``): FP4 weights with group_size=16, BF16 activations, no calibration forward pass required. Use ``mtq.W4A16_NVFP4_CFG`` or ``--qformat w4a16_nvfp4`` in ``hf_ptq.py``. Exported checkpoints can be served on vLLM after conversion to compressed-tensors format. +- Add NVFP4 W4A16 weight-only quantization (``w4a16_nvfp4``): FP4 weights with group_size=16, BF16 activations, no calibration forward pass required. Use ``mtq.W4A16_NVFP4_CFG`` or ``--qformat w4a16_nvfp4`` in ``hf_ptq.py``. vLLM deployment support is in progress. - Support full Transformer Engine spec for Minitron pruning (``mcore_minitron``). Now we no longer need to use custom ModelOpt spec. Note that this does not affect the usage of the pruning workflow but makes pruning slightly faster and may result in slightly different pruned model because of different kernel and numerics. - Add end-to-end tutorial for Minitron pruning + distillation + quantization + evaluation + vLLM deployment for Nemotron-Nano-9B-v2 → Pruned 7B along with data blend preparation steps (and ablation study). See `examples/pruning/minitron/README.md `_ for details. - Add Puzzletron - a new algorithm for heterogeneous pruning of LLM and VLM models. See `examples/puzzletron/README.md `_ for more details. 
diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index 5448684bac..8e4a814431 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -789,7 +789,7 @@ def export_quantized( if args.qformat == "w4a16_nvfp4": warnings.warn( "TensorRT-LLM and SGLang do not support this format. " - "To serve on vLLM, convert the NVFP4 W4A16 checkpoint to compressed-tensors format." + "vLLM deployment support is in progress." ) # Restore default padding and export the tokenizer as well. diff --git a/modelopt/torch/export/quant_utils.py b/modelopt/torch/export/quant_utils.py index 7fde63cc83..95d9e288a2 100755 --- a/modelopt/torch/export/quant_utils.py +++ b/modelopt/torch/export/quant_utils.py @@ -66,10 +66,10 @@ QUANTIZATION_NVFP4, QUANTIZATION_NVFP4_AWQ, QUANTIZATION_NVFP4_SVDQUANT, - QUANTIZATION_W4A16_NVFP4, QUANTIZATION_W4A8_AWQ, QUANTIZATION_W4A8_MXFP4_FP8, QUANTIZATION_W4A8_NVFP4_FP8, + QUANTIZATION_W4A16_NVFP4, ) logger = logging.getLogger(__name__) diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index 0ab945ed1d..2068c24438 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -80,9 +80,9 @@ QUANTIZATION_NVFP4, QUANTIZATION_NVFP4_AWQ, QUANTIZATION_NVFP4_SVDQUANT, - QUANTIZATION_W4A16_NVFP4, QUANTIZATION_W4A8_AWQ, QUANTIZATION_W4A8_NVFP4_FP8, + QUANTIZATION_W4A16_NVFP4, ) from .model_utils import get_language_model_from_vl, is_multimodal_model from .moe_utils import _export_fused_experts From 8594574d8effd3f0211cf53a314380629d0d6487 Mon Sep 17 00:00:00 2001 From: Hung-Yueh Chiang Date: Wed, 6 May 2026 22:39:13 +0000 Subject: [PATCH 11/12] make huggingface_example.sh and parser.sh support recipes Signed-off-by: Hung-Yueh Chiang --- .../llm_ptq/scripts/huggingface_example.sh | 19 +++++++++++++++++-- examples/llm_ptq/scripts/parser.sh | 4 ++-- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git 
a/examples/llm_ptq/scripts/huggingface_example.sh b/examples/llm_ptq/scripts/huggingface_example.sh index 2f80146bcb..f0730f6643 100755 --- a/examples/llm_ptq/scripts/huggingface_example.sh +++ b/examples/llm_ptq/scripts/huggingface_example.sh @@ -62,6 +62,11 @@ for qformat in $QFORMAT; do done IFS=" " +if [ -n "$RECIPE" ] && [ -n "$QFORMAT" ]; then + echo "Error: --recipe and --quant are mutually exclusive." >&2 + exit 1 +fi + script_dir="$(dirname "$(readlink -f "$0")")" pushd $script_dir/.. @@ -72,7 +77,12 @@ fi QFORMAT_MODIFIED="${QFORMAT//,/_}" -MODEL_NAME=$(basename $MODEL_PATH | sed 's/[^0-9a-zA-Z\-]/_/g')_${QFORMAT_MODIFIED}${KV_CACHE_QUANT:+_kv_${KV_CACHE_QUANT}} +if [ -n "$RECIPE" ]; then + RECIPE_LABEL=$(basename "$RECIPE" .yaml | sed 's/[^0-9a-zA-Z\-]/_/g') + MODEL_NAME=$(basename $MODEL_PATH | sed 's/[^0-9a-zA-Z\-]/_/g')_${RECIPE_LABEL} +else + MODEL_NAME=$(basename $MODEL_PATH | sed 's/[^0-9a-zA-Z\-]/_/g')_${QFORMAT_MODIFIED}${KV_CACHE_QUANT:+_kv_${KV_CACHE_QUANT}} +fi SAVE_PATH=${ROOT_SAVE_PATH}/saved_models_${MODEL_NAME} @@ -181,11 +191,16 @@ if [[ $TASKS =~ "quant" ]] || [[ ! -d "$SAVE_PATH" ]] || [[ ! $(ls -A $SAVE_PATH if [[ "$MODEL_CONFIG_EXIST" == false ]]; then echo "Quantizing original model..." 
+ if [ -n "$RECIPE" ]; then + QUANT_ARG="--recipe=$RECIPE" + else + QUANT_ARG="--qformat=${QFORMAT// /,}" + fi python hf_ptq.py \ --pyt_ckpt_path=$MODEL_PATH \ --export_path=$SAVE_PATH \ --sparsity_fmt=$SPARSITY_FMT \ - --qformat="${QFORMAT// /,}" \ + $QUANT_ARG \ --calib_size=$CALIB_SIZE \ --batch_size=$CALIB_BATCH_SIZE \ --inference_tensor_parallel=$TP \ diff --git a/examples/llm_ptq/scripts/parser.sh b/examples/llm_ptq/scripts/parser.sh index 3817c1dee7..09896bef7f 100644 --- a/examples/llm_ptq/scripts/parser.sh +++ b/examples/llm_ptq/scripts/parser.sh @@ -99,8 +99,8 @@ parse_options() { fi # Verify required options are provided - if [ -z "$MODEL_PATH" ] || [ -z "$QFORMAT" ] || [ -z "$TASKS" ]; then - echo "Usage: $0 --model= --quant= --tasks=" + if [ -z "$MODEL_PATH" ] || [ -z "$TASKS" ] || { [ -z "$QFORMAT" ] && [ -z "$RECIPE" ]; }; then + echo "Usage: $0 --model= (--quant= | --recipe=) --tasks=" echo "Optional args: --sparsity= --awq_block_size= --calib=" exit 1 fi From 9808e01bc03b613bc11bb84bb5fb820e4d1f38b0 Mon Sep 17 00:00:00 2001 From: Hung-Yueh Chiang Date: Wed, 6 May 2026 22:40:57 +0000 Subject: [PATCH 12/12] add w4a16_nvfp4 recipes Signed-off-by: Hung-Yueh Chiang --- .../configs/ptq/units/w4a16_nvfp4.yaml | 24 +++++++++++++++ .../ptq/nvfp4_weight_only-kv_fp16.yaml | 29 +++++++++++++++++++ 2 files changed, 53 insertions(+) create mode 100644 modelopt_recipes/configs/ptq/units/w4a16_nvfp4.yaml create mode 100644 modelopt_recipes/general/ptq/nvfp4_weight_only-kv_fp16.yaml diff --git a/modelopt_recipes/configs/ptq/units/w4a16_nvfp4.yaml b/modelopt_recipes/configs/ptq/units/w4a16_nvfp4.yaml new file mode 100644 index 0000000000..b4676dbff3 --- /dev/null +++ b/modelopt_recipes/configs/ptq/units/w4a16_nvfp4.yaml @@ -0,0 +1,24 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# W4A16 NVFP4: NVFP4 E2M1 dynamic weight quantizer only; activations remain in BF16. + +# modelopt-schema: modelopt.torch.quantization.config.QuantizerCfgListConfig +imports: + nvfp4: configs/numerics/nvfp4 +--- + - quantizer_name: '*weight_quantizer' + cfg: + $import: nvfp4 diff --git a/modelopt_recipes/general/ptq/nvfp4_weight_only-kv_fp16.yaml b/modelopt_recipes/general/ptq/nvfp4_weight_only-kv_fp16.yaml new file mode 100644 index 0000000000..03ee1b2236 --- /dev/null +++ b/modelopt_recipes/general/ptq/nvfp4_weight_only-kv_fp16.yaml @@ -0,0 +1,29 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +imports: + base_disable_all: configs/ptq/units/base_disable_all + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + w4a16_nvfp4: configs/ptq/units/w4a16_nvfp4 + +metadata: + recipe_type: ptq + description: NVFP4 W4A16 weight-only, BF16 activations, max calibration. No calibration forward pass required. +quantize: + algorithm: max + quant_cfg: + - $import: base_disable_all + - $import: w4a16_nvfp4 + - $import: default_disabled_quantizers