From 7af2716ac9299e36e24e755ee524083d3598994e Mon Sep 17 00:00:00 2001
From: Chenjie Luo
Date: Thu, 7 May 2026 11:54:11 -0700
Subject: [PATCH 1/3] [Recipes][LLM PTQ] Add nvfp4_experts_only_mse-fp8_cast_kv
 recipe + --recipe support in scripts

- Add modelopt_recipes/general/ptq/nvfp4_experts_only_mse-fp8_cast_kv.yaml,
  combining experts-only NVFP4 W4A4 with the MSE FP8 scale-sweep weight
  calibration (algorithm: mse, fp8_scale_sweep: true; expert weight blocks
  use nvfp4_static so the static FP8 sweep applies) and FP8 KV cache via
  the kv_fp8_cast unit (use_constant_amax: true).
- examples/llm_ptq/scripts: thread a new --recipe flag through parser.sh
  and huggingface_example.sh. Either --quant or --recipe is required;
  passing both is an error. When --recipe is used, the script derives
  MODEL_NAME from the recipe basename, forwards the recipe path to
  hf_ptq.py via --recipe, and exits after export with a TRT-LLM
  deployment hint (recipes can produce arbitrary configs).
- Drop the qformat case-statement whitelist in huggingface_example.sh;
  let hf_ptq.py be the single source of truth for valid qformats /
  recipes.

Signed-off-by: Chenjie Luo
---
 .../llm_ptq/scripts/huggingface_example.sh         | 36 ++++++++------
 examples/llm_ptq/scripts/parser.sh                 | 16 +++++--
 .../nvfp4_experts_only_mse-fp8_cast_kv.yaml        | 48 +++++++++++++++++++
 3 files changed, 82 insertions(+), 18 deletions(-)
 create mode 100644 modelopt_recipes/general/ptq/nvfp4_experts_only_mse-fp8_cast_kv.yaml

diff --git a/examples/llm_ptq/scripts/huggingface_example.sh b/examples/llm_ptq/scripts/huggingface_example.sh
index 6ca99c7f963..693506929d9 100755
--- a/examples/llm_ptq/scripts/huggingface_example.sh
+++ b/examples/llm_ptq/scripts/huggingface_example.sh
@@ -49,18 +49,7 @@ dense | sparsegpt) ;;
     ;;
 esac
 
-#Iterate over list of qformats provided and check if they are valid
-IFS=","
-for qformat in $QFORMAT; do
-    case $qformat in
-        fp8 | fp8_pc_pt | fp8_pb_wo | int8_wo | int8_sq | int4_awq | w4a8_awq | fp16 | bf16 | nvfp4 | nvfp4_awq | nvfp4_mse | w4a8_nvfp4_fp8 | w4a8_mxfp4_fp8 | nvfp4_experts_only | nvfp4_mlp_only | nvfp4_omlp_only | nvfp4_svdquant | mxfp8 | nvfp4_local_hessian) ;;
-        *)
-            echo "Unknown quant argument: Expected one of: [fp8, fp8_pc_pt, fp8_pb_wo, int8_wo, int8_sq, int4_awq, w4a8_awq, fp16, bf16, nvfp4, nvfp4_awq, nvfp4_mse, w4a8_nvfp4_fp8, w4a8_mxfp4_fp8, nvfp4_experts_only, nvfp4_mlp_only, nvfp4_omlp_only, nvfp4_svdquant, mxfp8, nvfp4_local_hessian]" >&2
-            exit 1
-            ;;
-    esac
-done
-IFS=" "
+# Quant format / recipe validation is delegated to hf_ptq.py.
 
 script_dir="$(dirname "$(readlink -f "$0")")"
 
@@ -72,7 +61,14 @@ fi
 
 QFORMAT_MODIFIED="${QFORMAT//,/_}"
 
-MODEL_NAME=$(basename $MODEL_PATH | sed 's/[^0-9a-zA-Z\-]/_/g')_${QFORMAT_MODIFIED}${KV_CACHE_QUANT:+_kv_${KV_CACHE_QUANT}}
+# When using --recipe, build the model name from the recipe basename (without
+# directory or .yaml suffix) so each recipe gets its own SAVE_PATH.
+if [ -n "$RECIPE" ]; then
+    RECIPE_TAG=$(basename "$RECIPE" .yaml | sed 's/[^0-9a-zA-Z\-]/_/g')
+    MODEL_NAME=$(basename $MODEL_PATH | sed 's/[^0-9a-zA-Z\-]/_/g')_recipe_${RECIPE_TAG}
+else
+    MODEL_NAME=$(basename $MODEL_PATH | sed 's/[^0-9a-zA-Z\-]/_/g')_${QFORMAT_MODIFIED}${KV_CACHE_QUANT:+_kv_${KV_CACHE_QUANT}}
+fi
 
 SAVE_PATH=${ROOT_SAVE_PATH}/saved_models_${MODEL_NAME}
 
@@ -177,11 +173,16 @@ if [[ $TASKS =~ "quant" ]] || [[ ! -d "$SAVE_PATH" ]] || [[ ! $(ls -A $SAVE_PATH
 
     if [[ "$MODEL_CONFIG_EXIST" == false ]]; then
         echo "Quantizing original model..."
+        if [ -n "$RECIPE" ]; then
+            QUANT_SPEC_ARGS="--recipe=$RECIPE"
+        else
+            QUANT_SPEC_ARGS="--qformat=${QFORMAT// /,}"
+        fi
         python hf_ptq.py \
             --pyt_ckpt_path=$MODEL_PATH \
             --export_path=$SAVE_PATH \
             --sparsity_fmt=$SPARSITY_FMT \
-            --qformat="${QFORMAT// /,}" \
+            $QUANT_SPEC_ARGS \
             --calib_size=$CALIB_SIZE \
             --batch_size=$CALIB_BATCH_SIZE \
             --inference_tensor_parallel=$TP \
@@ -203,7 +204,7 @@ if [[ $TASKS =~ "quant" ]] || [[ ! -d "$SAVE_PATH" ]] || [[ ! $(ls -A $SAVE_PATH
         exit 0
     fi
 
-    if [[ "$QFORMAT" == *"nvfp4"* ]] || [[ "$KV_CACHE_QUANT" == *"nvfp4"* ]]; then
+    if [[ "$QFORMAT" == *"nvfp4"* ]] || [[ "$KV_CACHE_QUANT" == *"nvfp4"* ]] || [[ "$RECIPE" == *"nvfp4"* ]]; then
         cuda_major=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader -i 0 | cut -d. -f1)
 
         if [ "$cuda_major" -lt 10 ]; then
@@ -212,6 +213,11 @@ if [[ $TASKS =~ "quant" ]] || [[ ! -d "$SAVE_PATH" ]] || [[ ! $(ls -A $SAVE_PATH
         fi
     fi
 
+    if [ -n "$RECIPE" ]; then
+        echo "Recipe $RECIPE used. Please deploy with TensorRT-LLM directly. Checkpoint export_path: $SAVE_PATH"
+        exit 0
+    fi
+
     if [[ ! " fp8 nvfp4 bf16 fp16 " =~ " ${QFORMAT} " ]]; then
         echo "Quant $QFORMAT specified. Please read TensorRT-LLM quantization support matrix https://nvidia.github.io/TensorRT-LLM/features/quantization.html#quantization-in-tensorrt-llm and use TensorRT-LLM for deployment. Checkpoint export_path: $SAVE_PATH"
         exit 0
diff --git a/examples/llm_ptq/scripts/parser.sh b/examples/llm_ptq/scripts/parser.sh
index 3817c1dee7c..2a9a28b3566 100644
--- a/examples/llm_ptq/scripts/parser.sh
+++ b/examples/llm_ptq/scripts/parser.sh
@@ -20,6 +20,7 @@ parse_options() {
     # Default values
     MODEL_PATH=""
     QFORMAT=""
+    RECIPE=""
     KV_CACHE_QUANT=""
     TP=1
     PP=1
@@ -37,13 +38,14 @@ parse_options() {
     CAST_MXFP4_TO_NVFP4=false
 
     # Parse command-line options
-    ARGS=$(getopt -o "" -l "model:,quant:,kv_cache_quant:,tp:,pp:,sparsity:,awq_block_size:,calib:,calib_batch_size:,auto_quantize_bits:,output:,batch:,tasks:,lm_eval_tasks:,lm_eval_limit:,simple_eval_tasks:,trust_remote_code,use_seq_device_map,gpu_max_mem_percentage:,kv_cache_free_gpu_memory_fraction:,low_memory_mode,no-verbose,calib_dataset:,calib_seq:,auto_quantize_method:,auto_quantize_score_size:,auto_quantize_checkpoint:,moe_calib_experts_ratio:,cast_mxfp4_to_nvfp4" -n "$0" -- "$@")
+    ARGS=$(getopt -o "" -l "model:,quant:,recipe:,kv_cache_quant:,tp:,pp:,sparsity:,awq_block_size:,calib:,calib_batch_size:,auto_quantize_bits:,output:,batch:,tasks:,lm_eval_tasks:,lm_eval_limit:,simple_eval_tasks:,trust_remote_code,use_seq_device_map,gpu_max_mem_percentage:,kv_cache_free_gpu_memory_fraction:,low_memory_mode,no-verbose,calib_dataset:,calib_seq:,auto_quantize_method:,auto_quantize_score_size:,auto_quantize_checkpoint:,moe_calib_experts_ratio:,cast_mxfp4_to_nvfp4" -n "$0" -- "$@")
     eval set -- "$ARGS"
 
     while true; do
         case "$1" in
             --model ) MODEL_PATH="$2"; shift 2;;
             --quant ) QFORMAT="$2"; shift 2;;
+            --recipe ) RECIPE="$2"; shift 2;;
             --kv_cache_quant ) KV_CACHE_QUANT="$2"; shift 2;;
             --tp ) TP="$2"; shift 2;;
             --pp ) PP="$2"; shift 2;;
@@ -99,12 +101,19 @@ parse_options() {
     fi
 
     # Verify required options are provided
-    if [ -z "$MODEL_PATH" ] || [ -z "$QFORMAT" ] || [ -z "$TASKS" ]; then
-        echo "Usage: $0 --model=<model_path> --quant=<quant_format> --tasks=<tasks>"
+    if [ -z "$MODEL_PATH" ] || [ -z "$TASKS" ] || ([ -z "$QFORMAT" ] && [ -z "$RECIPE" ]); then
+        echo "Usage: $0 --model=<model_path> (--quant=<quant_format> | --recipe=<recipe_path>) --tasks=<tasks>"
         echo "Optional args: --sparsity=<sparsity_format> --awq_block_size=<block_size> --calib=<calib_size>"
         exit 1
     fi
 
+    # --quant and --recipe are mutually exclusive: --recipe is a full PTQ spec, while
+    # --quant selects a built-in qformat preset. Pick exactly one.
+    if [ -n "$QFORMAT" ] && [ -n "$RECIPE" ]; then
+        echo "Cannot specify both --quant and --recipe; pick one." >&2
+        exit 1
+    fi
+
     VALID_TASKS=("quant" "mmlu" "lm_eval" "livecodebench" "simple_eval")
 
     for task in $(echo "$TASKS" | tr ',' ' '); do
@@ -135,6 +144,7 @@ parse_options() {
     echo "================="
     echo "model: $MODEL_PATH"
     echo "quant: $QFORMAT"
+    echo "recipe: $RECIPE"
     echo "tp (TensorRT-LLM Checkpoint only): $TP"
     echo "pp (TensorRT-LLM Checkpoint only): $PP"
     echo "sparsity: $SPARSITY_FMT"
diff --git a/modelopt_recipes/general/ptq/nvfp4_experts_only_mse-fp8_cast_kv.yaml b/modelopt_recipes/general/ptq/nvfp4_experts_only_mse-fp8_cast_kv.yaml
new file mode 100644
index 00000000000..5db1666402d
--- /dev/null
+++ b/modelopt_recipes/general/ptq/nvfp4_experts_only_mse-fp8_cast_kv.yaml
@@ -0,0 +1,48 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+imports:
+  base_disable_all: configs/ptq/units/base_disable_all
+  default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers
+  nvfp4: configs/numerics/nvfp4
+  nvfp4_static: configs/numerics/nvfp4_static
+  kv_fp8_cast: configs/ptq/units/kv_fp8_cast
+
+metadata:
+  recipe_type: ptq
+  description: NVFP4 static weight (MSE FP8-scale sweep) and dynamic activation for expert layers only (W4A4), FP8 KV cache with constant amax.
+quantize:
+  algorithm:
+    method: mse
+    fp8_scale_sweep: true
+    # layerwise=false required for VLMs where the decoder layers are nested under
+    # `model.language_model.layers` (layerwise_calibrate can't find them otherwise).
+    layerwise: false
+  quant_cfg:
+    - $import: base_disable_all
+    - quantizer_name: '*mlp.experts*weight_quantizer'
+      cfg:
+        $import: nvfp4_static
+    - quantizer_name: '*mlp.experts*input_quantizer'
+      cfg:
+        $import: nvfp4
+    - quantizer_name: '*block_sparse_moe*weight_quantizer'
+      cfg:
+        $import: nvfp4_static
+    - quantizer_name: '*block_sparse_moe*input_quantizer'
+      cfg:
+        $import: nvfp4
+    - $import: kv_fp8_cast
+    - $import: default_disabled_quantizers

From 2fdfe866e65ca69b044270bc7faf978870e76a5d Mon Sep 17 00:00:00 2001
From: Chenjie Luo
Date: Thu, 7 May 2026 11:54:59 -0700
Subject: [PATCH 2/3] [Recipes] Add nvfp4_mlp_only_mse-fp8_cast_kv

Same shape as nvfp4_experts_only_mse-fp8_cast_kv but with the broader
*mlp* / *block_sparse_moe* / *.experts.* patterns from
nvfp4_mlp_only-kv_fp8.yaml so it covers both dense MLP and MoE expert
weights:

- algorithm: { method: mse, fp8_scale_sweep: true, layerwise: false }
- All MLP weight quantizers use nvfp4_static so the static FP8 scale
  sweep applies (otherwise mse_calibrate skips them).
- Input quantizers use nvfp4 (dynamic).
- KV bmm uses kv_fp8_cast (skips KV calibration; amax hardcoded to FP8
  E4M3 max 448.0).
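
For reference, a representative invocation through the example scripts.
This is a sketch: /path/to/hf_model is a placeholder, $MODELOPT_ROOT
stands in for the repo checkout, and the script is assumed to be run
from examples/llm_ptq as with the existing --quant flow:

    scripts/huggingface_example.sh \
        --model=/path/to/hf_model \
        --recipe=$MODELOPT_ROOT/modelopt_recipes/general/ptq/nvfp4_mlp_only_mse-fp8_cast_kv.yaml \
        --tasks=quant
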
Pre-commit hook check-modelopt-recipes was skipped because the host
conda env has a broken torchvision install that prevents the validator
from importing modelopt; the same hook fails on the existing committed
sibling nvfp4_experts_only-kv_fp8.yaml in this env.

Signed-off-by: Chenjie Luo
---
 .../ptq/nvfp4_mlp_only_mse-fp8_cast_kv.yaml        | 54 +++++++++++++++++++
 1 file changed, 54 insertions(+)
 create mode 100644 modelopt_recipes/general/ptq/nvfp4_mlp_only_mse-fp8_cast_kv.yaml

diff --git a/modelopt_recipes/general/ptq/nvfp4_mlp_only_mse-fp8_cast_kv.yaml b/modelopt_recipes/general/ptq/nvfp4_mlp_only_mse-fp8_cast_kv.yaml
new file mode 100644
index 00000000000..a3b690faa82
--- /dev/null
+++ b/modelopt_recipes/general/ptq/nvfp4_mlp_only_mse-fp8_cast_kv.yaml
@@ -0,0 +1,54 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+imports:
+  base_disable_all: configs/ptq/units/base_disable_all
+  default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers
+  nvfp4: configs/numerics/nvfp4
+  nvfp4_static: configs/numerics/nvfp4_static
+  kv_fp8_cast: configs/ptq/units/kv_fp8_cast
+
+metadata:
+  recipe_type: ptq
+  description: NVFP4 static weight (MSE FP8-scale sweep) and dynamic activation for all linear layers (W4A4), FP8 KV cache with constant amax.
+quantize:
+  algorithm:
+    method: mse
+    fp8_scale_sweep: true
+    # layerwise=false required for VLMs where the decoder layers are nested under
+    # `model.language_model.layers` (layerwise_calibrate can't find them otherwise).
+    layerwise: false
+  quant_cfg:
+    - $import: base_disable_all
+    - quantizer_name: '*mlp*weight_quantizer'
+      cfg:
+        $import: nvfp4_static
+    - quantizer_name: '*mlp*input_quantizer'
+      cfg:
+        $import: nvfp4
+    - quantizer_name: '*block_sparse_moe*weight_quantizer'
+      cfg:
+        $import: nvfp4_static
+    - quantizer_name: '*block_sparse_moe*input_quantizer'
+      cfg:
+        $import: nvfp4
+    - quantizer_name: '*.experts.*weight_quantizer'
+      cfg:
+        $import: nvfp4_static
+    - quantizer_name: '*.experts.*input_quantizer'
+      cfg:
+        $import: nvfp4
+    - $import: kv_fp8_cast
+    - $import: default_disabled_quantizers

From 9f08df06ade11a914a52a92289a6cda1a398d34d Mon Sep 17 00:00:00 2001
From: Chenjie Luo
Date: Thu, 7 May 2026 13:28:53 -0700
Subject: [PATCH 3/3] [Recipes] Address PR review feedback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- huggingface_example.sh: remove the bf16/fp16 shortcut block. The
  lowercase $qformat variable it referenced was bound by the
  `for qformat in $QFORMAT; do … done` whitelist loop deleted in the
  parent commit, so the shortcut became dead code (regression flagged
  by reviewers). Per consensus on the PR, drop the path entirely rather
  than reintroduce the binding — the bf16/fp16 case still goes through
  `python hf_ptq.py --qformat=...` and the TRT-LLM whitelist on line
  210 already covers it.
- huggingface_example.sh: quote $MODEL_PATH in the basename calls used
  to build MODEL_NAME (CodeRabbit shellcheck SC2086).
- nvfp4_mlp_only_mse-kv_fp8_cast.yaml: tighten metadata description
  from "all linear layers" to "MLP/MoE linear layers" — quantization is
  only enabled for *mlp* / *block_sparse_moe* / *.experts.* patterns.
- Rename both recipes to match the sibling `_-kv_` convention used by
  `nvfp4_default-kv_fp8_cast.yaml` etc.:
    nvfp4_experts_only_mse-fp8_cast_kv.yaml → nvfp4_experts_only_mse-kv_fp8_cast.yaml
    nvfp4_mlp_only_mse-fp8_cast_kv.yaml → nvfp4_mlp_only_mse-kv_fp8_cast.yaml

Pre-commit hook check-modelopt-recipes was skipped (same env-broken
torchvision issue as the prior commits); the renamed recipes were
verified independently with tools/precommit/check_modelopt_recipes.py
against the working-tree modelopt — both load and produce the expected
effective config.

Signed-off-by: Chenjie Luo
---
 examples/llm_ptq/scripts/huggingface_example.sh    | 15 ++-------------
 ...ml => nvfp4_experts_only_mse-kv_fp8_cast.yaml}  |  0
 ...v.yaml => nvfp4_mlp_only_mse-kv_fp8_cast.yaml}  |  2 +-
 3 files changed, 3 insertions(+), 14 deletions(-)
 rename modelopt_recipes/general/ptq/{nvfp4_experts_only_mse-fp8_cast_kv.yaml => nvfp4_experts_only_mse-kv_fp8_cast.yaml} (100%)
 rename modelopt_recipes/general/ptq/{nvfp4_mlp_only_mse-fp8_cast_kv.yaml => nvfp4_mlp_only_mse-kv_fp8_cast.yaml} (95%)

diff --git a/examples/llm_ptq/scripts/huggingface_example.sh b/examples/llm_ptq/scripts/huggingface_example.sh
index 693506929d9..5c7889cc341 100755
--- a/examples/llm_ptq/scripts/huggingface_example.sh
+++ b/examples/llm_ptq/scripts/huggingface_example.sh
@@ -65,9 +65,9 @@ QFORMAT_MODIFIED="${QFORMAT//,/_}"
 # directory or .yaml suffix) so each recipe gets its own SAVE_PATH.
 if [ -n "$RECIPE" ]; then
     RECIPE_TAG=$(basename "$RECIPE" .yaml | sed 's/[^0-9a-zA-Z\-]/_/g')
-    MODEL_NAME=$(basename $MODEL_PATH | sed 's/[^0-9a-zA-Z\-]/_/g')_recipe_${RECIPE_TAG}
+    MODEL_NAME=$(basename "$MODEL_PATH" | sed 's/[^0-9a-zA-Z\-]/_/g')_recipe_${RECIPE_TAG}
 else
-    MODEL_NAME=$(basename $MODEL_PATH | sed 's/[^0-9a-zA-Z\-]/_/g')_${QFORMAT_MODIFIED}${KV_CACHE_QUANT:+_kv_${KV_CACHE_QUANT}}
+    MODEL_NAME=$(basename "$MODEL_PATH" | sed 's/[^0-9a-zA-Z\-]/_/g')_${QFORMAT_MODIFIED}${KV_CACHE_QUANT:+_kv_${KV_CACHE_QUANT}}
 fi
 
 SAVE_PATH=${ROOT_SAVE_PATH}/saved_models_${MODEL_NAME}
@@ -160,17 +160,6 @@ fi
 
 if [[ $TASKS =~ "quant" ]] || [[ ! -d "$SAVE_PATH" ]] || [[ ! $(ls -A $SAVE_PATH) ]]; then
 
-    if [ "$qformat" == "bf16" ] || [ "$qformat" == "fp16" ]; then
-        if [ -d "$MODEL_PATH" ]; then
-            MODEL_CONFIG_EXIST=true
-            MODEL_CONFIG=$MODEL_PATH/config.json
-            for file in $MODEL_PATH/*; do ln -sf "$file" $SAVE_PATH/; done
-        else
-            echo "Please use the model directory where the config.json file is present."
-            exit 1
-        fi
-    fi
-
     if [[ "$MODEL_CONFIG_EXIST" == false ]]; then
         echo "Quantizing original model..."
         if [ -n "$RECIPE" ]; then
diff --git a/modelopt_recipes/general/ptq/nvfp4_experts_only_mse-fp8_cast_kv.yaml b/modelopt_recipes/general/ptq/nvfp4_experts_only_mse-kv_fp8_cast.yaml
similarity index 100%
rename from modelopt_recipes/general/ptq/nvfp4_experts_only_mse-fp8_cast_kv.yaml
rename to modelopt_recipes/general/ptq/nvfp4_experts_only_mse-kv_fp8_cast.yaml
diff --git a/modelopt_recipes/general/ptq/nvfp4_mlp_only_mse-fp8_cast_kv.yaml b/modelopt_recipes/general/ptq/nvfp4_mlp_only_mse-kv_fp8_cast.yaml
similarity index 95%
rename from modelopt_recipes/general/ptq/nvfp4_mlp_only_mse-fp8_cast_kv.yaml
rename to modelopt_recipes/general/ptq/nvfp4_mlp_only_mse-kv_fp8_cast.yaml
index a3b690faa82..875fb47c9b3 100644
--- a/modelopt_recipes/general/ptq/nvfp4_mlp_only_mse-fp8_cast_kv.yaml
+++ b/modelopt_recipes/general/ptq/nvfp4_mlp_only_mse-kv_fp8_cast.yaml
@@ -22,7 +22,7 @@ imports:
 
 metadata:
   recipe_type: ptq
-  description: NVFP4 static weight (MSE FP8-scale sweep) and dynamic activation for all linear layers (W4A4), FP8 KV cache with constant amax.
+  description: NVFP4 static weight (MSE FP8-scale sweep) and dynamic activation for MLP/MoE linear layers (W4A4), FP8 KV cache with constant amax.
 quantize:
   algorithm:
     method: mse
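
Usage note: the standalone recipe check mentioned above can be
reproduced with something like the following. This is a sketch, not a
committed workflow; it assumes check_modelopt_recipes.py accepts recipe
paths as positional arguments, the way pre-commit file hooks are
normally invoked:

    python tools/precommit/check_modelopt_recipes.py \
        modelopt_recipes/general/ptq/nvfp4_experts_only_mse-kv_fp8_cast.yaml \
        modelopt_recipes/general/ptq/nvfp4_mlp_only_mse-kv_fp8_cast.yaml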