From 04e6e41ff3d6e0bb8845b9596b4dc5c71a11296e Mon Sep 17 00:00:00 2001 From: mchochowski Date: Fri, 13 Feb 2026 02:54:21 -0800 Subject: [PATCH 1/8] gpt-oss 20b support Signed-off-by: mchochowski --- .../gptoss-20b.yaml | 110 ++++ .../gptoss-20b_remove_experts_memory.yaml | 22 + .../pruning/ffn_pruning.yaml | 21 + .../pruning/pruning_defaults.yaml | 34 ++ .../validate_model_defaults.yaml | 18 + .../validate_solutions_defaults.yaml | 11 + .../anymodel/converter/converter.py | 17 +- .../gpt_oss_20b/gpt_oss_20b_converter.py | 1 + .../gpt_oss_20b_model_descriptor.py | 25 +- .../gpt_oss_20b/gpt_oss_pruned_to_mxfp4.py | 506 ++++++++++++++++++ 10 files changed, 759 insertions(+), 6 deletions(-) create mode 100644 examples/puzzletron/configs/gptoss-20b_remove_experts_memory/gptoss-20b.yaml create mode 100644 examples/puzzletron/configs/gptoss-20b_remove_experts_memory/gptoss-20b_remove_experts_memory.yaml create mode 100644 examples/puzzletron/configs/gptoss-20b_remove_experts_memory/pruning/ffn_pruning.yaml create mode 100644 examples/puzzletron/configs/gptoss-20b_remove_experts_memory/pruning/pruning_defaults.yaml create mode 100644 examples/puzzletron/configs/gptoss-20b_remove_experts_memory/validate_model_defaults.yaml create mode 100644 examples/puzzletron/configs/gptoss-20b_remove_experts_memory/validate_solutions_defaults.yaml create mode 100644 modelopt/torch/puzzletron/anymodel/models/gpt_oss_20b/gpt_oss_pruned_to_mxfp4.py diff --git a/examples/puzzletron/configs/gptoss-20b_remove_experts_memory/gptoss-20b.yaml b/examples/puzzletron/configs/gptoss-20b_remove_experts_memory/gptoss-20b.yaml new file mode 100644 index 000000000..7de281e78 --- /dev/null +++ b/examples/puzzletron/configs/gptoss-20b_remove_experts_memory/gptoss-20b.yaml @@ -0,0 +1,110 @@ +defaults: + - pruning: ffn_pruning + - scoring: ../validate_solutions_defaults + - realize_model: ../validate_solutions_defaults + - bypass: + - override hydra/hydra_logging: disabled + - _self_ + +puzzle_dir: ??? +descriptor: llama +teacher_dir: ${puzzle_dir}/ckpts/teacher/ +replacement_library_path: ${puzzle_dir}/replacement_library.json +dataset_path: ??? 
# path to Nemotron-Post-Training-Dataset-v2 + +skip_realize_model: false + +build_replacement_library: + add_ffn_no_ops: true + add_attention_no_ops: true + +calc_subblock_stats: + batch_sizes: [64, 96, 128] + prefill_seq_len: 4096 + generation_seq_len: 4096 + num_active_tokens_override: # Optional override for sequence lengths + prefill_queue_size: 0 + allocate_prefill_query: false + benchmark_iterations: # Set to a number (e.g., 1000) to enable runtime benchmarking + merge_with_existing_stats: false + subblock_stats_filename: "subblock_stats.json" + moe_stats_filename: "moe_stats.json" + runtime_stats: + backend: trt_torch + +scoring: + descriptor: ${descriptor} + solutions_to_validate: + skip_existing_solutions: true + + replacement_library_path: ${replacement_library_path} + solutions_path: ${to_path:${puzzle_dir}/single_sequence_replacement_solutions.json} + teacher_dir: ${to_path:${teacher_dir}} + output_dir: ${puzzle_dir}/single_sequence_replacement_solutions--validation + + eval_samples: 128 + micro_batch_size: 1 + seed: 42 + shuffle_seed: 444 + dataset_path: ${dataset_path} + +mip: + single_block_replacement_validation_dir: ${to_path:${scoring.output_dir}} + subblock_stats_path: ${to_path:${puzzle_dir}/${calc_subblock_stats.subblock_stats_filename}} + output_path: ${to_path:${puzzle_dir}/mip/puzzle_solutions} + gathered_metrics_path: + puzzle_profile: + + # puzzle_profile: + objective: metrics.cosine_embedding_loss_hidden_states + bigger_is_better: false + + subblock_stats_args: + - batch_size: 96 + weights_dtype: torch.bfloat16 + activations_dtype: torch.bfloat16 + kv_cache_dtype: torch.bfloat16 + + report_additional_costs: + - stats.memory_mib + - stats.num_params + - stats.num_kv_heads + - stats.has_attention + - stats.has_ffn + - stats.kv_cache_memory_mib + - stats.attention_memory_mib + - stats.ffn_memory_mib + - stats.ffn_num_params + - stats.attention_num_params + + human_constraints: + target_memory: 45_000 + num_params: 3_000_000_000 + + mip_constraints: + metric_overrides: + max_seconds_per_solution: 60 + +realize_model: + descriptor: ${descriptor} + teacher_dir: ${to_path:${teacher_dir}} + tokenizer_name: ${to_path:${teacher_dir}} + replacement_library_path: ${replacement_library_path} + save_models: true + solutions_path: # Filled dynamically + + # Validate params + skip_validation: false # To enable validation of the model solution set `skip_validation` as False + eval_samples: 128 + micro_batch_size: 1 + seed: 42 + shuffle_seed: 444 + dataset_path: ${dataset_path} + +nccl_timeout_minutes: ${timedelta_minutes:10} + +# This section redirects Hydra outputs +hydra: + run: + dir: ${puzzle_dir}/hydra_logs/${now:%Y-%m-%d}/${now:%H-%M-%S} + diff --git a/examples/puzzletron/configs/gptoss-20b_remove_experts_memory/gptoss-20b_remove_experts_memory.yaml b/examples/puzzletron/configs/gptoss-20b_remove_experts_memory/gptoss-20b_remove_experts_memory.yaml new file mode 100644 index 000000000..979803939 --- /dev/null +++ b/examples/puzzletron/configs/gptoss-20b_remove_experts_memory/gptoss-20b_remove_experts_memory.yaml @@ -0,0 +1,22 @@ +defaults: + - gptoss-20b + - _self_ + +# Input Hugging Face model to compress +input_hf_model_path: /workspace/hf_models/openai/gpt-oss-20b + +# Dataset path for pruning and NAS scoring +dataset_path: /workspace/datasets/Nemotron-Post-Training-Dataset-v2 + +# Working directory for compression outputs +puzzle_dir: /workspace/puzzle_dir + +# MIP memory constraint (in MiB) +mip: + human_constraints: + target_memory: 45_000 # 45 GiB + +# FFN 
intermediate sizes to search over (heterogeneous architecture) +# teacher_intermediate_size is 8192, so we use proportionally smaller values +pruning: + intermediate_size_list: [2048, 4096, 6144] diff --git a/examples/puzzletron/configs/gptoss-20b_remove_experts_memory/pruning/ffn_pruning.yaml b/examples/puzzletron/configs/gptoss-20b_remove_experts_memory/pruning/ffn_pruning.yaml new file mode 100644 index 000000000..e9e15db32 --- /dev/null +++ b/examples/puzzletron/configs/gptoss-20b_remove_experts_memory/pruning/ffn_pruning.yaml @@ -0,0 +1,21 @@ +defaults: + - pruning_defaults + +eval_samples: 2500 #10 +activations_log_dir: ${puzzle_dir}/pruning/pruning_scores/expert_removal/${pruning.experiment_id} + +pruning_mixin: + _target_: modelopt.torch.puzzletron.pruning.expert_removal_pruning_mixin.ExpertRemovalPruningMixIn + layer_descriptor: + _target_: modelopt.torch.puzzletron.anymodel.models.gpt_oss_20b.gpt_oss_20b_model_descriptor.GptOss20bExpertRemovalLayerDescriptor + target_name: "mlp.router" + +hook_class: ${get_object:utils.activation_hooks.hooks.RankedChoiceVotingHook} +activation_hooks_kwargs: # Additional kwargs to pass to the hook init + +num_experts_to_keep_list: [24, 16, 8] # num_experts in teacher is 32 +mlp_init_mode: "ExpertRemoval" +mlp_init_config_yaml: + expert_scores_key: "expert_ranks" + layer_prefix_template: "model.layers.{layer_idx}.mlp.router" + diff --git a/examples/puzzletron/configs/gptoss-20b_remove_experts_memory/pruning/pruning_defaults.yaml b/examples/puzzletron/configs/gptoss-20b_remove_experts_memory/pruning/pruning_defaults.yaml new file mode 100644 index 000000000..cec781465 --- /dev/null +++ b/examples/puzzletron/configs/gptoss-20b_remove_experts_memory/pruning/pruning_defaults.yaml @@ -0,0 +1,34 @@ +defaults: + - /validate_model_defaults + +model_name_or_path: ${teacher_dir} +experiment_id: ${pruning.eval_samples}samples_diverse_mini +activations_log_dir: ??? +activation_hooks_kwargs: ???
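+# NOTE: the `???` values are mandatory placeholders; the pruning config that includes these defaults (e.g. ffn_pruning.yaml) is expected to override them.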
+ +descriptor: ${descriptor} + +# Data: +eval_samples: 10_000 +micro_batch_size: 1 +dataset_path: ${dataset_path} +val_dataset_name: train + +# Prune ckpts +pruned_ckpts_outpt_dir: ${puzzle_dir}/pruning/${pruning.experiment_id} + +## FFN pruning +ffn_list: +mlp_init_mode: "Truncate" # PruneByActivationsLog + +## KV-heads pruning +n_heads_in_group_list: +gqa_init_mode: "AverageKV" + +## Hidden dimension pruning +hidden_size_list: +hidden_size_init_mode: "PruneByChannelRanking" +linear_init_mode: "FromTeacher" + +mlp_init_config_yaml: + activations_log_dir: ${pruning.activations_log_dir} \ No newline at end of file diff --git a/examples/puzzletron/configs/gptoss-20b_remove_experts_memory/validate_model_defaults.yaml b/examples/puzzletron/configs/gptoss-20b_remove_experts_memory/validate_model_defaults.yaml new file mode 100644 index 000000000..b80faea5f --- /dev/null +++ b/examples/puzzletron/configs/gptoss-20b_remove_experts_memory/validate_model_defaults.yaml @@ -0,0 +1,18 @@ +model_dtype: torch.bfloat16 # dtype to cast the model for validate_model +autocast_dtype: torch.bfloat16 # dtype for torch.autocast for validate_model +block_size: 8192 +bos_rate: 0.5 +data_column: messages +val_dataset_name: valid +shuffle_seed: 81436 +seed: 42 +fim_rate: 0 +fim_spm_rate: 0 +source_datasets_to_discard: +varlen: false +write_results: false +calc_losses_on_cpu: false +activations_log_dir: +model_name_or_path: +load_dataset_fn: ${get_object:modelopt.torch.puzzletron.utils.data.dataloaders.load_from_disk_fn} + diff --git a/examples/puzzletron/configs/gptoss-20b_remove_experts_memory/validate_solutions_defaults.yaml b/examples/puzzletron/configs/gptoss-20b_remove_experts_memory/validate_solutions_defaults.yaml new file mode 100644 index 000000000..ab8c89218 --- /dev/null +++ b/examples/puzzletron/configs/gptoss-20b_remove_experts_memory/validate_solutions_defaults.yaml @@ -0,0 +1,11 @@ +defaults: + - /validate_model_defaults + - _self_ + +solutions_to_validate: +skip_validation: false +save_models: false +bigger_is_better: false +sort_solutions_by: +calculate_full_score_ablations: false + diff --git a/modelopt/torch/puzzletron/anymodel/converter/converter.py b/modelopt/torch/puzzletron/anymodel/converter/converter.py index 67ed74ed9..e241e72b6 100644 --- a/modelopt/torch/puzzletron/anymodel/converter/converter.py +++ b/modelopt/torch/puzzletron/anymodel/converter/converter.py @@ -27,6 +27,7 @@ from safetensors.torch import load_file, save_file from tqdm import tqdm from transformers import PretrainedConfig +from transformers.integrations.mxfp4 import convert_moe_packed_tensors from modelopt.torch.puzzletron.anymodel.model_descriptor import ModelDescriptor from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.block_config import BlockConfig @@ -61,8 +62,9 @@ def _get_weight_map(input_dir: Path) -> Dict[str, str]: f"Neither {index_path} nor {single_file_path} found. Cannot determine model format." 
) - @staticmethod + @classmethod def convert_model_weights( + cls, input_dir: Path, output_dir: Path, descriptor: ModelDescriptor, num_hidden_layers: int ): """Convert model weights to subblock format.""" @@ -95,7 +97,18 @@ def convert_model_weights( data = load_file(os.path.join(input_dir, file)) for name in param_names: if param_to_file[name] == file and name in data: - tensors[name] = data[name] + converted_name = cls.convert_weight_name(name) + # Convert MoE packed tensors if quantized is mxfp4 //gpt-oss-20b + if getattr(cls, 'quantized', None) == 'mxfp4': + if name.endswith("_blocks"): + converted_name = converted_name.replace("_blocks", "") + tensors[converted_name] = convert_moe_packed_tensors(data[converted_name+"_blocks"], data[converted_name+"_scales"]) + elif name.endswith("_scales"): + continue + else: + tensors[converted_name] = data[name] + else: + tensors[converted_name] = data[name] # Save this subblock print(f"\n✅ Group: {subblock} ({len(tensors)} layers)") diff --git a/modelopt/torch/puzzletron/anymodel/models/gpt_oss_20b/gpt_oss_20b_converter.py b/modelopt/torch/puzzletron/anymodel/models/gpt_oss_20b/gpt_oss_20b_converter.py index b7e83dcec..d35c004c1 100644 --- a/modelopt/torch/puzzletron/anymodel/models/gpt_oss_20b/gpt_oss_20b_converter.py +++ b/modelopt/torch/puzzletron/anymodel/models/gpt_oss_20b/gpt_oss_20b_converter.py @@ -36,6 +36,7 @@ class GptOss20bConverter(Converter): GPT-OSS-20B is a pure MoE model with 32 experts per layer and 4 active experts. All layers use MoE FFN (no standard dense FFN layers). """ + quantized = 'mxfp4' @staticmethod def create_block_configs_from_main_config(config: PretrainedConfig) -> List[BlockConfig]: diff --git a/modelopt/torch/puzzletron/anymodel/models/gpt_oss_20b/gpt_oss_20b_model_descriptor.py b/modelopt/torch/puzzletron/anymodel/models/gpt_oss_20b/gpt_oss_20b_model_descriptor.py index fd5edc063..644da802c 100644 --- a/modelopt/torch/puzzletron/anymodel/models/gpt_oss_20b/gpt_oss_20b_model_descriptor.py +++ b/modelopt/torch/puzzletron/anymodel/models/gpt_oss_20b/gpt_oss_20b_model_descriptor.py @@ -50,6 +50,13 @@ class GptOss20bModelDescriptor(ModelDescriptor): _DECODER_LAYER_CLS: Type[nn.Module] = None + @classmethod + def create_dummy_block(cls, original_layer: GptOssDecoderLayer, block_index: int) -> nn.Module: + dummy_block = DummyBlock(block_index=block_index) + # Required by `GptOssModel.forward`. + dummy_block.attention_type = original_layer.attention_type + return dummy_block + @staticmethod def decoder_layer_cls(): """Get the decoder layer class for GPT-OSS models. 
@@ -132,7 +139,7 @@ def build_ffn_predicates() -> Dict[str, re.Pattern]: r"(post_attention_layernorm\.weight" r"|mlp\.router\.weight" r"|mlp\.router\.bias" - r"|mlp\.experts\.((\d+\.)?(gate_up_proj|down_proj)(\.(weight|bias|blocks|scales))?|gate_up_proj_(bias|blocks|scales)|down_proj_(bias|blocks|scales)))$" + r"|mlp\.experts\.(gate_up_proj|down_proj)(_(bias|blocks|scales))?)$" ) for layer_idx in range(num_layers) } @@ -190,12 +197,15 @@ class GptOss20bExpertRemovalLayerDescriptor(ExpertRemovalLayerDescriptor): target_name: str = "mlp" moe_prefix_name: str = "model.layers.{layer_idx}.mlp" - expert_prefix_name: str = "experts.{expert_idx}" + expert_prefix_name: str = "experts" # Router has both weight and bias router_weights: List[str] = field(default_factory=lambda: ["router.weight"]) router_biases: List[str] = field(default_factory=lambda: ["router.bias"]) + # Fused format: experts stored as single tensors + is_fused_experts: bool = True + # Fused format: single tensors containing all experts (test models) fused_expert_weights: List[str] = field( default_factory=lambda: [ @@ -212,5 +222,12 @@ class GptOss20bExpertRemovalLayerDescriptor(ExpertRemovalLayerDescriptor): default_factory=lambda: ["gate_up_proj_bias", "down_proj_bias"] ) - # Fused format: experts stored as single tensors - is_fused_experts: bool = True + def get_modules_names_to_hook(self, model) -> List[Tuple[int, str]]: + target_class_name = "GptOssTopKRouter" + + module_names_to_hook = [] + for module_name, module in model.named_modules(): + if module_name.endswith(self.target_name) and module.__class__.__name__ == target_class_name: + module_names_to_hook.append((self.block_idx_from_module_name(module_name), module_name)) + return module_names_to_hook + diff --git a/modelopt/torch/puzzletron/anymodel/models/gpt_oss_20b/gpt_oss_pruned_to_mxfp4.py b/modelopt/torch/puzzletron/anymodel/models/gpt_oss_20b/gpt_oss_pruned_to_mxfp4.py new file mode 100644 index 000000000..8e993573d --- /dev/null +++ b/modelopt/torch/puzzletron/anymodel/models/gpt_oss_20b/gpt_oss_pruned_to_mxfp4.py @@ -0,0 +1,506 @@ +#!/usr/bin/env python3 +""" +Create a HuggingFace checkpoint with MXFP4 MoE weights from the original gpt-oss-120b model. + +This script: +1. Copies non-MoE weights from the student model (trained attention, embeddings, etc.) +2. Extracts MoE expert weights from the original gpt-oss-120b in MXFP4 format +3. Either loads experts_to_keep.json or deduces expert mappings by comparing weights +4. Outputs a new checkpoint in decihf format with PACKED MXFP4 expert weights +""" + +import argparse +import json +import os +import shutil +from typing import Dict, List, Any, Tuple, Optional + +import torch +from safetensors import safe_open +from safetensors.torch import save_file +from tqdm import tqdm + +from transformers.integrations.mxfp4 import convert_moe_packed_tensors + + +def deduce_experts_for_layer( + layer: int, + original_path: str, + original_index: Dict, + student_path: str, +) -> Tuple[List[int], int, int]: + """ + Deduce which original experts match the student experts by comparing weights. + + Compares dequantized MXFP4 weights from the original model against the student + model's BF16 weights using L2 distance. Finds the best 1-to-1 matching. 
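+    Matching is greedy: each student expert is paired with the closest not-yet-used original expert by RMSE over the dequantized weights, after a quick prefilter on the first few values narrows the candidates.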
+ + Args: + layer: Layer index + original_path: Path to original model + original_index: Original model's safetensors index + student_path: Path to student model + num_student_experts: Number of experts in student model (if None, auto-detect) + + Returns: + Tuple of (expert_indices, num_student_experts, num_original_experts) + """ + # Load original tensors + orig_tensors = load_layer_tensors(original_path, layer, original_index) + mlp1_blocks = orig_tensors[f'model.layers.{layer}.mlp.experts.gate_up_proj_blocks'] + mlp1_scales = orig_tensors[f'model.layers.{layer}.mlp.experts.gate_up_proj_scales'] + mlp2_blocks = orig_tensors[f'model.layers.{layer}.mlp.experts.down_proj_blocks'] + mlp2_scales = orig_tensors[f'model.layers.{layer}.mlp.experts.down_proj_scales'] + + num_original_experts = mlp1_blocks.shape[0] + + # Load student tensors + student_subblocks = os.path.join(student_path, 'subblocks_safetensors') + student_ffn = os.path.join(student_subblocks, f'block_{layer}_ffn.safetensors') + if not os.path.exists(student_ffn): + print(f"FFN file not found at {student_ffn} - fallback to no_op") + return [], 0, num_original_experts + + student_experts = {} + with safe_open(student_ffn, framework='pt') as f: + for key in f.keys(): + if 'experts' in key or 'router' in key: + student_experts[key] = f.get_tensor(key) + + # Auto-detect number of student experts + num_student_experts = student_experts[f'model.layers.{layer}.mlp.experts.gate_up_proj'].size(0) + print(f" Layer {layer}: Comparing {num_student_experts} student experts against {num_original_experts} original experts") + + # Pre-dequantize all original experts once (optimization) + print(f" Pre-dequantizing {num_original_experts} original experts...") + deqexpert_mlp1 = convert_moe_packed_tensors(mlp1_blocks, mlp1_scales).cpu() + deqexpert_mlp2 = convert_moe_packed_tensors(mlp2_blocks, mlp2_scales).cpu() + original_experts_dequant = [] + for orig_idx in range(num_original_experts): + + original_experts_dequant.append({ + 'up': deqexpert_mlp1[orig_idx], + 'down': deqexpert_mlp2[orig_idx] + }) + + # For each student expert, find best matching original expert + experts_to_keep = [] + used_original_indices = set() + + # Number of values to use for quick comparison (tune this) + quick_compare_size = 8 + # Number of candidates to keep for full comparison + top_k_candidates = min(10, num_original_experts) + + for student_idx in range(num_student_experts): + # Get student expert weights + prefix = f'model.layers.{layer}.mlp' + student_up = student_experts.get(f'{prefix}.experts.gate_up_proj')[student_idx] + student_down = student_experts.get(f'{prefix}.experts.down_proj')[student_idx] + + # if student_gate is None or student_up is None or student_down is None: + if student_up is None or student_down is None: + raise ValueError(f"Missing student expert weights for layer {layer} expert {student_idx}") + + # Step 1: Quick filtering using first N values + candidate_scores = [] + for orig_idx in range(num_original_experts): + if orig_idx in used_original_indices: + continue + + orig_expert = original_experts_dequant[orig_idx] + + up_quick = (orig_expert['up'].flatten()[:quick_compare_size] - + student_up.float().flatten()[:quick_compare_size]).pow(2).mean().sqrt() + down_quick = (orig_expert['down'].flatten()[:quick_compare_size] - + student_down.float().flatten()[:quick_compare_size]).pow(2).mean().sqrt() + + quick_score = (up_quick + down_quick) / 2.0 + candidate_scores.append((orig_idx, quick_score.item())) + + # Step 2: Get top-k candidates based 
on quick comparison + candidate_scores.sort(key=lambda x: x[1]) + top_candidates = [idx for idx, _ in candidate_scores[:top_k_candidates]] + + # Step 3: Full comparison only on top candidates + best_match_idx = None + best_match_score = float('inf') + + for orig_idx in top_candidates: + orig_expert = original_experts_dequant[orig_idx] + + # Full comparison across all values + up_diff = (orig_expert['up'] - student_up.float()).pow(2).mean().sqrt() + down_diff = (orig_expert['down'] - student_down.float()).pow(2).mean().sqrt() + + score = (up_diff + down_diff) / 2.0 + + if score < best_match_score: + best_match_score = score + best_match_idx = orig_idx + + if best_match_idx is None: + raise ValueError(f"Could not find match for student expert {student_idx} in layer {layer}") + + experts_to_keep.append(best_match_idx) + used_original_indices.add(best_match_idx) + print(f" Student expert {student_idx} -> Original expert {best_match_idx} (RMSE: {best_match_score:.6f})") + + return experts_to_keep, num_student_experts, num_original_experts + + +def load_original_index(path: str) -> Dict[str, Any]: + """Load the original model's safetensors index.""" + with open(path, 'r') as f: + return json.load(f) + + +def load_layer_tensors(original_path: str, layer: int, index: Dict) -> Dict[str, torch.Tensor]: + """Load all MoE-related tensors for a layer, potentially from multiple files.""" + keys_to_load = [ + f'model.layers.{layer}.mlp.experts.gate_up_proj_blocks', + f'model.layers.{layer}.mlp.experts.gate_up_proj_scales', + f'model.layers.{layer}.mlp.experts.gate_up_proj_bias', + f'model.layers.{layer}.mlp.experts.down_proj_blocks', + f'model.layers.{layer}.mlp.experts.down_proj_scales', + f'model.layers.{layer}.mlp.experts.down_proj_bias', + f'model.layers.{layer}.mlp.router.weight', # Router weight + f'model.layers.{layer}.mlp.router.bias', # Router bias + ] + + # Group by file + file_to_keys = {} + for key in keys_to_load: + if key in index['weight_map']: + filename = index['weight_map'][key] + if filename not in file_to_keys: + file_to_keys[filename] = [] + file_to_keys[filename].append(key) + + # Load from each file + tensors = {} + for filename, keys in file_to_keys.items(): + filepath = os.path.join(original_path, filename) + with safe_open(filepath, framework='pt') as f: + for key in keys: + tensors[key] = f.get_tensor(key) + + return tensors + + +def copy_non_moe_weights( + student_path: str, + output_path: str, + num_layers: int +) -> Dict[str, str]: + """ + Copy non-MoE weights from student model. + Returns weight_map for the new index. 
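+    Keys are parameter names; values are file paths relative to the output directory (all under 'subblocks_safetensors/').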
+ """ + weight_map = {} + subblocks_dir = os.path.join(output_path, 'subblocks_safetensors') + os.makedirs(subblocks_dir, exist_ok=True) + + student_subblocks = os.path.join(student_path, 'subblocks_safetensors') + + # Copy embeddings + src_emb = os.path.join(student_subblocks, 'embeddings.safetensors') + dst_emb = os.path.join(subblocks_dir, 'embeddings.safetensors') + shutil.copy2(src_emb, dst_emb) + with safe_open(src_emb, framework='pt') as f: + for key in f.keys(): + weight_map[key] = 'subblocks_safetensors/embeddings.safetensors' + + # Copy lm_head + src_head = os.path.join(student_subblocks, 'lm_head.safetensors') + dst_head = os.path.join(subblocks_dir, 'lm_head.safetensors') + shutil.copy2(src_head, dst_head) + with safe_open(src_head, framework='pt') as f: + for key in f.keys(): + weight_map[key] = 'subblocks_safetensors/lm_head.safetensors' + + # Copy attention blocks + for layer in range(num_layers): + src_attn = os.path.join(student_subblocks, f'block_{layer}_attention.safetensors') + dst_attn = os.path.join(subblocks_dir, f'block_{layer}_attention.safetensors') + shutil.copy2(src_attn, dst_attn) + with safe_open(src_attn, framework='pt') as f: + for key in f.keys(): + weight_map[key] = f'subblocks_safetensors/block_{layer}_attention.safetensors' + + return weight_map + + + + +def process_single_layer( + layer: int, + original_path: str, + original_index: Dict, + student_path: str, + output_path: str, + experts_to_keep: List[int], +) -> Tuple[Dict[str, str], List[str]]: + """ + Process a single layer - loads tensors from potentially multiple files. + Returns (weight_map, verification_errors). + """ + weight_map = {} + verification_errors = [] + subblocks_dir = os.path.join(output_path, 'subblocks_safetensors') + student_subblocks = os.path.join(student_path, 'subblocks_safetensors') + + # Load all tensors for this layer (may come from multiple files) + orig_tensors = load_layer_tensors(original_path, layer, original_index) + + # Load student FFN file + student_ffn = os.path.join(student_subblocks, f'block_{layer}_ffn.safetensors') + + tensors_to_save = {} + student_tensors = {} + + with safe_open(student_ffn, framework='pt') as f: + for key in f.keys(): + tensor = f.get_tensor(key) + if 'experts' not in key and 'router' not in key: + # Copy norm weights + tensors_to_save[key] = tensor + + # Get router from original model, sliced to kept experts + orig_router_weight = orig_tensors[f'model.layers.{layer}.mlp.router.weight'] + orig_router_bias = orig_tensors[f'model.layers.{layer}.mlp.router.bias'] + + kept_indices_tensor = torch.tensor(experts_to_keep, dtype=torch.long) + sliced_router_weight = orig_router_weight[kept_indices_tensor] + sliced_router_bias = orig_router_bias[kept_indices_tensor] + + tensors_to_save[f'model.layers.{layer}.mlp.router.weight'] = sliced_router_weight + tensors_to_save[f'model.layers.{layer}.mlp.router.bias'] = sliced_router_bias + + # Get MoE tensors + mlp1_blocks = orig_tensors[f'model.layers.{layer}.mlp.experts.gate_up_proj_blocks'] + mlp1_scales = orig_tensors[f'model.layers.{layer}.mlp.experts.gate_up_proj_scales'] + mlp2_blocks = orig_tensors[f'model.layers.{layer}.mlp.experts.down_proj_blocks'] + mlp2_scales = orig_tensors[f'model.layers.{layer}.mlp.experts.down_proj_scales'] + mlp1_bias = orig_tensors[f'model.layers.{layer}.mlp.experts.gate_up_proj_bias'] + mlp2_bias = orig_tensors[f'model.layers.{layer}.mlp.experts.down_proj_bias'] + + tensors_to_save[f'model.layers.{layer}.mlp.experts.gate_up_proj_blocks'] = mlp1_blocks[kept_indices_tensor] + 
tensors_to_save[f'model.layers.{layer}.mlp.experts.gate_up_proj_scales'] = mlp1_scales[kept_indices_tensor] + tensors_to_save[f'model.layers.{layer}.mlp.experts.gate_up_proj_bias'] = mlp1_bias[kept_indices_tensor] + + tensors_to_save[f'model.layers.{layer}.mlp.experts.down_proj_blocks'] = mlp2_blocks[kept_indices_tensor] + tensors_to_save[f'model.layers.{layer}.mlp.experts.down_proj_scales'] = mlp2_scales[kept_indices_tensor] + tensors_to_save[f'model.layers.{layer}.mlp.experts.down_proj_bias'] = mlp2_bias[kept_indices_tensor] + + # Save the FFN file + output_file = os.path.join(subblocks_dir, f'block_{layer}_ffn.safetensors') + save_file(tensors_to_save, output_file) + + # Build weight map + for key in tensors_to_save.keys(): + weight_map[key] = f'subblocks_safetensors/block_{layer}_ffn.safetensors' + + return weight_map, verification_errors + + +def copy_config_files(student_path: str, output_path: str): + """Copy configuration files from student model and update config.json.""" + files_to_copy = [ + 'tokenizer.json', + 'tokenizer_config.json', + 'special_tokens_map.json', + 'chat_template.jinja', + ] + + # Also copy transformers compatibility files + if os.path.exists(student_path): + for f in os.listdir(student_path): + if f.startswith('transformers_'): + files_to_copy.append(f) + + for filename in files_to_copy: + src = os.path.join(student_path, filename) + dst = os.path.join(output_path, filename) + + # Try student path first + if os.path.exists(src): + try: + shutil.copy2(src, dst) + continue + except PermissionError: + pass + + # If we get here, file doesn't exist or permission denied + if not os.path.exists(dst): + print(f" Warning: Could not copy {filename}") + + # Update config.json for DeciGptOssForCausalLM with MXFP4 + src_config = os.path.join(student_path, 'config.json') + if not os.path.exists(src_config): + raise FileNotFoundError(f"config.json not found at {src_config}") + + with open(src_config, 'r') as f: + config = json.load(f) + + # Set architecture to DeciGptOssForCausalLM for MXFP4 support + config['architectures'] = ['DeciGptOssForCausalLM'] + + # Add quantization_config so vllm calls _load_weights_mxfp4 + config['quantization_config'] = { + "quant_method": "mxfp4", + "modules_to_not_convert": [ + "model.layers.*.self_attn", + "model.layers.*.mlp.router", + "model.embed_tokens", + "lm_head", + ] + } + + dst_config = os.path.join(output_path, 'config.json') + with open(dst_config, 'w') as f: + json.dump(config, f, indent=2) + + +def main(): + parser = argparse.ArgumentParser(description='Create MXFP4 checkpoint from student model') + parser.add_argument( + '--student-path', + type=str, + required=True, + help='Path to student model checkpoint' + ) + parser.add_argument( + '--original-path', + type=str, + required=True, + help='Path to original gpt-oss-120b model with MXFP4 weights' + ) + parser.add_argument( + '--output-path', + type=str, + required=True, + help='Output path for the new checkpoint' + ) + parser.add_argument( + '--num-layers', + type=int, + default=36, + help='Number of transformer layers' + ) + args = parser.parse_args() + + print(f"Creating MXFP4 checkpoint...") + print(f" Student model: {args.student_path}") + print(f" Original model: {args.original_path}") + print(f" Output: {args.output_path}") + + + # Load original model index + original_index = load_original_index( + os.path.join(args.original_path, 'model.safetensors.index.json') + ) + + print("\nDeducing expert mappings by comparing weights...") + experts_to_keep = [] + layer_statistics = 
[] # Store (num_student, num_original) for each layer + + for layer in range(args.num_layers): + layer_experts, num_student, num_original = deduce_experts_for_layer( + layer, + args.original_path, + original_index, + args.student_path, + ) + experts_to_keep.append(layer_experts) + layer_statistics.append((num_student, num_original)) + + # Print statistics + print(f"\n{'='*70}") + print("EXPERT DEDUCTION STATISTICS") + print(f"{'='*70}") + print(f"{'Layer':<8} {'Student Experts':<18} {'Original Experts':<18} {'Kept %':<10}") + print(f"{'-'*70}") + + total_student = 0 + total_original = 0 + for layer, (num_student, num_original) in enumerate(layer_statistics): + percentage = (num_student / num_original * 100) if num_original > 0 else 0 + print(f"{layer:<8} {num_student:<18} {num_original:<18} {percentage:<10.2f}") + total_student += num_student + total_original += num_original + + print(f"{'-'*70}") + avg_percentage = (total_student / total_original * 100) if total_original > 0 else 0 + print(f"{'TOTAL':<8} {total_student:<18} {total_original:<18} {avg_percentage:<10.2f}") + print(f"{'='*70}") + print(f"\n Deduced experts_to_keep mapping for {len(experts_to_keep)} layers") + + # Create output directory + os.makedirs(args.output_path, exist_ok=True) + os.makedirs(os.path.join(args.output_path, 'subblocks_safetensors'), exist_ok=True) + + # Copy config files + print("Copying configuration files...") + copy_config_files(args.student_path, args.output_path) + + # Save experts_to_keep.json + experts_to_keep_output = os.path.join(args.output_path, 'experts_to_keep.json') + with open(experts_to_keep_output, 'w') as f: + json.dump(experts_to_keep, f, indent=2) + print(f" Saved experts_to_keep mapping to {experts_to_keep_output}") + + # Copy non-MoE weights (embeddings, attention, lm_head) + print("Copying non-MoE weights...") + weight_map = copy_non_moe_weights( + args.student_path, + args.output_path, + args.num_layers + ) + + # Load weights per layer (handles multi-file loading) + print(f"Processing {args.num_layers} layers...") + + all_verification_errors = [] + + # Process each layer + for layer in tqdm(range(args.num_layers), desc="Processing layers"): + if len(experts_to_keep[layer]) == 0: + print(f"Layer {layer} has no experts to keep - ffn->no_op") + continue + layer_weight_map, layer_errors = process_single_layer( + layer, + args.original_path, + original_index, + args.student_path, + args.output_path, + experts_to_keep[layer], + ) + weight_map.update(layer_weight_map) + all_verification_errors.extend(layer_errors) + + # Calculate total size + total_size = 0 + subblocks_dir = os.path.join(args.output_path, 'subblocks_safetensors') + for filename in os.listdir(subblocks_dir): + filepath = os.path.join(subblocks_dir, filename) + total_size += os.path.getsize(filepath) + + # Create model.safetensors.index.json + index = { + 'metadata': { + 'total_size': total_size + }, + 'weight_map': weight_map + } + + index_path = os.path.join(args.output_path, 'model.safetensors.index.json') + with open(index_path, 'w') as f: + json.dump(index, f, indent=2) + + print(f"\nCheckpoint created successfully at: {args.output_path}") + print(f"Total size: {total_size / 1e9:.2f} GB") + + +if __name__ == '__main__': + main() From 5ad56c5a05c12d4b3e7184fa864f98ac53b64d91 Mon Sep 17 00:00:00 2001 From: mchochowski Date: Fri, 13 Feb 2026 03:17:51 -0800 Subject: [PATCH 2/8] added paragraph in readme Signed-off-by: mchochowski --- examples/puzzletron/README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git 
a/examples/puzzletron/README.md b/examples/puzzletron/README.md index 619fb619b..40d6bcfe0 100644 --- a/examples/puzzletron/README.md +++ b/examples/puzzletron/README.md @@ -285,3 +285,10 @@ python -m nemo_export/convert_nemo_to_hf --input-ckpt-path path/to/nemo-model -- ## Advanced Usage Modify `llama-3_1-8B_pruneffn_memory.yaml` file for advanced compression scenarios. + +## GPT-OSS-20B + +In this release, the Puzzle algorithm supports only expert removal for GPT-OSS-20B. The model ships as a quantized checkpoint, i.e. the MoE expert matrices are stored in MXFP4 format. During the pruning steps, Puzzle works on the decompressed model (cast back to BF16) to compute statistics and scores, so the conversion to the Puzzle format decompresses the checkpoint and stores it in BF16. Once pruning is finished, i.e. the experts to remove have been identified, you may want to restore the MXFP4 format of the checkpoint. An additional script takes the original and the pruned checkpoints and produces the pruned checkpoint in MXFP4 format. +```bash +python gpt_oss_pack_mxfp4_vllm.py --student-path /workspaces/any_model_gpt_oss_20b/mip/puzzle_solutions/stats_num_params_18014757184/solutions--checkpoints/solution_0/ --original-path /workspaces/source_model_checkpoints/openai_gpt-oss-20b/ --output-path /workspaces/any_model_gpt_oss_20b/mip/puzzle_solutions/stats_num_params_18014757184/solutions--checkpoints/mxfp4-ckpt/ --deduce-experts --num-layers 24 +``` From 93cee768fd08bd3d91446040351d6ed675142e74 Mon Sep 17 00:00:00 2001 From: mchochowski Date: Thu, 19 Feb 2026 04:20:57 -0800 Subject: [PATCH 3/8] fixes to readme, and config yaml. added copyrights header Signed-off-by: mchochowski --- examples/puzzletron/README.md | 2 +- .../gptoss-20b.yaml | 2 +- .../gpt_oss_20b/gpt_oss_pruned_to_mxfp4.py | 22 +++++++++++++++++-- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/examples/puzzletron/README.md b/examples/puzzletron/README.md index 40d6bcfe0..2919c8f3d 100644 --- a/examples/puzzletron/README.md +++ b/examples/puzzletron/README.md @@ -290,5 +290,5 @@ Modify `llama-3_1-8B_pruneffn_memory.yaml` file for advanced compression scenari In this release, the Puzzle algorithm supports only expert removal for GPT-OSS-20B. The model ships as a quantized checkpoint, i.e. the MoE expert matrices are stored in MXFP4 format. During the pruning steps, Puzzle works on the decompressed model (cast back to BF16) to compute statistics and scores, so the conversion to the Puzzle format decompresses the checkpoint and stores it in BF16. Once pruning is finished, i.e. the experts to remove have been identified, you may want to restore the MXFP4 format of the checkpoint. An additional script takes the original and the pruned checkpoints and produces the pruned checkpoint in MXFP4 format. 
```bash -python gpt_oss_pack_mxfp4_vllm.py --student-path /workspaces/any_model_gpt_oss_20b/mip/puzzle_solutions/stats_num_params_18014757184/solutions--checkpoints/solution_0/ --original-path /workspaces/source_model_checkpoints/openai_gpt-oss-20b/ --output-path /workspaces/any_model_gpt_oss_20b/mip/puzzle_solutions/stats_num_params_18014757184/solutions--checkpoints/mxfp4-ckpt/ --deduce-experts --num-layers 24 +python -m modelopt.torch.puzzletron.anymodel.models.gpt_oss_20b.gpt_oss_pruned_to_mxfp4 --student-path /workspaces/any_model_gpt_oss_20b/mip/puzzle_solutions/stats_num_params_18014757184/solutions--checkpoints/solution_0/ --original-path /workspaces/source_model_checkpoints/openai_gpt-oss-20b/ --output-path /workspaces/any_model_gpt_oss_20b/mip/puzzle_solutions/stats_num_params_18014757184/solutions--checkpoints/mxfp4-ckpt/ --num-layers 24 ``` diff --git a/examples/puzzletron/configs/gptoss-20b_remove_experts_memory/gptoss-20b.yaml b/examples/puzzletron/configs/gptoss-20b_remove_experts_memory/gptoss-20b.yaml index 7de281e78..ded4f6514 100644 --- a/examples/puzzletron/configs/gptoss-20b_remove_experts_memory/gptoss-20b.yaml +++ b/examples/puzzletron/configs/gptoss-20b_remove_experts_memory/gptoss-20b.yaml @@ -7,7 +7,7 @@ defaults: - _self_ puzzle_dir: ??? -descriptor: llama +descriptor: gpt_oss_20b teacher_dir: ${puzzle_dir}/ckpts/teacher/ replacement_library_path: ${puzzle_dir}/replacement_library.json dataset_path: ??? # path to Nemotron-Post-Training-Dataset-v2 diff --git a/modelopt/torch/puzzletron/anymodel/models/gpt_oss_20b/gpt_oss_pruned_to_mxfp4.py b/modelopt/torch/puzzletron/anymodel/models/gpt_oss_20b/gpt_oss_pruned_to_mxfp4.py index 8e993573d..7be72bee2 100644 --- a/modelopt/torch/puzzletron/anymodel/models/gpt_oss_20b/gpt_oss_pruned_to_mxfp4.py +++ b/modelopt/torch/puzzletron/anymodel/models/gpt_oss_20b/gpt_oss_pruned_to_mxfp4.py @@ -1,12 +1,30 @@ #!/usr/bin/env python3 + +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# mypy: ignore-errors +# + """ Create a HuggingFace checkpoint with MXFP4 MoE weights from the original gpt-oss-120b model. This script: 1. Copies non-MoE weights from the student model (trained attention, embeddings, etc.) 2. Extracts MoE expert weights from the original gpt-oss-120b in MXFP4 format -3. Either loads experts_to_keep.json or deduces expert mappings by comparing weights -4. Outputs a new checkpoint in decihf format with PACKED MXFP4 expert weights +3. Deduces expert mappings by comparing weights +4. 
Outputs a new pruned (heterogeneous) checkpoint with PACKED MXFP4 expert weights """ import argparse From ad1067bff1893048f98196d33c15fe1d74b5600a Mon Sep 17 00:00:00 2001 From: J Rausch <38429553+j-rausch@users.noreply.github.com> Date: Tue, 17 Feb 2026 12:30:25 +0100 Subject: [PATCH 4/8] Add support and documentation for AnyModel checkpoints with Nemo evaluator (#894) This PR adds Nemo Evaluator support to the AnyModel branch. It includes documentation and a deployment script that allow for evaluation of AnyModel Puzzletron checkpoints with Nemo Evaluator. We assume development on a GPU node, following the current tutorial style, so we don't rely on Slurm-based deployment/evaluation, but instead use direct evaluation via `eval-factory run_eval`. --------- Signed-off-by: jrausch --- .pre-commit-config.yaml | 1 + examples/puzzletron/README.md | 38 +- .../evaluation/hf_deployable_anymodel.py | 724 ++++++++++++++++++ examples/puzzletron/requirements.txt | 4 +- 4 files changed, 758 insertions(+), 9 deletions(-) create mode 100644 examples/puzzletron/evaluation/hf_deployable_anymodel.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b4d3174d1..6b936106f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -109,6 +109,7 @@ repos: examples/speculative_decoding/main.py| examples/speculative_decoding/medusa_utils.py| examples/speculative_decoding/server_generate.py| + examples/puzzletron/evaluation/hf_deployable_anymodel\.py| modelopt/torch/puzzletron/decilm/deci_lm_hf_code/transformers_.*\.py| )$ diff --git a/examples/puzzletron/README.md b/examples/puzzletron/README.md index 2919c8f3d..3e7c78bc2 100644 --- a/examples/puzzletron/README.md +++ b/examples/puzzletron/README.md @@ -15,11 +15,11 @@ In this example, we compress the [Llama-3.1-8B-Instruct](https://huggingface.co/ ## Environment -- Install Model-Optimizer in editable mode with the corresponding dependencies: +- Install Model-Optimizer in editable mode with the corresponding dependencies (run from the repo root): ```bash pip install -e .[hf,puzzletron] -pip install -r requirements.txt +pip install -r examples/puzzletron/requirements.txt ``` - For this example we are using 2x NVIDIA H100 80GB HBM3 to show multi-GPU steps. You can use also use s single GPU. @@ -229,16 +229,38 @@ The plot shows how token accuracy changes with different compression rates. High ## Evaluation -Once the model is ready, you can evaluate it using [Language Model Evaluation Harness](https://pypi.org/project/lm-eval/). For example, run the following to evaluate the model on [Massive Multitask Language Understanding](https://huggingface.co/datasets/cais/mmlu) benchmark. +Evaluate AnyModel checkpoints by deploying a local OpenAI-compatible completions endpoint and running benchmarks against it. + +**1. 
Deploy the model (2 GPUs example):** + +```bash +# Install the AnyModel-patched deployable (first time only: backs up the original) +# /opt/Export-Deploy is the default path in NeMo containers — adjust if needed +cp /opt/Export-Deploy/nemo_deploy/llm/hf_deployable.py /opt/Export-Deploy/nemo_deploy/llm/hf_deployable.py.bak +cp examples/puzzletron/evaluation/hf_deployable_anymodel.py /opt/Export-Deploy/nemo_deploy/llm/hf_deployable.py + +# Start the server (blocks while running — use a separate terminal) +ray start --head --num-gpus 2 --port 6379 --disable-usage-stats +python /opt/Export-Deploy/scripts/deploy/nlp/deploy_ray_hf.py \ + --model_path path/to/checkpoint \ + --model_id anymodel-hf \ + --num_gpus 2 --num_gpus_per_replica 2 --num_cpus_per_replica 16 \ + --trust_remote_code --port 8083 --device_map "auto" --cuda_visible_devices "0,1" +``` + +**2. Run MMLU:** ```bash -lm_eval --model hf \ - --model_args pretrained=path/to/model,dtype=bfloat16,trust_remote_code=true,parallelize=True \ - --tasks mmlu \ - --num_fewshot 5 \ - --batch_size 4 +eval-factory run_eval \ + --eval_type mmlu \ + --model_id anymodel-hf \ + --model_type completions \ + --model_url http://0.0.0.0:8083/v1/completions/ \ + --output_dir examples/puzzletron/evals/mmlu_anymodel ``` +For a quick debug run, add `--overrides "config.params.limit_samples=5"`. + ## Inference Performance Benchmarking Now let's evaluate how much speedup we get with the compressed model in terms of throughput and latency. diff --git a/examples/puzzletron/evaluation/hf_deployable_anymodel.py b/examples/puzzletron/evaluation/hf_deployable_anymodel.py new file mode 100644 index 000000000..f4fd4e414 --- /dev/null +++ b/examples/puzzletron/evaluation/hf_deployable_anymodel.py @@ -0,0 +1,724 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import logging +from typing import Any + +import numpy as np +import torch +from nemo_deploy import ITritonDeployable +from nemo_deploy.utils import broadcast_list, cast_output, str_ndarray2list +from nemo_export_deploy_common.import_utils import ( + MISSING_TRITON_MSG, + UnavailableError, + null_decorator, +) +from peft import PeftModel +from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer + +try: + from pytriton.decorators import batch + from pytriton.model_config import Tensor + + HAVE_TRITON = True +except (ImportError, ModuleNotFoundError): + from unittest.mock import MagicMock + + HAVE_TRITON = False + batch = MagicMock() + Tensor = MagicMock() + batch = null_decorator + + +LOGGER = logging.getLogger("NeMo") + +SUPPORTED_TASKS = ["text-generation"] + + +class HuggingFaceLLMDeploy(ITritonDeployable): + """A Triton inference server compatible wrapper for HuggingFace models. + + This class provides a standardized interface for deploying HuggingFace models + in Triton inference server. It supports various NLP tasks and handles model + loading, inference, and deployment configurations. 
+ + Args: + hf_model_id_path (Optional[str]): Path to the HuggingFace model or model identifier. + Can be a local path or a model ID from HuggingFace Hub. + hf_peft_model_id_path (Optional[str]): Path to the PEFT model or model identifier. + Can be a local path or a model ID from HuggingFace Hub. + tokenizer_id_path (Optional[str]): Path to the tokenizer or tokenizer identifier. + If None, will use the same path as hf_model_id_path. + model (Optional[AutoModel]): Pre-loaded HuggingFace model. + tokenizer (Optional[AutoTokenizer]): Pre-loaded HuggingFace tokenizer. + tokenizer_padding (bool): Whether to enable padding in tokenizer. Defaults to True. + tokenizer_truncation (bool): Whether to enable truncation in tokenizer. Defaults to True. + tokenizer_padding_side (str): Which side to pad on ('left' or 'right'). Defaults to 'left'. + task (str): HuggingFace task type (e.g., "text-generation"). Defaults to "text-generation". + **hf_kwargs: Additional keyword arguments to pass to HuggingFace model loading. + """ + + def __init__( + self, + hf_model_id_path: str | None = None, + hf_peft_model_id_path: str | None = None, + tokenizer_id_path: str | None = None, + model: AutoModel | None = None, + tokenizer: AutoTokenizer | None = None, + tokenizer_padding=True, + tokenizer_truncation=True, + tokenizer_padding_side="left", + task: str | None = "text-generation", + torch_dtype: torch.dtype | None = "auto", + device_map: str | None = "auto", + **hf_kwargs, + ): + if not HAVE_TRITON: + raise UnavailableError(MISSING_TRITON_MSG) + + if hf_model_id_path is None and model is None: + raise ValueError("hf_model_id_path or model parameters has to be passed.") + elif hf_model_id_path is not None and model is not None: + LOGGER.warning( + "hf_model_id_path will be ignored and the HuggingFace model set with model parameter will be used." + ) + + assert task in SUPPORTED_TASKS, "Task {} is not a support task.".format(task) + + self.hf_model_id_path = hf_model_id_path + self.hf_peft_model_id_path = hf_peft_model_id_path + self.task = task + self.model = model + self.tokenizer = tokenizer + self.tokenizer_padding = tokenizer_padding + self.tokenizer_truncation = tokenizer_truncation + self.tokenizer_padding_side = tokenizer_padding_side + + if tokenizer_id_path is None: + self.tokenizer_id_path = hf_model_id_path + else: + self.tokenizer_id_path = tokenizer_id_path + + if model is None: + self._load(torch_dtype=torch_dtype, device_map=device_map, **hf_kwargs) + + def _load( + self, torch_dtype: torch.dtype | None = "auto", device_map: str | None = "auto", **hf_kwargs + ) -> None: + """Load the HuggingFace pipeline with the specified model and task. + + This method initializes the HuggingFace AutoModel classes using the provided model + configuration and task type. It handles the model and tokenizer loading + process. + + Args: + torch_dtype (torch.dtype): Data type for the model. Defaults to "auto". + device_map (str): Device map for the model. Defaults to "auto". + **hf_kwargs: Additional keyword arguments to pass to the HuggingFace model loading. + + Raises: + AssertionError: If task is not specified. + """ + assert self.task is not None, "A task has to be given for the generation task." + + if self.task == "text-generation": + # ========================================================================= + # BEGIN ANYMODEL PATCH + # Wraps model loading with deci_x_patcher for heterogeneous layer configs. 
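+            # If MODELOPT_WORKDIR (or PUZZLE_WORKDIR) is set, it is prepended to sys.path below so these imports resolve.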
+ # See: modelopt/torch/puzzletron/anymodel/puzzformer/utils.py + # ========================================================================= + import os + import sys + + modelopt_workdir = os.environ.get("MODELOPT_WORKDIR") or os.environ.get( + "PUZZLE_WORKDIR" + ) + if modelopt_workdir and modelopt_workdir not in sys.path: + sys.path.insert(0, modelopt_workdir) + from modelopt.torch.puzzletron.anymodel.models.llama import LlamaModelDescriptor + from modelopt.torch.puzzletron.anymodel.puzzformer import deci_x_patcher + + with deci_x_patcher(model_descriptor=LlamaModelDescriptor): + self.model = AutoModelForCausalLM.from_pretrained( + self.hf_model_id_path, + torch_dtype=torch_dtype, + device_map=device_map, + **hf_kwargs, + ) + # ========================================================================= + # END ANYMODEL PATCH + # ========================================================================= + + if self.hf_peft_model_id_path is not None: + self.model = PeftModel.from_pretrained(self.model, self.hf_peft_model_id_path) + else: + raise ValueError("Task {} is not supported.".format(self.task)) + num_gpus = torch.cuda.device_count() + # If there is only one GPU, move the model to GPU. If you are using device_map as "auto" or "balanced", + # the model will be moved to GPU automatically. + if device_map is None and num_gpus >= 1 and self.model.device.type != "cuda": + self.model.cuda() + self.tokenizer = AutoTokenizer.from_pretrained( + self.tokenizer_id_path, + trust_remote_code=hf_kwargs.pop("trust_remote_code", False), + padding=self.tokenizer_padding, + truncation=self.tokenizer_truncation, + padding_side=self.tokenizer_padding_side, + ) + + if self.tokenizer.pad_token is None: + self.tokenizer.pad_token = self.tokenizer.eos_token + + def generate( + self, + **kwargs: Any, + ) -> list[str]: + """Generate text based on the provided input prompts. + + This method processes input prompts through the loaded pipeline and + generates text according to the specified parameters. + + Args: + **kwargs: Generation parameters including: + - text_inputs: List of input prompts + - max_length: Maximum number of tokens to generate + - num_return_sequences: Number of sequences to generate per prompt + - temperature: Sampling temperature + - top_k: Number of highest probability tokens to consider + - top_p: Cumulative probability threshold for token sampling + - do_sample: Whether to use sampling, default is False for greedy decoding + - echo: Whether to return prompt + generated text (True) or just generated text (False) + - return_full_text: Whether to return full text or only generated part + + Returns: + If output logits and output scores are False: + List[str]: A list of generated texts, one for each input prompt. + If output logits and output scores are True: + Dict: A dictionary containing: + - sentences: List of generated texts + - logits: List of logits + - scores: List of scores + - input_lengths: List of input token lengths (for echo processing) + + Raises: + RuntimeError: If the pipeline is not initialized. 
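+        Example (illustrative; `deployable` is an instance of this class):
+            >>> texts = deployable.generate(text_inputs=["Hello"], max_new_tokens=16, do_sample=False)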
+ """ + if not self.model: + raise RuntimeError("Model is not initialized") + + inputs = self.tokenizer( + kwargs["text_inputs"], + return_tensors="pt", + padding=self.tokenizer_padding, + truncation=self.tokenizer_truncation, + ) + + # Store input lengths to extract only generated tokens later + input_lengths = [len(input_ids) for input_ids in inputs["input_ids"]] + + # Get echo parameter (default False - only return generated text) + echo = kwargs.pop("echo", False) + kwargs.pop("text_inputs") # Remove text_inputs as it's already been tokenized + + kwargs = {**inputs, **kwargs} + for key, val in kwargs.items(): + if torch.is_tensor(val): + kwargs[key] = val.cuda() + + with torch.no_grad(): + generated_ids = self.model.generate(**kwargs) + return_dict_in_generate = kwargs.get("return_dict_in_generate", False) + if return_dict_in_generate: + # Handle dict output (when logits/scores are requested) + sequences = generated_ids["sequences"] + output = {"sentences": [], "input_lengths": input_lengths, "sequences": sequences} + + if echo: + # Return full text (prompt + generated). + # HF model's generate returns the input/prompt tokens as well by default. + for i, seq in enumerate(sequences): + full_text = self.tokenizer.decode(seq, skip_special_tokens=True) + output["sentences"].append(full_text) + else: + # Extract only the generated tokens (skip input tokens). + # This is required as HF model's generate returns the input/prompt tokens + # as well by default. (return_full_text is specific to some models) + for i, seq in enumerate(sequences): + input_len = input_lengths[i] if i < len(input_lengths) else 0 + generated_tokens = seq[input_len:] # Skip input tokens + generated_text = self.tokenizer.decode( + generated_tokens, skip_special_tokens=True + ) + output["sentences"].append(generated_text) + + if kwargs.get("output_logits", False): + output["logits"] = generated_ids["logits"] + if kwargs.get("output_scores", False): + output["scores"] = generated_ids["scores"] + else: + # Handle list output (normal case) + output = [] + if echo: + # Return full text (prompt + generated), which is the default in case of HF model generate. + for i, seq in enumerate(generated_ids): + full_text = self.tokenizer.decode(seq, skip_special_tokens=True) + output.append(full_text) + else: + # Extract only the generated tokens (skip input tokens) as the default + # behavior returns the input/prompt tokens as well. 
+ for i, seq in enumerate(generated_ids): + input_len = input_lengths[i] if i < len(input_lengths) else 0 + generated_tokens = seq[input_len:] # Skip input tokens + generated_text = self.tokenizer.decode( + generated_tokens, skip_special_tokens=True + ) + output.append(generated_text) + + return output + + def generate_other_ranks(self): + """Generate function for ranks other than the rank 0.""" + while True: + message = torch.empty(1, dtype=torch.long, device="cuda") + torch.distributed.broadcast(message, src=0) + if message == 0: + prompts = broadcast_list(data=[None], src=0) + ( + temperature, + top_k, + top_p, + num_tokens_to_generate, + output_logits, + output_scores, + ) = broadcast_list(data=[None], src=0) + + return_dict_in_generate = False + if output_logits or output_scores: + return_dict_in_generate = True + + self.generate( + text_inputs=prompts, + do_sample=False, # do_sample=False for greedy decoding + top_k=top_k, + top_p=top_p, + temperature=temperature, + max_new_tokens=num_tokens_to_generate, + output_logits=output_logits, + output_scores=output_scores, + return_dict_in_generate=return_dict_in_generate, + ) + else: + return + + @property + def get_triton_input(self): + inputs = ( + Tensor(name="prompts", shape=(-1,), dtype=bytes), + Tensor(name="max_length", shape=(-1,), dtype=np.int_, optional=True), + Tensor(name="max_batch_size", shape=(-1,), dtype=np.int_, optional=True), + Tensor(name="top_k", shape=(-1,), dtype=np.int_, optional=True), + Tensor(name="top_p", shape=(-1,), dtype=np.single, optional=True), + Tensor(name="temperature", shape=(-1,), dtype=np.single, optional=True), + Tensor(name="random_seed", shape=(-1,), dtype=np.int_, optional=True), + Tensor(name="max_length", shape=(-1,), dtype=np.int_, optional=True), + Tensor(name="output_logits", shape=(-1,), dtype=np.bool_, optional=True), + Tensor(name="output_scores", shape=(-1,), dtype=np.bool_, optional=True), + ) + return inputs + + @property + def get_triton_output(self): + return ( + Tensor(name="sentences", shape=(-1,), dtype=bytes), + Tensor(name="logits", shape=(-1,), dtype=np.single), + Tensor(name="scores", shape=(-1,), dtype=np.single), + ) + + @batch + def triton_infer_fn(self, **inputs: np.ndarray): + output_infer = {} + + try: + prompts = str_ndarray2list(inputs.pop("prompts")) + temperature = inputs.pop("temperature")[0][0] if "temperature" in inputs else 1.0 + top_k = int(inputs.pop("top_k")[0][0] if "top_k" in inputs else 1) + top_p = inputs.pop("top_p")[0][0] if "top_p" in inputs else 0 + num_tokens_to_generate = ( + inputs.pop("max_length")[0][0] if "max_length" in inputs else 256 + ) + output_logits = ( + inputs.pop("output_logits")[0][0] if "output_logits" in inputs else False + ) + output_scores = ( + inputs.pop("output_scores")[0][0] if "output_scores" in inputs else False + ) + return_dict_in_generate = False + if output_logits or output_scores: + return_dict_in_generate = True + + if torch.distributed.is_initialized(): + if torch.distributed.get_world_size() > 1: + torch.distributed.broadcast( + torch.tensor([0], dtype=torch.long, device="cuda"), src=0 + ) + broadcast_list(prompts, src=0) + broadcast_list( + data=[ + temperature, + top_k, + top_p, + num_tokens_to_generate, + output_logits, + output_scores, + ], + src=0, + ) + + output = self.generate( + text_inputs=prompts, + do_sample=False, # do_sample=False for greedy decoding + top_k=top_k, + top_p=top_p, + temperature=temperature, + max_new_tokens=num_tokens_to_generate, + output_logits=output_logits, + 
output_scores=output_scores, + return_dict_in_generate=return_dict_in_generate, + echo=False, + ) + + if isinstance(output, dict): + output_infer = {"sentences": cast_output(output["sentences"], np.bytes_)} + + if "scores" in output: + output_scores = [] + for r in output["scores"]: + lp = torch.tensor(r).cpu().detach().numpy() + if len(lp) == 0: + output_scores.append([0]) + else: + output_scores.append(lp) + output_infer["scores"] = np.array(output_scores).transpose(1, 0, 2) + + if "logits" in output: + output_logits = [] + for r in output["logits"]: + lp = torch.tensor(r).cpu().detach().numpy() + if len(lp) == 0: + output_logits.append([0]) + else: + output_logits.append(lp) + output_infer["logits"] = np.array(output_logits).transpose(1, 0, 2) + else: + output_infer = {"sentences": cast_output(output, np.bytes_)} + + except Exception as error: + err_msg = "An error occurred: {}".format(str(error)) + output_infer["sentences"] = cast_output([err_msg], np.bytes_) + + return output_infer + + def _compute_logprobs( + self, + prompts: list[str], + output_infer: dict[str, Any], + compute_logprob: bool, + n_top_logprobs: int, + echo: bool, + ): + """Compute log probabilities and top log probabilities from model scores. + Used by ray_infer_fn to provide OAI API compatible output for evaluations. + + This method processes the raw scores from model generation to compute: + - Log probabilities for chosen tokens + - Top-k log probabilities for each position (if requested) + - Handles both prompt tokens (when echo=True) and generated tokens + + Args: + prompts: List of input prompts + output_infer: Dictionary containing model outputs including scores, sequences, and input_lengths + compute_logprob: Whether to compute log probabilities + n_top_logprobs: Number of top log probabilities to return (0 to disable) + echo: Whether to include prompt token log probabilities + + Returns: + Tuple[Optional[List], Optional[List]]: + - log_probs_list: List of log probabilities for each sample (None if not computed) + - top_logprobs_list: List of top-k log probabilities for each sample (None if not computed) + """ + # Tokenize the prompts to get prompt token IDs (needed for echo) + prompt_token_ids = None + prompt_inputs = None + if echo: + prompt_inputs = self.tokenizer( + prompts, + return_tensors="pt", + padding=self.tokenizer_padding, + truncation=self.tokenizer_truncation, + ) + prompt_token_ids = prompt_inputs["input_ids"] + # Move to same device as model + for key, val in prompt_inputs.items(): + if torch.is_tensor(val): + prompt_inputs[key] = val.cuda() + + # Process each sample + log_probs_list = [] + top_logprobs_list = [] + + for sample_idx in range(len(prompts)): + sample_log_probs = [] + sample_top_logprobs = [] + + # Get the generated sequence for this sample + sequences = output_infer["sequences"][sample_idx] + + # For echo, compute prompt token logprobs by running forward pass + if echo and prompt_token_ids is not None: + prompt_len = len(prompt_token_ids[sample_idx]) + + # Run forward pass on prompt to get logits for prompt tokens as scores in output_infer contains + # logits only for generated tokens. 
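+                # Note: this prompt forward pass runs once per sample (the batch is
+                # sliced below), so requesting logprobs with echo=True costs one extra
+                # prompt-length forward pass for every prompt in the request.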
+ with torch.no_grad(): + # Create input for this specific sample + sample_prompt_input = { + key: val[sample_idx : sample_idx + 1] for key, val in prompt_inputs.items() + } + prompt_outputs = self.model(**sample_prompt_input) + prompt_logits = prompt_outputs.logits[0] # Shape: [seq_len, vocab_size] + + # Calculate log probs for each prompt token (except the first BOS token) + for token_pos in range(1, prompt_len): # Start from 1 to skip BOS + # The logit at position i-1 predicts token at position i + logit_for_current_token = prompt_logits[token_pos - 1] + current_token_id = prompt_token_ids[sample_idx][token_pos].item() + + # Calculate log probabilities + log_probs = torch.nn.functional.log_softmax(logit_for_current_token, dim=-1) + chosen_log_prob = log_probs[current_token_id].item() + sample_log_probs.append(chosen_log_prob) + + # Calculate top log probabilities if requested + if n_top_logprobs > 0: + top_log_probs_dict = {} + top_k_values, top_k_indices = torch.topk( + log_probs, min(n_top_logprobs, len(log_probs)) + ) + for k_idx in range(len(top_k_indices)): + token_id = top_k_indices[k_idx].item() + token_str = self.tokenizer.decode([token_id]) + top_log_probs_dict[token_str] = top_k_values[k_idx].item() + sample_top_logprobs.append(top_log_probs_dict) + + # Process the scores for generated tokens + for token_idx, score_tensor in enumerate(output_infer["scores"]): + # Get the chosen token ID from the sequence + # Scores start after the prompt, so we need to offset + input_len = ( + output_infer.get("input_lengths", [0])[sample_idx] + if "input_lengths" in output_infer + else 0 + ) + seq_idx = input_len + token_idx + + if seq_idx < len(sequences): + chosen_token_id = ( + sequences[seq_idx].item() + if hasattr(sequences[seq_idx], "item") + else sequences[seq_idx] + ) + + # Calculate log probabilities + log_probs = torch.nn.functional.log_softmax(score_tensor[sample_idx], dim=-1) + chosen_log_prob = log_probs[chosen_token_id].item() + sample_log_probs.append(chosen_log_prob) + + # Calculate top log probabilities if requested + if n_top_logprobs > 0: + top_log_probs_dict = {} + top_k_values, top_k_indices = torch.topk( + log_probs, min(n_top_logprobs, len(log_probs)) + ) + for k_idx in range(len(top_k_indices)): + token_id = top_k_indices[k_idx].item() + token_str = self.tokenizer.decode([token_id]) + top_log_probs_dict[token_str] = top_k_values[k_idx].item() + sample_top_logprobs.append(top_log_probs_dict) + + log_probs_list.append(sample_log_probs) + if n_top_logprobs > 0: + top_logprobs_list.append(sample_top_logprobs) + + # Return log probs and top logprobs + return_log_probs = log_probs_list if compute_logprob else None + return_top_logprobs = top_logprobs_list if n_top_logprobs > 0 else None + + return return_log_probs, return_top_logprobs + + def ray_infer_fn(self, inputs: dict[Any, Any]): + """Perform inference using Ray with dictionary inputs and outputs. 
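+
+        This is the dict-in/dict-out entry point intended for the Ray-based deployment
+        flow; unlike ``triton_infer_fn``, which receives numpy arrays, all inputs and
+        outputs here are plain Python objects.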
+ + Args: + inputs (Dict[Any, Any]): Dictionary containing input parameters: + - prompts: List of input prompts + - temperature: Sampling temperature (optional) + - top_k: Number of highest probability tokens to consider (optional) + - top_p: Cumulative probability threshold for token sampling (optional) + - max_tokens: Maximum number of tokens to generate (optional) + - compute_logprob: Whether to compute log probabilities (optional) + - n_top_logprobs: Number of top log probabilities to return (optional) + - echo: Whether to echo the prompt in output (optional) + + Returns: + Dict[str, Any]: Dictionary containing: + - sentences: List of generated texts + - log_probs: Optional list of log probabilities if compute_logprob is True + - top_logprobs: Optional list of top log probabilities if n_top_logprobs > 0 + """ + import json + + try: + prompts = inputs.pop("prompts") + temperature = inputs.pop("temperature", 1.0) + top_k = int(inputs.pop("top_k", 1)) + top_p = inputs.pop("top_p", 0.0) + num_tokens_to_generate = inputs.pop("max_tokens", 256) + output_logits = inputs.pop("output_logits", False) + output_scores = inputs.pop("output_scores", False) + compute_logprob = inputs.pop("compute_logprob", False) + n_top_logprobs = inputs.pop("n_top_logprobs", 0) + echo = inputs.pop("echo", False) + + output_infer = self._infer_fn_ray( + prompts=prompts, + temperature=temperature, + top_k=top_k, + top_p=top_p, + num_tokens_to_generate=num_tokens_to_generate, + output_logits=output_logits, + output_scores=output_scores, + compute_logprob=compute_logprob, + n_top_logprobs=n_top_logprobs, + echo=echo, + ) + # Code to get logprobs (required in OAI API format for eval) from the scores in output_infer. + if ( + (compute_logprob or n_top_logprobs > 0) + and "scores" in output_infer + and output_infer["scores"] + ): + log_probs_list, top_logprobs_list = self._compute_logprobs( + prompts=prompts, + output_infer=output_infer, + compute_logprob=compute_logprob, + n_top_logprobs=n_top_logprobs, + echo=echo, + ) + + # Add to output + if log_probs_list is not None: + output_infer["log_probs"] = log_probs_list + if top_logprobs_list is not None: + # Convert to JSON strings for compatibility + output_infer["top_logprobs"] = [ + json.dumps(top_logprobs) for top_logprobs in top_logprobs_list + ] + + # Remove raw outputs that are not needed in the final response + output_infer.pop("scores", None) + output_infer.pop("sequences", None) + output_infer.pop("input_lengths", None) + return output_infer + except Exception as error: + err_msg = "An error occurred: {}".format(str(error)) + return {"sentences": [err_msg]} + + def _infer_fn_ray( + self, + prompts, + temperature=1.0, + top_k=1, + top_p=0.0, + num_tokens_to_generate=256, + output_logits=False, + output_scores=False, + compute_logprob=False, + n_top_logprobs=0, + echo=False, + cast_output_func=None, + ): + """Common internal function for inference operations. 
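+
+        Called from ``ray_infer_fn``: when torch.distributed is initialized with
+        world_size > 1 it broadcasts the generation parameters to the other ranks and
+        then delegates to ``generate``.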
+ + Args: + prompts: List of input prompts + temperature: Sampling temperature + top_k: Number of highest probability tokens to consider + top_p: Cumulative probability threshold for token sampling + num_tokens_to_generate: Maximum number of tokens to generate + output_logits: Whether to output logits + output_scores: Whether to output scores + compute_logprob: Whether to compute log probabilities + n_top_logprobs: Number of top log probabilities to return + echo: Whether to echo the prompt in output + cast_output_func: Optional function to cast output values + + Returns: + Dict containing inference results with raw outputs + """ + # Enable return_dict if we need scores for logprobs or if output_logits/scores are requested + return_dict_in_generate = ( + output_logits or output_scores or compute_logprob or n_top_logprobs > 0 + ) + # Enable output_scores if we need to compute logprobs. scores and logits from generate are both identical in + # case of greedy decoding. Hence setting output_scores to True when compute_logprob or n_top_logprobs > 0. + if compute_logprob or n_top_logprobs > 0: + output_scores = True + + if torch.distributed.is_initialized(): + if torch.distributed.get_world_size() > 1: + torch.distributed.broadcast( + torch.tensor([0], dtype=torch.long, device="cuda"), src=0 + ) + broadcast_list(prompts, src=0) + broadcast_list( + data=[ + temperature, + top_k, + top_p, + num_tokens_to_generate, + output_logits, + output_scores, + ], + src=0, + ) + + output = self.generate( + text_inputs=prompts, + do_sample=False, # do_sample=False for greedy decoding + top_k=top_k, + top_p=top_p, + temperature=temperature, + max_new_tokens=num_tokens_to_generate, + output_logits=output_logits, + output_scores=output_scores, + return_dict_in_generate=return_dict_in_generate, + echo=echo, + ) + + if isinstance(output, dict): + return output + + else: + return {"sentences": output} diff --git a/examples/puzzletron/requirements.txt b/examples/puzzletron/requirements.txt index fe63c413b..0511fb473 100644 --- a/examples/puzzletron/requirements.txt +++ b/examples/puzzletron/requirements.txt @@ -1 +1,3 @@ -lm-eval==0.4.9 +lm-eval==0.4.10 +math-verify +ray From 23d9605e146ef09cfc09d1e5f31cc8f15a5d8acc Mon Sep 17 00:00:00 2001 From: J Rausch <38429553+j-rausch@users.noreply.github.com> Date: Tue, 17 Feb 2026 14:25:52 +0100 Subject: [PATCH 5/8] Add support for Puzzletron AnyModel evaluation via lm-eval (#898) ## What does this PR do? **Overview:** - Update the AnyModel Puzzletron tutorial to use lm-eval. 
We add a script that monkey patches lm-eval to use the patched AnyModel model loading - No need for running ray deployments or replacing the NeMo Export-Deploy deployment script with a patched version - Moved instructions for using NeMo Evaluator to an alternative readme file --------- Signed-off-by: jrausch --- examples/puzzletron/README.md | 39 +++----- .../puzzletron/evaluation/lm_eval_anymodel.py | 92 +++++++++++++++++++ .../evaluation/nemo_evaluator_instructions.md | 42 +++++++++ examples/puzzletron/requirements.txt | 1 - 4 files changed, 147 insertions(+), 27 deletions(-) create mode 100644 examples/puzzletron/evaluation/lm_eval_anymodel.py create mode 100644 examples/puzzletron/evaluation/nemo_evaluator_instructions.md diff --git a/examples/puzzletron/README.md b/examples/puzzletron/README.md index 3e7c78bc2..31f9926bf 100644 --- a/examples/puzzletron/README.md +++ b/examples/puzzletron/README.md @@ -22,6 +22,9 @@ pip install -e .[hf,puzzletron] pip install -r examples/puzzletron/requirements.txt ``` +> **Note:** NeMo containers may ship `nvidia-lm-eval` which may conflict with `lm-eval` that is used for evaluation. +> If so, run `pip uninstall nvidia-lm-eval -y` before installing requirements. + - For this example we are using 2x NVIDIA H100 80GB HBM3 to show multi-GPU steps. You can use also use s single GPU. - To make use of [Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) and [Nemotron-Post-Training-Dataset-v2](https://huggingface.co/datasets/nvidia/Nemotron-Post-Training-Dataset-v2), you need to accept the terms and conditions for the corresponding model and the dataset in the Huggingface Hub. Log in to the Huggingface Hub and enter your HF token. @@ -229,37 +232,21 @@ The plot shows how token accuracy changes with different compression rates. High ## Evaluation -Evaluate AnyModel checkpoints by deploying a local OpenAI-compatible completions endpoint and running benchmarks against it. - -**1. Deploy the model (2 GPUs example):** +Evaluate AnyModel checkpoints using [lm-eval](https://github.com/EleutherAI/lm-evaluation-harness) directly. ```bash -# Install the AnyModel-patched deployable (first time only: backs up the original) -# /opt/Export-Deploy is the default path in NeMo containers — adjust if needed -cp /opt/Export-Deploy/nemo_deploy/llm/hf_deployable.py /opt/Export-Deploy/nemo_deploy/llm/hf_deployable.py.bak -cp examples/puzzletron/evaluation/hf_deployable_anymodel.py /opt/Export-Deploy/nemo_deploy/llm/hf_deployable.py - -# Start the server (blocks while running — use a separate terminal) -ray start --head --num-gpus 2 --port 6379 --disable-usage-stats -python /opt/Export-Deploy/scripts/deploy/nlp/deploy_ray_hf.py \ - --model_path path/to/checkpoint \ - --model_id anymodel-hf \ - --num_gpus 2 --num_gpus_per_replica 2 --num_cpus_per_replica 16 \ - --trust_remote_code --port 8083 --device_map "auto" --cuda_visible_devices "0,1" +python examples/puzzletron/evaluation/lm_eval_anymodel.py \ + --model hf \ + --model_args pretrained=path/to/checkpoint,dtype=bfloat16,parallelize=True \ + --tasks mmlu \ + --num_fewshot 5 \ + --batch_size 4 ``` -**2. Run MMLU:** - -```bash -eval-factory run_eval \ - --eval_type mmlu \ - --model_id anymodel-hf \ - --model_type completions \ - --model_url http://0.0.0.0:8083/v1/completions/ \ - --output_dir examples/puzzletron/evals/mmlu_anymodel -``` +For a quick smoke test, add `--limit 10`. -For a quick debug run, add `--overrides "config.params.limit_samples=5"`. 
+> **Alternative:** For server-based evaluation via an OpenAI-compatible endpoint, +> see [evaluation/nemo_evaluator_instructions.md](./evaluation/nemo_evaluator_instructions.md). ## Inference Performance Benchmarking diff --git a/examples/puzzletron/evaluation/lm_eval_anymodel.py b/examples/puzzletron/evaluation/lm_eval_anymodel.py new file mode 100644 index 000000000..38d3ea35d --- /dev/null +++ b/examples/puzzletron/evaluation/lm_eval_anymodel.py @@ -0,0 +1,92 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Run lm-eval directly on AnyModel (Puzzletron) checkpoints without a deployment server. + +Patches lm-eval's HFLM to wrap model loading with deci_x_patcher so AnyModel +Puzzletron checkpoints load correctly. Model descriptor is auto-detected from the +checkpoint's config.json model_type. +""" + +from lm_eval.__main__ import cli_evaluate +from lm_eval.api.model import T +from lm_eval.models.huggingface import HFLM +from transformers import AutoConfig + +# Trigger factory registration for all model descriptors +import modelopt.torch.puzzletron.anymodel.models # noqa: F401 +from modelopt.torch.puzzletron.anymodel.model_descriptor import ModelDescriptorFactory +from modelopt.torch.puzzletron.anymodel.puzzformer import deci_x_patcher + +# Map from HuggingFace config.model_type (in checkpoint config.json) to ModelDescriptorFactory name. +# Local to this script; add entries when supporting new model types for auto-detection. +_MODEL_TYPE_TO_DESCRIPTOR = { + "llama": "llama", + "mistral": "mistral_small", + "qwen2": "qwen2", + "qwen3": "qwen3", + "nemotron_h": "nemotron_h", + "nemotron_h_v2": "nemotron_h_v2", + "gpt_oss_20b": "gpt_oss_20b", +} + + +def _resolve_descriptor_from_pretrained(pretrained: str | None): + """Resolve the model descriptor by loading the checkpoint config and mapping model_type.""" + if not pretrained: + raise ValueError( + "pretrained must be set in --model_args " + "(e.g. --model_args pretrained=/path/to/checkpoint,dtype=bfloat16)." + ) + + config = AutoConfig.from_pretrained(pretrained, trust_remote_code=True) + model_type = getattr(config, "model_type", None) + + if model_type and model_type in _MODEL_TYPE_TO_DESCRIPTOR: + detected = _MODEL_TYPE_TO_DESCRIPTOR[model_type] + print( + f"[lm_eval_anymodel] Auto-detected model_type='{model_type}' → descriptor='{detected}'" + ) + return ModelDescriptorFactory.get(detected) + + known = sorted(_MODEL_TYPE_TO_DESCRIPTOR.keys()) + raise ValueError( + f"Cannot auto-detect descriptor for model_type='{model_type}'. " + f"Known model types: {known}. Add this model_type to _MODEL_TYPE_TO_DESCRIPTOR if supported." 
+ ) + + +def create_from_arg_obj(cls: type[T], arg_dict: dict, additional_config: dict | None = None) -> T: + """Override HFLM.create_from_arg_obj to wrap model loading with deci_x_patcher.""" + pretrained = arg_dict.get("pretrained") + descriptor = _resolve_descriptor_from_pretrained(pretrained) + + additional_config = {} if additional_config is None else additional_config + additional_config = {k: v for k, v in additional_config.items() if v is not None} + + # The patcher must be active during HFLM.__init__ because that's where + # AutoModelForCausalLM.from_pretrained() is called internally. + with deci_x_patcher(model_descriptor=descriptor): + model_obj = cls(**arg_dict, **additional_config) + + return model_obj + + +# Monkey-patch HFLM so lm-eval uses our patched model loading +HFLM.create_from_arg_obj = classmethod(create_from_arg_obj) + + +if __name__ == "__main__": + cli_evaluate() diff --git a/examples/puzzletron/evaluation/nemo_evaluator_instructions.md b/examples/puzzletron/evaluation/nemo_evaluator_instructions.md new file mode 100644 index 000000000..b8c97af5d --- /dev/null +++ b/examples/puzzletron/evaluation/nemo_evaluator_instructions.md @@ -0,0 +1,42 @@ +# Evaluation with NeMo Evaluator (Alternative) + +> **Recommended approach:** Use lm-eval for direct evaluation without a +> deployment server. See the main [README](../README.md#evaluation) for details. + +Evaluate AnyModel checkpoints by deploying a local OpenAI-compatible completions endpoint and running benchmarks against it. + +This flow requires Ray for serving the model and NeMo Export-Deploy (included in NeMo containers): + +```bash +pip install ray +``` + +**1. Deploy the model (2 GPUs example):** + +```bash +# Install the AnyModel-patched deployable (first time only: backs up the original) +# /opt/Export-Deploy is the default path in NeMo containers — adjust if needed +cp /opt/Export-Deploy/nemo_deploy/llm/hf_deployable.py /opt/Export-Deploy/nemo_deploy/llm/hf_deployable.py.bak +cp examples/puzzletron/evaluation/hf_deployable_anymodel.py /opt/Export-Deploy/nemo_deploy/llm/hf_deployable.py + +# Start the server (blocks while running — use a separate terminal) +ray start --head --num-gpus 2 --port 6379 --disable-usage-stats +python /opt/Export-Deploy/scripts/deploy/nlp/deploy_ray_hf.py \ + --model_path path/to/checkpoint \ + --model_id anymodel-hf \ + --num_gpus 2 --num_gpus_per_replica 2 --num_cpus_per_replica 16 \ + --trust_remote_code --port 8083 --device_map "auto" --cuda_visible_devices "0,1" +``` + +**2. Run MMLU:** + +```bash +eval-factory run_eval \ + --eval_type mmlu \ + --model_id anymodel-hf \ + --model_type completions \ + --model_url http://0.0.0.0:8083/v1/completions/ \ + --output_dir examples/puzzletron/evals/mmlu_anymodel +``` + +For a quick debug run, add `--overrides "config.params.limit_samples=5"`. diff --git a/examples/puzzletron/requirements.txt b/examples/puzzletron/requirements.txt index 0511fb473..db6894d63 100644 --- a/examples/puzzletron/requirements.txt +++ b/examples/puzzletron/requirements.txt @@ -1,3 +1,2 @@ lm-eval==0.4.10 math-verify -ray From 8d344ecff93b08a40ca2a231b814acdeca5641d8 Mon Sep 17 00:00:00 2001 From: J Rausch <38429553+j-rausch@users.noreply.github.com> Date: Tue, 17 Feb 2026 15:48:32 +0100 Subject: [PATCH 6/8] Update license of lm_eval_anymodel.py (#899) ## What does this PR do? **Overview:** Updated license of examples/puzzletron/evaluation/lm_eval_anymodel.py to match that of reference examples/llm_eval/lm_eval_hf.py. 
Signed-off-by: jrausch --- .../puzzletron/evaluation/lm_eval_anymodel.py | 27 ++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/examples/puzzletron/evaluation/lm_eval_anymodel.py b/examples/puzzletron/evaluation/lm_eval_anymodel.py index 38d3ea35d..6d6fcd44e 100644 --- a/examples/puzzletron/evaluation/lm_eval_anymodel.py +++ b/examples/puzzletron/evaluation/lm_eval_anymodel.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +13,31 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Adapted from https://github.com/EleutherAI/lm-evaluation-harness/tree/aa457edc3d64d81530159cd3a182932320c78f8c + +# MIT License +# +# Copyright (c) 2020 EleutherAI +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + """Run lm-eval directly on AnyModel (Puzzletron) checkpoints without a deployment server. Patches lm-eval's HFLM to wrap model loading with deci_x_patcher so AnyModel From 296722a16057a7bf727062e762847cade94e1829 Mon Sep 17 00:00:00 2001 From: mchochowski Date: Fri, 20 Feb 2026 06:26:04 -0800 Subject: [PATCH 7/8] fix gptosss hook Signed-off-by: mchochowski --- examples/puzzletron/GPTOSS.md | 8 ++++++++ examples/puzzletron/README.md | 7 ------- .../pruning/ffn_pruning.yaml | 2 +- 3 files changed, 9 insertions(+), 8 deletions(-) create mode 100644 examples/puzzletron/GPTOSS.md diff --git a/examples/puzzletron/GPTOSS.md b/examples/puzzletron/GPTOSS.md new file mode 100644 index 000000000..953320b26 --- /dev/null +++ b/examples/puzzletron/GPTOSS.md @@ -0,0 +1,8 @@ + +## GptOss - 20b + + +With this release Puzzle algorithm supports only experts removal for Gpt-Oss-20b. This model comes as a quantized checkpoint i.e. MoE experts matrices are quantized with mxfp4 format. In the prunning steps puzzle utilizes decompressed model (back to bf16) for statistics and scores computation. This means, during the conversion to puzzle format we decompress the model and store it as a bf16. Once the pruning is done i.e. experts to be removed are identified and the process is finished, user may want to get back the mxfp4 format of the checkpoint. 
To do so, use the additional conversion script below; it takes the original and the pruned checkpoints and writes the pruned checkpoint back in mxfp4 format.
+```bash
+python -m modelopt.torch.puzzletron.anymodel.models.gpt_oss_20b.gpt_oss_pruned_to_mxfp4 --student-path /workspaces/any_model_gpt_oss_20b/mip/puzzle_solutions/stats_num_params_18014757184/solutions--checkpoints/solution_0/ --original-path /workspaces/source_model_checkpoints/openai_gpt-oss-20b/ --output-path /workspaces/any_model_gpt_oss_20b/mip/puzzle_solutions/stats_num_params_18014757184/solutions--checkpoints/mxfp4-ckpt/ --num-layers 24
+```
diff --git a/examples/puzzletron/README.md b/examples/puzzletron/README.md
index 31f9926bf..48f64d3c4 100644
--- a/examples/puzzletron/README.md
+++ b/examples/puzzletron/README.md
@@ -294,10 +294,3 @@ python -m nemo_export/convert_nemo_to_hf --input-ckpt-path path/to/nemo-model --
 
 ## Advanced Usage
 
 Modify `llama-3_1-8B_pruneffn_memory.yaml` file for advanced compression scenarios.
-
-## GptOss - 20b
-
-With this release Puzzle algorithm supports only experts removal for Gpt-Oss-20b. This model comes as a quantized checkpoint i.e. MoE experts matrices are quantized with mxfp4 format. In the prunning steps puzzle utilizes decompressed model (back to bf16) for statistics and scores computation. This means, during the conversion to puzzle format we decompress the model and store it as a bf16. Once the pruning is done i.e. experts to be removed are identified and the process is finished, user may want to get back the mxfp4 format of the checkpoint. To do so, there is an additional script, that takes the original and the pruned checkpoint and outputs pruned checkpoint in mxfp4 format.
-```bash
-python -m modelopt.torch.puzzletron.anymodel.models.gpt_oss_20b.gpt_oss_pruned_to_mxfp4 --student-path /workspaces/any_model_gpt_oss_20b/mip/puzzle_solutions/stats_num_params_18014757184/solutions--checkpoints/solution_0/ --original-path /workspaces/source_model_checkpoints/openai_gpt-oss-20b/ --output-path /workspaces/any_model_gpt_oss_20b/mip/puzzle_solutions/stats_num_params_18014757184/solutions--checkpoints/mxfp4-ckpt/ --num-layers 24
-```
diff --git a/examples/puzzletron/configs/gptoss-20b_remove_experts_memory/pruning/ffn_pruning.yaml b/examples/puzzletron/configs/gptoss-20b_remove_experts_memory/pruning/ffn_pruning.yaml
index e9e15db32..00d7829e0 100644
--- a/examples/puzzletron/configs/gptoss-20b_remove_experts_memory/pruning/ffn_pruning.yaml
+++ b/examples/puzzletron/configs/gptoss-20b_remove_experts_memory/pruning/ffn_pruning.yaml
@@ -10,7 +10,7 @@ pruning_mixin:
   _target_: modelopt.torch.puzzletron.anymodel.models.gpt_oss_20b.gpt_oss_20b_model_descriptor.GptOss20bExpertRemovalLayerDescriptor
   target_name: "mlp.router"
 
-hook_class: ${get_object:utils.activation_hooks.hooks.RankedChoiceVotingHook}
+hook_class: ${get_object:modelopt.torch.nas.plugins.megatron_hooks.base_hooks.RankedChoiceVotingHook}
 
 activation_hooks_kwargs: # Additional kwargs to pass to the hook init
   num_experts_to_keep_list: [24, 16, 8] # num_experts in teacher is 128
From 12d46e70f21f339fb6984b6c5655d730c95236b2 Mon Sep 17 00:00:00 2001
From: mchochowski
Date: Sat, 21 Feb 2026 08:27:09 -0800
Subject: [PATCH 8/8] fix hooks, descriptor and converter to support gptoss.
fix typo in yaml config Signed-off-by: mchochowski --- .../pruning/pruning_defaults.yaml | 2 +- .../nas/plugins/megatron_hooks/base_hooks.py | 2 +- .../anymodel/converter/converter.py | 23 +++++++++++++++++++ .../gpt_oss_20b_model_descriptor.py | 2 ++ 4 files changed, 27 insertions(+), 2 deletions(-) diff --git a/examples/puzzletron/configs/gptoss-20b_remove_experts_memory/pruning/pruning_defaults.yaml b/examples/puzzletron/configs/gptoss-20b_remove_experts_memory/pruning/pruning_defaults.yaml index cec781465..48c87e6a3 100644 --- a/examples/puzzletron/configs/gptoss-20b_remove_experts_memory/pruning/pruning_defaults.yaml +++ b/examples/puzzletron/configs/gptoss-20b_remove_experts_memory/pruning/pruning_defaults.yaml @@ -15,7 +15,7 @@ dataset_path: ${dataset_path} val_dataset_name: train # Prune ckpts -pruned_ckpts_outpt_dir: ${puzzle_dir}/pruning/${pruning.experiment_id} +pruned_ckpts_output_dir: ${puzzle_dir}/pruning/${pruning.experiment_id} ## FFN pruning ffn_list: diff --git a/modelopt/torch/nas/plugins/megatron_hooks/base_hooks.py b/modelopt/torch/nas/plugins/megatron_hooks/base_hooks.py index 0ed000349..7cd721444 100644 --- a/modelopt/torch/nas/plugins/megatron_hooks/base_hooks.py +++ b/modelopt/torch/nas/plugins/megatron_hooks/base_hooks.py @@ -1042,7 +1042,7 @@ def __call__( args: Tuple with one tensor entry (B, T, I) output: Router logits of shape (B, T, E) """ - router_logits = output + router_logits = output[0] if isinstance(output, tuple) else output num_experts = router_logits.shape[-1] router_argsort = torch.argsort(router_logits, dim=-1, descending=True) router_argsort = router_argsort.view(-1, num_experts).to(torch.int16).cpu() diff --git a/modelopt/torch/puzzletron/anymodel/converter/converter.py b/modelopt/torch/puzzletron/anymodel/converter/converter.py index e241e72b6..6d78de246 100644 --- a/modelopt/torch/puzzletron/anymodel/converter/converter.py +++ b/modelopt/torch/puzzletron/anymodel/converter/converter.py @@ -208,3 +208,26 @@ def create_block_configs_from_main_config(config: PretrainedConfig) -> List[Bloc return [BlockConfig(...) for layer_idx in range(num_layers)] """ raise NotImplementedError + + @staticmethod + def convert_weight_name(name: str) -> str: + """ + Convert weight names during checkpoint conversion. + + This method can be overridden by subclasses to apply model-specific weight name + transformations when converting checkpoints from HuggingFace format to Puzzletron format. + + Default implementation returns the name unchanged (identity function). + + Args: + name: Original weight name from HuggingFace checkpoint + + Returns: + Converted weight name for Puzzletron format + + Example: + For Qwen2.5-VL, this converts: + - visual.* → model.visual.* + - model.* → model.language_model.* + """ + return name \ No newline at end of file diff --git a/modelopt/torch/puzzletron/anymodel/models/gpt_oss_20b/gpt_oss_20b_model_descriptor.py b/modelopt/torch/puzzletron/anymodel/models/gpt_oss_20b/gpt_oss_20b_model_descriptor.py index 644da802c..e51a7d9e8 100644 --- a/modelopt/torch/puzzletron/anymodel/models/gpt_oss_20b/gpt_oss_20b_model_descriptor.py +++ b/modelopt/torch/puzzletron/anymodel/models/gpt_oss_20b/gpt_oss_20b_model_descriptor.py @@ -27,6 +27,8 @@ ModelDescriptor, ModelDescriptorFactory, ) +from modelopt.torch.puzzletron.utils.dummy_modules import DummyBlock + from modelopt.torch.puzzletron.anymodel.puzzformer.no_op import ( MatchingZeros, Same,