diff --git a/tools/launcher/examples/Qwen/moonshotai/Kimi-K2.5-DFlash/step3_train.yaml b/tools/launcher/examples/Qwen/moonshotai/Kimi-K2.5-DFlash/step3_train.yaml new file mode 100644 index 00000000000..1cca1f5be58 --- /dev/null +++ b/tools/launcher/examples/Qwen/moonshotai/Kimi-K2.5-DFlash/step3_train.yaml @@ -0,0 +1,33 @@ +# Step3 offline EAGLE3 draft-head training for moonshotai/Kimi-K2.5-DFlash. +# +# Standalone task extracted from the 4-task hf_offline_eagle3 pipeline. +# Reads hidden states produced by step2 from /scratchspace/offline_hidden_states. +# +# Usage: +# uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/Qwen/moonshotai/Kimi-K2.5-DFlash/step3_train.yaml --yes + +job_name: moonshotai/Kimi-K2.5-DFlash_EAGLE3_train +pipeline: + allow_to_fail: false + skip: false + note: + + global_vars: + hf_model: /hf-local/Qwen/moonshotai/Kimi-K2.5-DFlash + + task_0: + script: common/eagle3/train_eagle.sh + args: + - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/eagle3.yaml + - model.model_name_or_path=<> + - data.offline_data_path=/scratchspace/offline_hidden_states + - training.output_dir=/scratchspace/eagle3 + - training.training_seq_len=4096 + - training.disable_tqdm=true + - training.ar_validate_steps=500000 + slurm_config: + _factory_: "slurm_factory" + nodes: 1 + ntasks_per_node: 1 + gpus_per_node: 8 + container: nvcr.io/nvidia/tensorrt-llm/release:1.2.0