From 4952259c7d7123824870a1752b725f332b1daedf Mon Sep 17 00:00:00 2001 From: khazic Date: Tue, 26 May 2026 09:56:46 +0800 Subject: [PATCH 1/2] feat(speculative): add Qwen3 dense target support to EAGLE-1/2/3 Register ``Qwen3ForCausalLM`` in the EAGLE dense draft dispatch table. Qwen3 already works through the existing config-driven draft path: ``head_dim`` is read via ``getattr(config, "head_dim", ...)`` (Qwen3 decouples it from ``hidden_size / num_attention_heads``), and ``attention_bias`` / ``mlp_bias`` are read via ``getattr(..., False)`` so Qwen3's config exposes them correctly. No code-path changes required; just an allowlist entry plus example configs and docstrings. - registry.py: append "Qwen3ForCausalLM" to ``_DENSE_ARCHITECTURES``. - Add example YAMLs: ``qwen3_eagle{1,2,3}_perfectblend.yaml``. - Update docstrings (draft modules + recipes) to mention Qwen3. End-to-end smoke-tested on 8x H100 with Qwen/Qwen3-8B target on a PerfectBlend 200-sample slice (EAGLE-3, 25 steps): loss decreases 9.85 -> 6.18 (~37% drop), train_acc ticks up from 0 to ~0.09. No construction-time / load-time errors. 
Signed-off-by: khazic --- .../eagle1/qwen3_eagle1_perfectblend.yaml | 37 +++++++++++++++++++ .../eagle2/qwen3_eagle2_perfectblend.yaml | 37 +++++++++++++++++++ .../eagle3/qwen3_eagle3_perfectblend.yaml | 36 ++++++++++++++++++ .../speculative/eagle/draft_llama.py | 12 +++--- .../speculative/eagle/draft_llama_v12.py | 4 +- .../components/speculative/eagle/registry.py | 1 + nemo_automodel/recipes/llm/train_eagle1.py | 4 +- nemo_automodel/recipes/llm/train_eagle3.py | 4 +- 8 files changed, 124 insertions(+), 11 deletions(-) create mode 100644 examples/speculative/eagle1/qwen3_eagle1_perfectblend.yaml create mode 100644 examples/speculative/eagle2/qwen3_eagle2_perfectblend.yaml create mode 100644 examples/speculative/eagle3/qwen3_eagle3_perfectblend.yaml diff --git a/examples/speculative/eagle1/qwen3_eagle1_perfectblend.yaml b/examples/speculative/eagle1/qwen3_eagle1_perfectblend.yaml new file mode 100644 index 0000000000..61ed7f4068 --- /dev/null +++ b/examples/speculative/eagle1/qwen3_eagle1_perfectblend.yaml @@ -0,0 +1,37 @@ +recipe: TrainEagle1Recipe + +dist_env: + backend: nccl + timeout_minutes: 30 + +recipe_args: + target_model_name_or_path: Qwen/Qwen3-8B + train_data_path: /path/to/train.jsonl + val_data_path: null + train_split: null + val_split: null + output_dir: ./outputs/eagle1_qwen3_mvp + seq_length: 1024 + micro_batch_size: 1 + grad_accumulation_steps: 1 + num_workers: 0 + num_epochs: 1 + draft_num_hidden_layers: 1 + hidden_loss_weight: 1.0 + token_loss_weight: 0.1 + freeze_embeddings: true + trust_remote_code: false + shuffle_seed: 42 + log_every_steps: 10 + max_grad_norm: 1.0 + +optimizer: + lr: 1.0e-4 + betas: [0.9, 0.95] + weight_decay: 0.0 + +checkpoint: + enabled: true + checkpoint_dir: ./outputs/eagle1_qwen3_mvp/checkpoints + model_save_format: safetensors + save_consolidated: true diff --git a/examples/speculative/eagle2/qwen3_eagle2_perfectblend.yaml b/examples/speculative/eagle2/qwen3_eagle2_perfectblend.yaml new file mode 100644 index 
0000000000..33e771cca5 --- /dev/null +++ b/examples/speculative/eagle2/qwen3_eagle2_perfectblend.yaml @@ -0,0 +1,37 @@ +recipe: TrainEagle2Recipe + +dist_env: + backend: nccl + timeout_minutes: 30 + +recipe_args: + target_model_name_or_path: Qwen/Qwen3-8B + train_data_path: /path/to/train.jsonl + val_data_path: null + train_split: null + val_split: null + output_dir: ./outputs/eagle2_qwen3_mvp + seq_length: 1024 + micro_batch_size: 1 + grad_accumulation_steps: 1 + num_workers: 0 + num_epochs: 1 + draft_num_hidden_layers: 1 + hidden_loss_weight: 1.0 + token_loss_weight: 0.1 + freeze_embeddings: true + trust_remote_code: false + shuffle_seed: 42 + log_every_steps: 10 + max_grad_norm: 1.0 + +optimizer: + lr: 1.0e-4 + betas: [0.9, 0.95] + weight_decay: 0.0 + +checkpoint: + enabled: true + checkpoint_dir: ./outputs/eagle2_qwen3_mvp/checkpoints + model_save_format: safetensors + save_consolidated: true diff --git a/examples/speculative/eagle3/qwen3_eagle3_perfectblend.yaml b/examples/speculative/eagle3/qwen3_eagle3_perfectblend.yaml new file mode 100644 index 0000000000..5913192df3 --- /dev/null +++ b/examples/speculative/eagle3/qwen3_eagle3_perfectblend.yaml @@ -0,0 +1,36 @@ +recipe: TrainEagle3Recipe + +dist_env: + backend: nccl + timeout_minutes: 30 + +recipe_args: + target_model_name_or_path: Qwen/Qwen3-8B + train_data_path: /path/to/train.jsonl + val_data_path: null + train_split: null + val_split: null + output_dir: ./outputs/eagle3_qwen3_mvp + seq_length: 1024 + micro_batch_size: 1 + grad_accumulation_steps: 1 + num_workers: 0 + num_epochs: 1 + ttt_steps: 4 + draft_vocab_size: 8192 + freeze_embeddings: true + trust_remote_code: false + shuffle_seed: 42 + log_every_steps: 10 + max_grad_norm: 1.0 + +optimizer: + lr: 1.0e-4 + betas: [0.9, 0.95] + weight_decay: 0.0 + +checkpoint: + enabled: true + checkpoint_dir: ./outputs/eagle3_qwen3_mvp/checkpoints + model_save_format: safetensors + save_consolidated: true diff --git 
a/nemo_automodel/components/speculative/eagle/draft_llama.py b/nemo_automodel/components/speculative/eagle/draft_llama.py index 7a6834805d..119ff8c04e 100644 --- a/nemo_automodel/components/speculative/eagle/draft_llama.py +++ b/nemo_automodel/components/speculative/eagle/draft_llama.py @@ -18,10 +18,12 @@ decoder-only architecture whose layout matches Llama: GQA attention with optional Q/K/V/O bias (`config.attention_bias`), SwiGLU MLP with optional bias (`config.mlp_bias`), RMSNorm, and rotary position embeddings parameterized -by `config.rope_theta` / `config.rope_scaling`. This currently covers Llama -and Phi-3 dense (Phi-3 omits `attention_bias` / `mlp_bias`, which the -attention and MLP layers already read via -`getattr(config, "<name>", False)`). +by `config.rope_theta` / `config.rope_scaling`. This currently covers Llama, +Phi-3, and Qwen3 dense (Phi-3 omits `attention_bias` / `mlp_bias`, which +the attention and MLP layers already read via +`getattr(config, "<name>", False)`; Qwen3 decouples `head_dim` from +`hidden_size / num_attention_heads`, which the attention layer reads via +`getattr(config, "head_dim", ...)`). Class names and the public `architectures` string remain ``LlamaEagle3*`` for backward compatibility with already-trained checkpoints and with SGLang's @@ -442,7 +444,7 @@ def __init__(self, config: PretrainedConfig): class LlamaEagle3DraftModel(PreTrainedModel): - """Llama-style dense EAGLE-3 draft model (Llama, Phi-3). + """Llama-style dense EAGLE-3 draft model (Llama, Phi-3, Qwen3). 
State dict keys match SGLang's ``LlamaForCausalLMEagle3`` so the saved checkpoint can be loaded by SGLang's inference engine without any diff --git a/nemo_automodel/components/speculative/eagle/draft_llama_v12.py b/nemo_automodel/components/speculative/eagle/draft_llama_v12.py index 306bd3288d..ec303e2405 100644 --- a/nemo_automodel/components/speculative/eagle/draft_llama_v12.py +++ b/nemo_automodel/components/speculative/eagle/draft_llama_v12.py @@ -14,7 +14,7 @@ """Llama-style dense LLM draft model for EAGLE-1 / EAGLE-2 training. -Config-driven; supports Llama and Phi-3 dense via standard HF config +Config-driven; supports Llama, Phi-3, and Qwen3 dense via standard HF config fields (``attention_bias``, ``mlp_bias``, ``rope_theta``/``rope_scaling``, ``rms_norm_eps``). Class names are retained for checkpoint-architectures compatibility. @@ -158,7 +158,7 @@ def forward( class LlamaEagleDraftModel(PreTrainedModel): """Llama-style dense draft that predicts next-step hidden states. - Works with Llama and Phi-3 dense configs. The class name is + Works with Llama, Phi-3, and Qwen3 dense configs. The class name is retained for backward compatibility with already-trained checkpoints. """ diff --git a/nemo_automodel/components/speculative/eagle/registry.py b/nemo_automodel/components/speculative/eagle/registry.py index af618e5106..3b3e0289c9 100644 --- a/nemo_automodel/components/speculative/eagle/registry.py +++ b/nemo_automodel/components/speculative/eagle/registry.py @@ -55,6 +55,7 @@ class DraftSpec: _DENSE_ARCHITECTURES: tuple[str, ...] = ( "LlamaForCausalLM", "Phi3ForCausalLM", + "Qwen3ForCausalLM", ) diff --git a/nemo_automodel/recipes/llm/train_eagle1.py b/nemo_automodel/recipes/llm/train_eagle1.py index b61ddf6564..1a65813e59 100644 --- a/nemo_automodel/recipes/llm/train_eagle1.py +++ b/nemo_automodel/recipes/llm/train_eagle1.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-"""EAGLE-1 / EAGLE-2 training recipe for Llama-style dense LLMs (Llama, Phi-3).""" +"""EAGLE-1 / EAGLE-2 training recipe for Llama-style dense LLMs (Llama, Phi-3, Qwen3).""" from __future__ import annotations @@ -62,7 +62,7 @@ def _all_reduce_mean(value: torch.Tensor) -> torch.Tensor: class TrainEagle1Recipe(BaseRecipe): - """Recipe for EAGLE-1 training on Llama-style dense LLMs (Llama, Phi-3).""" + """Recipe for EAGLE-1 training on Llama-style dense LLMs (Llama, Phi-3, Qwen3).""" def __init__(self, cfg): self.cfg = cfg diff --git a/nemo_automodel/recipes/llm/train_eagle3.py b/nemo_automodel/recipes/llm/train_eagle3.py index f405ce3823..a9a611623f 100644 --- a/nemo_automodel/recipes/llm/train_eagle3.py +++ b/nemo_automodel/recipes/llm/train_eagle3.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""EAGLE-3 training recipe for Llama-style dense LLMs (Llama, Phi-3).""" +"""EAGLE-3 training recipe for Llama-style dense LLMs (Llama, Phi-3, Qwen3).""" from __future__ import annotations @@ -81,7 +81,7 @@ def _all_reduce_mean(value: torch.Tensor) -> torch.Tensor: class TrainEagle3Recipe(BaseRecipe): - """Recipe for EAGLE-3 training on Llama-style dense LLMs (Llama, Phi-3).""" + """Recipe for EAGLE-3 training on Llama-style dense LLMs (Llama, Phi-3, Qwen3).""" def __init__(self, cfg): self.cfg = cfg From ef5eb6cae91898ff73f9a145e8beeb390447b683 Mon Sep 17 00:00:00 2001 From: khazic Date: Tue, 26 May 2026 09:57:26 +0800 Subject: [PATCH 2/2] fix(speculative): read activation from config instead of hardcoded SiLU EAGLE-3 draft reads ACT2FN[config.hidden_act] from the target config, but EAGLE-1/2 draft hardcoded nn.SiLU(). All currently registered dense architectures (Llama / Phi-3 / Qwen3) happen to use silu, so the hardcode is correct today. However, the dense registry is intended to grow to cover non-SiLU families next (e.g. Gemma uses gelu_pytorch_tanh). 
With the hardcode in place, registering such an architecture would silently mismatch the target's activation: no crash, no error, training still converges, but draft hidden states drift from target and speculative acceptance rate quietly drops with no observable symptom. Read hidden_act from config so the draft matches the target by construction and adding new architectures stays a one-line registry change. Signed-off-by: khazic --- .../components/speculative/eagle/draft_llama_v12.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nemo_automodel/components/speculative/eagle/draft_llama_v12.py b/nemo_automodel/components/speculative/eagle/draft_llama_v12.py index ec303e2405..00c56b356a 100644 --- a/nemo_automodel/components/speculative/eagle/draft_llama_v12.py +++ b/nemo_automodel/components/speculative/eagle/draft_llama_v12.py @@ -122,7 +122,9 @@ def __init__(self, config: PretrainedConfig): self.down_proj = nn.Linear( config.intermediate_size, config.hidden_size, bias=getattr(config, "mlp_bias", False) ) - self.act_fn = nn.SiLU() + from transformers.activations import ACT2FN + + self.act_fn = ACT2FN[config.hidden_act] def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return self.down_proj(self.act_fn(self.gate_proj(hidden_states)) * self.up_proj(hidden_states))