From 4952259c7d7123824870a1752b725f332b1daedf Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Tue, 26 May 2026 09:56:46 +0800
Subject: [PATCH 1/2] feat(speculative): add Qwen3 dense target support to
 EAGLE-1/2/3

Register ``Qwen3ForCausalLM`` in the EAGLE dense draft dispatch table.
Qwen3 already works through the existing config-driven draft path:
``head_dim`` is read via ``getattr(config, "head_dim", ...)`` (Qwen3
decouples it from ``hidden_size / num_attention_heads``), and
``attention_bias`` / ``mlp_bias`` are read via ``getattr(..., False)``,
so the draft follows Qwen3's config where a field is set and falls back
to ``False`` where it is absent. No code-path changes are required;
just an allowlist entry plus example configs and docstrings.

  - registry.py: append "Qwen3ForCausalLM" to ``_DENSE_ARCHITECTURES``.
  - Add example YAMLs: ``qwen3_eagle{1,2,3}_perfectblend.yaml``.
  - Update docstrings (draft modules + recipes) to mention Qwen3.

End-to-end smoke-tested on 8x H100 with Qwen/Qwen3-8B target on a
PerfectBlend 200-sample slice (EAGLE-3, 25 steps): loss decreases
9.85 -> 6.18 (~37% drop), train_acc ticks up from 0 to ~0.09. No
construction-time / load-time errors.

Signed-off-by: khazic <khazzz1c@gmail.com>
---
 .../eagle1/qwen3_eagle1_perfectblend.yaml     | 37 +++++++++++++++++++
 .../eagle2/qwen3_eagle2_perfectblend.yaml     | 37 +++++++++++++++++++
 .../eagle3/qwen3_eagle3_perfectblend.yaml     | 36 ++++++++++++++++++
 .../speculative/eagle/draft_llama.py          | 12 +++---
 .../speculative/eagle/draft_llama_v12.py      |  4 +-
 .../components/speculative/eagle/registry.py  |  1 +
 nemo_automodel/recipes/llm/train_eagle1.py    |  4 +-
 nemo_automodel/recipes/llm/train_eagle3.py    |  4 +-
 8 files changed, 124 insertions(+), 11 deletions(-)
 create mode 100644 examples/speculative/eagle1/qwen3_eagle1_perfectblend.yaml
 create mode 100644 examples/speculative/eagle2/qwen3_eagle2_perfectblend.yaml
 create mode 100644 examples/speculative/eagle3/qwen3_eagle3_perfectblend.yaml

diff --git a/examples/speculative/eagle1/qwen3_eagle1_perfectblend.yaml b/examples/speculative/eagle1/qwen3_eagle1_perfectblend.yaml
new file mode 100644
index 0000000000..61ed7f4068
--- /dev/null
+++ b/examples/speculative/eagle1/qwen3_eagle1_perfectblend.yaml
@@ -0,0 +1,37 @@
+recipe: TrainEagle1Recipe  # EAGLE-1 draft-training example for a Qwen3 dense target
+
+dist_env:
+  backend: nccl
+  timeout_minutes: 30
+
+recipe_args:
+  target_model_name_or_path: Qwen/Qwen3-8B  # Qwen3 dense target model
+  train_data_path: /path/to/train.jsonl  # placeholder; replace with your training JSONL
+  val_data_path: null
+  train_split: null
+  val_split: null
+  output_dir: ./outputs/eagle1_qwen3_mvp
+  seq_length: 1024
+  micro_batch_size: 1
+  grad_accumulation_steps: 1
+  num_workers: 0
+  num_epochs: 1
+  draft_num_hidden_layers: 1
+  hidden_loss_weight: 1.0
+  token_loss_weight: 0.1
+  freeze_embeddings: true
+  trust_remote_code: false
+  shuffle_seed: 42
+  log_every_steps: 10
+  max_grad_norm: 1.0
+
+optimizer:
+  lr: 1.0e-4
+  betas: [0.9, 0.95]
+  weight_decay: 0.0
+
+checkpoint:
+  enabled: true
+  checkpoint_dir: ./outputs/eagle1_qwen3_mvp/checkpoints  # nested under recipe_args.output_dir above
+  model_save_format: safetensors
+  save_consolidated: true
diff --git a/examples/speculative/eagle2/qwen3_eagle2_perfectblend.yaml b/examples/speculative/eagle2/qwen3_eagle2_perfectblend.yaml
new file mode 100644
index 0000000000..33e771cca5
--- /dev/null
+++ b/examples/speculative/eagle2/qwen3_eagle2_perfectblend.yaml
@@ -0,0 +1,37 @@
+recipe: TrainEagle2Recipe  # EAGLE-2 draft-training example for a Qwen3 dense target
+
+dist_env:
+  backend: nccl
+  timeout_minutes: 30
+
+recipe_args:
+  target_model_name_or_path: Qwen/Qwen3-8B  # Qwen3 dense target model
+  train_data_path: /path/to/train.jsonl  # placeholder; replace with your training JSONL
+  val_data_path: null
+  train_split: null
+  val_split: null
+  output_dir: ./outputs/eagle2_qwen3_mvp
+  seq_length: 1024
+  micro_batch_size: 1
+  grad_accumulation_steps: 1
+  num_workers: 0
+  num_epochs: 1
+  draft_num_hidden_layers: 1
+  hidden_loss_weight: 1.0
+  token_loss_weight: 0.1
+  freeze_embeddings: true
+  trust_remote_code: false
+  shuffle_seed: 42
+  log_every_steps: 10
+  max_grad_norm: 1.0
+
+optimizer:
+  lr: 1.0e-4
+  betas: [0.9, 0.95]
+  weight_decay: 0.0
+
+checkpoint:
+  enabled: true
+  checkpoint_dir: ./outputs/eagle2_qwen3_mvp/checkpoints  # nested under recipe_args.output_dir above
+  model_save_format: safetensors
+  save_consolidated: true
diff --git a/examples/speculative/eagle3/qwen3_eagle3_perfectblend.yaml b/examples/speculative/eagle3/qwen3_eagle3_perfectblend.yaml
new file mode 100644
index 0000000000..5913192df3
--- /dev/null
+++ b/examples/speculative/eagle3/qwen3_eagle3_perfectblend.yaml
@@ -0,0 +1,36 @@
+recipe: TrainEagle3Recipe  # EAGLE-3 draft-training example for a Qwen3 dense target
+
+dist_env:
+  backend: nccl
+  timeout_minutes: 30
+
+recipe_args:
+  target_model_name_or_path: Qwen/Qwen3-8B  # Qwen3 dense target model
+  train_data_path: /path/to/train.jsonl  # placeholder; replace with your training JSONL
+  val_data_path: null
+  train_split: null
+  val_split: null
+  output_dir: ./outputs/eagle3_qwen3_mvp
+  seq_length: 1024
+  micro_batch_size: 1
+  grad_accumulation_steps: 1
+  num_workers: 0
+  num_epochs: 1
+  ttt_steps: 4  # NOTE(review): presumably the EAGLE-3 training-time-test unroll depth; confirm
+  draft_vocab_size: 8192
+  freeze_embeddings: true
+  trust_remote_code: false
+  shuffle_seed: 42
+  log_every_steps: 10
+  max_grad_norm: 1.0
+
+optimizer:
+  lr: 1.0e-4
+  betas: [0.9, 0.95]
+  weight_decay: 0.0
+
+checkpoint:
+  enabled: true
+  checkpoint_dir: ./outputs/eagle3_qwen3_mvp/checkpoints  # nested under recipe_args.output_dir above
+  model_save_format: safetensors
+  save_consolidated: true
diff --git a/nemo_automodel/components/speculative/eagle/draft_llama.py b/nemo_automodel/components/speculative/eagle/draft_llama.py
index 7a6834805d..119ff8c04e 100644
--- a/nemo_automodel/components/speculative/eagle/draft_llama.py
+++ b/nemo_automodel/components/speculative/eagle/draft_llama.py
@@ -18,10 +18,12 @@
 decoder-only architecture whose layout matches Llama: GQA attention with
 optional Q/K/V/O bias (`config.attention_bias`), SwiGLU MLP with optional
 bias (`config.mlp_bias`), RMSNorm, and rotary position embeddings parameterized
-by `config.rope_theta` / `config.rope_scaling`. This currently covers Llama
-and Phi-3 dense (Phi-3 omits `attention_bias` / `mlp_bias`, which the
-attention and MLP layers already read via
-`getattr(config, "<field>", False)`).
+by `config.rope_theta` / `config.rope_scaling`. This currently covers Llama,
+Phi-3, and Qwen3 dense (Phi-3 omits `attention_bias` / `mlp_bias`, which
+the attention and MLP layers already read via
+`getattr(config, "<field>", False)`; Qwen3 decouples `head_dim` from
+`hidden_size / num_attention_heads`, which the attention layer reads via
+`getattr(config, "head_dim", ...)`).
 
 Class names and the public `architectures` string remain ``LlamaEagle3*`` for
 backward compatibility with already-trained checkpoints and with SGLang's
@@ -442,7 +444,7 @@ def __init__(self, config: PretrainedConfig):
 
 
 class LlamaEagle3DraftModel(PreTrainedModel):
-    """Llama-style dense EAGLE-3 draft model (Llama, Phi-3).
+    """Llama-style dense EAGLE-3 draft model (Llama, Phi-3, Qwen3).
 
     State dict keys match SGLang's ``LlamaForCausalLMEagle3`` so the saved
     checkpoint can be loaded by SGLang's inference engine without any
diff --git a/nemo_automodel/components/speculative/eagle/draft_llama_v12.py b/nemo_automodel/components/speculative/eagle/draft_llama_v12.py
index 306bd3288d..ec303e2405 100644
--- a/nemo_automodel/components/speculative/eagle/draft_llama_v12.py
+++ b/nemo_automodel/components/speculative/eagle/draft_llama_v12.py
@@ -14,7 +14,7 @@
 
 """Llama-style dense LLM draft model for EAGLE-1 / EAGLE-2 training.
 
-Config-driven; supports Llama and Phi-3 dense via standard HF config
+Config-driven; supports Llama, Phi-3, and Qwen3 dense via standard HF config
 fields (``attention_bias``, ``mlp_bias``, ``rope_theta``/``rope_scaling``,
 ``rms_norm_eps``). Class names are retained for checkpoint-architectures
 compatibility.
@@ -158,7 +158,7 @@ def forward(
 class LlamaEagleDraftModel(PreTrainedModel):
     """Llama-style dense draft that predicts next-step hidden states.
 
-    Works with Llama and Phi-3 dense configs. The class name is
+    Works with Llama, Phi-3, and Qwen3 dense configs. The class name is
     retained for backward compatibility with already-trained checkpoints.
     """
 
diff --git a/nemo_automodel/components/speculative/eagle/registry.py b/nemo_automodel/components/speculative/eagle/registry.py
index af618e5106..3b3e0289c9 100644
--- a/nemo_automodel/components/speculative/eagle/registry.py
+++ b/nemo_automodel/components/speculative/eagle/registry.py
@@ -55,6 +55,7 @@ class DraftSpec:
 _DENSE_ARCHITECTURES: tuple[str, ...] = (
     "LlamaForCausalLM",
     "Phi3ForCausalLM",
+    "Qwen3ForCausalLM",
 )
 
 
diff --git a/nemo_automodel/recipes/llm/train_eagle1.py b/nemo_automodel/recipes/llm/train_eagle1.py
index b61ddf6564..1a65813e59 100644
--- a/nemo_automodel/recipes/llm/train_eagle1.py
+++ b/nemo_automodel/recipes/llm/train_eagle1.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""EAGLE-1 / EAGLE-2 training recipe for Llama-style dense LLMs (Llama, Phi-3)."""
+"""EAGLE-1 / EAGLE-2 training recipe for Llama-style dense LLMs (Llama, Phi-3, Qwen3)."""
 
 from __future__ import annotations
 
@@ -62,7 +62,7 @@ def _all_reduce_mean(value: torch.Tensor) -> torch.Tensor:
 
 
 class TrainEagle1Recipe(BaseRecipe):
-    """Recipe for EAGLE-1 training on Llama-style dense LLMs (Llama, Phi-3)."""
+    """Recipe for EAGLE-1 training on Llama-style dense LLMs (Llama, Phi-3, Qwen3)."""
 
     def __init__(self, cfg):
         self.cfg = cfg
diff --git a/nemo_automodel/recipes/llm/train_eagle3.py b/nemo_automodel/recipes/llm/train_eagle3.py
index f405ce3823..a9a611623f 100644
--- a/nemo_automodel/recipes/llm/train_eagle3.py
+++ b/nemo_automodel/recipes/llm/train_eagle3.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""EAGLE-3 training recipe for Llama-style dense LLMs (Llama, Phi-3)."""
+"""EAGLE-3 training recipe for Llama-style dense LLMs (Llama, Phi-3, Qwen3)."""
 
 from __future__ import annotations
 
@@ -81,7 +81,7 @@ def _all_reduce_mean(value: torch.Tensor) -> torch.Tensor:
 
 
 class TrainEagle3Recipe(BaseRecipe):
-    """Recipe for EAGLE-3 training on Llama-style dense LLMs (Llama, Phi-3)."""
+    """Recipe for EAGLE-3 training on Llama-style dense LLMs (Llama, Phi-3, Qwen3)."""
 
     def __init__(self, cfg):
         self.cfg = cfg

From ef5eb6cae91898ff73f9a145e8beeb390447b683 Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Tue, 26 May 2026 09:57:26 +0800
Subject: [PATCH 2/2] fix(speculative): read activation from config instead of
 hardcoded SiLU

The EAGLE-3 draft reads ACT2FN[config.hidden_act] from the target config,
but the EAGLE-1/2 draft hardcodes nn.SiLU(). All currently registered
dense architectures (Llama / Phi-3 / Qwen3) happen to use silu, so the
hardcode is correct today.

However, the dense registry is intended to grow to cover non-SiLU
families next (e.g. Gemma uses gelu_pytorch_tanh). With the hardcode
in place, registering such an architecture would silently mismatch the
target's activation: no crash, no error, and training still converges,
but draft hidden states drift from the target and the speculative
acceptance rate drops with nothing pointing at the cause.

Read hidden_act from config so the draft matches the target by
construction and adding new architectures stays a one-line registry
change.

Signed-off-by: khazic <khazzz1c@gmail.com>
---
 .../components/speculative/eagle/draft_llama_v12.py           | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/nemo_automodel/components/speculative/eagle/draft_llama_v12.py b/nemo_automodel/components/speculative/eagle/draft_llama_v12.py
index ec303e2405..00c56b356a 100644
--- a/nemo_automodel/components/speculative/eagle/draft_llama_v12.py
+++ b/nemo_automodel/components/speculative/eagle/draft_llama_v12.py
@@ -122,7 +122,9 @@ def __init__(self, config: PretrainedConfig):
         self.down_proj = nn.Linear(
             config.intermediate_size, config.hidden_size, bias=getattr(config, "mlp_bias", False)
         )
-        self.act_fn = nn.SiLU()
+        from transformers.activations import ACT2FN
+
+        self.act_fn = ACT2FN[config.hidden_act]
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         return self.down_proj(self.act_fn(self.gate_proj(hidden_states)) * self.up_proj(hidden_states))