Commit eb99488

Fix: restore requires_grad in transformers5 reloading (#907)
## What does this PR do?

**Type of change:** Bug fix

**Overview:** Patch transformers 5.x parameter loading to preserve the original `requires_grad` settings. In transformers v5.x, loading a checkpoint forcibly sets parameters' `requires_grad`, which unintentionally unfreezes frozen parameters (e.g. the base model in Eagle training). This leads to an optimizer initialization error on resume, since the restored optimizer expects more parameters than the optimizer checkpoint covers. This monkey-patch restores the original `requires_grad` after loading parameters.

Reference: https://github.com/huggingface/transformers/blob/v5.0.0.rc1-release/src/transformers/core_model_loading.py#L640

## Usage

Wrap checkpoint loading in the new `patch_transformers5_params_loading()` context manager, as shown in the `main.py` diff below; a usage sketch follows the `utils.py` diff.

## Testing

Covered by the new `test_resume_training` case in `tests/examples/speculative_decoding/test_eagle.py` (see the diff below), which resumes Eagle3 training from a saved checkpoint.

## Before your PR is "*Ready for review*"

- **Make sure you read and follow [Contributor guidelines](https://github.com/NVIDIA/Model-Optimizer/blob/main/CONTRIBUTING.md)** and your commits are signed.
- **Is this change backward compatible?**: Yes
- **Did you write any new necessary tests?**: Yes
- **Did you add or update any necessary documentation?**: No
- **Did you update [Changelog](https://github.com/NVIDIA/Model-Optimizer/blob/main/CHANGELOG.rst)?**: No

## Summary by CodeRabbit

* **Bug Fixes**
  * Fixed model parameter loading in speculative decoding to preserve each parameter's gradient requirement when using HuggingFace Transformers 5.x, ensuring correct behavior during checkpoint resumption and model initialization.

---

Signed-off-by: h-guo18 <67671475+h-guo18@users.noreply.github.com>
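To make the failure mode described above concrete, here is a toy sketch of the optimizer-state mismatch (a hypothetical two-module setup, not the library's actual loading code):

```python
import torch

# Hypothetical setup: the base model is frozen, only the draft head trains.
base, head = torch.nn.Linear(8, 8), torch.nn.Linear(8, 8)
for p in base.parameters():
    p.requires_grad = False

def trainable(*modules):
    return [p for m in modules for p in m.parameters() if p.requires_grad]

# The optimizer whose state was checkpointed covered only the head.
opt = torch.optim.AdamW(trainable(base, head), lr=1e-4)
saved_state = opt.state_dict()  # stands in for the on-disk optimizer state

# If checkpoint loading force-sets requires_grad=True on every parameter,
# the optimizer rebuilt on resume covers base + head ...
for p in base.parameters():
    p.requires_grad = True
resumed_opt = torch.optim.AdamW(trainable(base, head), lr=1e-4)

# ... and restoring the saved state fails: the parameter groups no longer match.
try:
    resumed_opt.load_state_dict(saved_state)
except ValueError as err:
    print(err)
```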
1 parent: 3dd52bf

4 files changed: 72 additions & 6 deletions

examples/speculative_decoding/launch_train.sh

Lines changed: 1 addition & 1 deletion
```diff
@@ -134,7 +134,7 @@ OUTPUT_DIR=${OUTPUT_DIR:-"ckpts/${MODEL_BASENAME}-$(date +%Y%m%d_%H%M)"}
 NUM_EPOCHS=${NUM_EPOCHS:-1}
 SAVE_STEPS=${SAVE_STEPS:-$DEFAULT_SAVE_STEPS}
 LR=${LR:-"1e-4"}
-TRAIN_BS=${TRAIN_BS:-4}
+TRAIN_BS=${TRAIN_BS:-1}
 MEDUSA_NUM_HEADS=${MEDUSA_NUM_HEADS:-1}
 MEDUSA_NUM_LAYERS=${MEDUSA_NUM_LAYERS:-1}
 TRAINING_SEQ_LEN=${TRAINING_SEQ_LEN:-2048}
```

examples/speculative_decoding/main.py

Lines changed: 8 additions & 4 deletions
```diff
@@ -48,7 +48,10 @@

 import modelopt.torch.opt as mto
 import modelopt.torch.speculative as mtsp
-from modelopt.torch.speculative.utils import load_vlm_or_llm_with_kwargs
+from modelopt.torch.speculative.utils import (
+    load_vlm_or_llm_with_kwargs,
+    patch_transformers5_params_loading,
+)
 from modelopt.torch.utils import print_rank_0

 torch.manual_seed(0)
@@ -162,9 +165,10 @@ def train():
     use_offline_training = data_args.offline_data_path is not None

     if checkpoint:
-        _, model = load_vlm_or_llm_with_kwargs(
-            checkpoint, torch_dtype="auto", trust_remote_code=True
-        )
+        with patch_transformers5_params_loading():
+            _, model = load_vlm_or_llm_with_kwargs(
+                checkpoint, torch_dtype="auto", trust_remote_code=True
+            )
         tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
     else:
         # To avoid OOM for large models, we load and convert model on CPU first.
```
modelopt/torch/speculative/utils.py

Lines changed: 45 additions & 0 deletions
```diff
@@ -485,3 +485,48 @@ def load_vlm_or_llm_with_kwargs(model_name_or_path: str, **kwargs):
         model_cls = transformers.AutoModelForCausalLM

     return model_config, model_cls.from_pretrained(model_name_or_path, **kwargs)
+
+
+@contextlib.contextmanager
+def patch_transformers5_params_loading():
+    """Patch transformers 5.x parameter loading to preserve original `requires_grad` settings.
+
+    In transformers v5.x, loading a checkpoint forcibly sets parameters' `requires_grad`,
+    which may unintentionally unfreeze frozen parameters. This monkey-patch restores the
+    original `requires_grad` after loading parameters.
+
+    Reference:
+    https://github.com/huggingface/transformers/blob/v5.0.0.rc1-release/src/transformers/core_model_loading.py#L640
+    """
+    # Skip patching for non-applicable transformers versions; still yield once
+    # so the generator works as a context manager.
+    if importlib.util.find_spec("transformers.core_model_loading") is None:
+        yield
+        return
+    from transformers import core_model_loading
+
+    if not hasattr(core_model_loading, "set_param_for_module"):
+        yield
+        return
+
+    orig_set_param_for_module = core_model_loading.set_param_for_module
+
+    def patched_set_param_for_module(*args, **kwargs):
+        """Wrap set_param_for_module to restore the original requires_grad."""
+        model, target_name = args[:2]
+        module_path, _, param_name = target_name.rpartition(".")
+        module_obj = model.get_submodule(module_path) if module_path else model
+
+        # Record the requires_grad value before the parameter is replaced.
+        orig_requires_grad = getattr(module_obj, param_name).requires_grad
+
+        # Delegate to the original implementation.
+        orig_set_param_for_module(*args, **kwargs)
+
+        # Restore the recorded requires_grad value on the new parameter.
+        getattr(module_obj, param_name).requires_grad = orig_requires_grad
+
+    try:
+        core_model_loading.set_param_for_module = patched_set_param_for_module
+        yield
+    finally:
+        core_model_loading.set_param_for_module = orig_set_param_for_module
```
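For reference, a minimal usage sketch of the new helper (the checkpoint path is a placeholder; this mirrors the call site in `main.py` above):

```python
import transformers

from modelopt.torch.speculative.utils import patch_transformers5_params_loading

# On transformers 5.x, any checkpoint load inside the block keeps each
# parameter's pre-existing requires_grad flag; on other versions the
# context manager is a no-op.
with patch_transformers5_params_loading():
    model = transformers.AutoModelForCausalLM.from_pretrained(
        "path/to/eagle/checkpoint",  # placeholder checkpoint path
        torch_dtype="auto",
        trust_remote_code=True,
    )
```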

tests/examples/speculative_decoding/test_eagle.py

Lines changed: 18 additions & 1 deletion
```diff
@@ -89,7 +89,7 @@ def test_llama_eagle3(tiny_llama_path, tiny_daring_anteater_path, tmp_path, eagl
         "./launch_train.sh",
         "--model", tiny_llama_path,
         "--data", tiny_daring_anteater_path,
-        "--num_epochs", "1",
+        "--num_epochs", "0.25",
         "--lr", "1e-5",
         "--mode", "eagle3",
         "--eagle_config", str(config_file),
@@ -101,6 +101,23 @@
     )


+def test_resume_training(tiny_daring_anteater_path, eagle_output_dir):
+    """Test resume training of Eagle3."""
+    run_example_command(
+        [
+            "./launch_train.sh",
+            "--model", eagle_output_dir / "eagle-tinyllama-cp1",
+            "--data", tiny_daring_anteater_path,
+            "--num_epochs", "0.5",
+            "--lr", "1e-5",
+            "--mode", "eagle3",
+            "--output_dir", eagle_output_dir / "eagle-tinyllama-cp1",
+            "--training_seq_len", "128",  # Match max_position_embeddings
+        ],
+        "speculative_decoding",
+    )
+
+
 def test_ar_validate(eagle_output_dir):
     """Test in-framework AR evaluation."""
     run_example_command(
```
