Skip to content

Commit 9dddcb3

Browse files
authored
Fix compatibility issue with transformers 5.0+ (#2328)
## Describe your changes This pull request introduces compatibility updates for Hugging Face Transformers 5.0 and improves handling of dynamic cache and input formats in Olive's ONNX conversion and training utilities. It also updates tests and requirements to reflect these changes and ensure robust model export and training workflows. ### Transformers 5.0 Compatibility * Added patching and conversion utilities for `DynamicLayer.lazy_initialization`, `past_key_values`, and dynamic shapes to support the new DynamicCache format in Transformers >= 5.0. This ensures models using dynamic cache export correctly with `torch.export`. * Updated `_export_pytorch_model` logic to apply the new patches and conversions only for Transformers >= 5.0, while maintaining legacy support for older versions. ### Training Argument Handling * Improved filtering of training arguments in `create_training_args` to remove fields not valid for Transformers 5.0 and exclude `None` values, allowing Transformers to use its own defaults. ### Test Suite Updates * Modified model loading and metadata tests to remove `trust_remote_code` parameter and update expected file counts and tokenizer types for Transformers 5.0. [[1]](diffhunk://#diff-af681b2feed22286034d304b653185d2a4dc5d680e7d715a6ad41a1c731ff0fcL30-L45) [[2]](diffhunk://#diff-af681b2feed22286034d304b653185d2a4dc5d680e7d715a6ad41a1c731ff0fcL76-R80) [[3]](diffhunk://#diff-af681b2feed22286034d304b653185d2a4dc5d680e7d715a6ad41a1c731ff0fcL97-R90) [[4]](diffhunk://#diff-af681b2feed22286034d304b653185d2a4dc5d680e7d715a6ad41a1c731ff0fcL129-R121) * Updated model output comparison in rotation tests to cast logits to `float` before comparison, ensuring consistency across dtypes. ### Requirements Adjustments * Restricted `onnxscript` version to `<0.6.1` and removed the Transformers version pin, reflecting confidence in test suite compatibility with Transformers 5.0. 
[[1]](diffhunk://#diff-1ce09e5a57d7791711f12f84ecb7e089e925a2929b719d587561e8e58c7e4b90L24-R24) [[2]](diffhunk://#diff-1ce09e5a57d7791711f12f84ecb7e089e925a2929b719d587561e8e58c7e4b90L40-L41) ## Checklist before requesting a review - [ ] Add unit tests for this change. - [ ] Make sure all tests can pass. - [ ] Update documents if necessary. - [ ] Lint and apply fixes to your code by running `lintrunner -a` - [ ] Is this a user-facing change? If yes, give a description of this change to be included in the release notes. ## (Optional) Issue link
1 parent 86f9469 commit 9dddcb3

7 files changed

Lines changed: 167 additions & 40 deletions

File tree

olive/passes/onnx/conversion.py

Lines changed: 146 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@
3838
from olive.passes.onnx.common import get_external_data_config, ir_model_to_olive_model
3939
from olive.passes.pass_config import BasePassConfig, PassConfigParam, get_user_script_data_config
4040

41+
# pylint: disable=W0212
42+
4143
logger = logging.getLogger(__name__)
4244

4345

@@ -57,6 +59,128 @@ def forward(self, *input_data, **input_dict):
5759
return self.model(*input_data, **input_dict)
5860

5961

62+
def _register_dynamic_cache_export_support():
63+
"""Utilities for `DynamicCache` <> torch.export support."""
64+
from transformers.cache_utils import DynamicCache, DynamicLayer, DynamicSlidingWindowLayer
65+
66+
def _get_cache_dict(cache: DynamicCache):
67+
"""Convert cache to dictionary format for pytree operations."""
68+
if any(not isinstance(layer, (DynamicLayer, DynamicSlidingWindowLayer)) for layer in cache.layers):
69+
raise RuntimeError("This pytree flattening function should only be applied to DynamicCache")
70+
71+
return {
72+
"cache": [(layer.keys, layer.values) for layer in cache.layers if layer.keys is not None],
73+
}
74+
75+
try:
76+
torch.utils._pytree.register_pytree_node(
77+
DynamicCache,
78+
lambda dynamic_cache: torch.utils._pytree._dict_flatten(_get_cache_dict(dynamic_cache)),
79+
_unflatten_dynamic_cache,
80+
serialized_type_name=f"{DynamicCache.__module__}.{DynamicCache.__name__}",
81+
flatten_with_keys_fn=lambda dynamic_cache: torch.utils._pytree._dict_flatten_with_keys(
82+
_get_cache_dict(dynamic_cache)
83+
),
84+
)
85+
# TODO (team): This won't be needed in torch 2.7.
86+
torch.fx._pytree.register_pytree_flatten_spec(
87+
DynamicCache,
88+
lambda cache, spec: torch.fx._pytree._dict_flatten_spec(_get_cache_dict(cache), spec),
89+
)
90+
# Catching this in case there are multiple runs for some test runs
91+
except ValueError as e:
92+
if "already registered as pytree node" not in str(e):
93+
raise
94+
95+
96+
def _unflatten_dynamic_cache(values, context: torch.utils._pytree.Context):
97+
from transformers.cache_utils import DynamicCache
98+
99+
dictionary = torch.utils._pytree._dict_unflatten(values, context)
100+
cache = DynamicCache()
101+
# Reconstruct layers from keys and values lists
102+
cache_list = dictionary.get("cache", [])
103+
for i, (key, value) in enumerate(cache_list):
104+
cache.update(key, value, i)
105+
return cache
106+
107+
108+
def _patch_dynamic_layer_for_export():
109+
"""Patch DynamicLayer.lazy_initialization for torch.export compatibility (transformers >= 5.0).
110+
111+
The original uses torch.tensor([]) which creates a 1D empty tensor (shape [0]).
112+
torch.export needs consistent tensor ranks, so we use torch.narrow + torch.empty_like
113+
to preserve the full shape (e.g. [batch, heads, 0, head_dim]).
114+
"""
115+
from transformers.cache_utils import DynamicLayer
116+
117+
if not hasattr(DynamicLayer, "lazy_initialization"):
118+
return
119+
120+
def patched_lazy_initialization(self, key_states: torch.Tensor, value_states: torch.Tensor = None):
121+
self.dtype, self.device = key_states.dtype, key_states.device
122+
like = torch.narrow(key_states, dim=-2, start=0, length=0)
123+
if hasattr(key_states, "fake_mode"):
124+
with key_states.fake_mode:
125+
self.keys = torch.empty_like(like, dtype=self.dtype, device=self.device)
126+
self.values = torch.empty_like(like, dtype=self.dtype, device=self.device)
127+
else:
128+
self.keys = torch.empty_like(like, dtype=self.dtype, device=self.device)
129+
self.values = torch.empty_like(like, dtype=self.dtype, device=self.device)
130+
self.is_initialized = True
131+
132+
DynamicLayer.lazy_initialization = patched_lazy_initialization
133+
logger.debug("Patched DynamicLayer.lazy_initialization for torch.export compatibility.")
134+
135+
136+
def _convert_past_key_values_to_dynamic_cache(dummy_kwargs: dict, config=None) -> dict:
137+
"""Convert legacy list-format past_key_values to DynamicCache (transformers >= 5.0).
138+
139+
Transformers 5.0 models expect DynamicCache objects, not lists of (key, value) tensors.
140+
When config is provided, the DynamicCache will create correct layer types (e.g.
141+
DynamicSlidingWindowLayer for models using sliding window attention).
142+
"""
143+
pkv = dummy_kwargs.get("past_key_values")
144+
if pkv is None or not isinstance(pkv, (list, tuple)):
145+
return dummy_kwargs
146+
147+
# Check if it's legacy format: list of [key, value] pairs (each with exactly 2 elements)
148+
if not pkv or not isinstance(pkv[0], (list, tuple)) or len(pkv[0]) != 2:
149+
return dummy_kwargs
150+
151+
from transformers.cache_utils import DynamicCache
152+
153+
dc = DynamicCache(config=config)
154+
for layer_idx, kv in enumerate(pkv):
155+
dc.update(kv[0], kv[1], layer_idx=layer_idx)
156+
dummy_kwargs["past_key_values"] = dc
157+
logger.debug("Converted past_key_values from legacy list format to DynamicCache.")
158+
return dummy_kwargs
159+
160+
161+
def _convert_dynamic_shapes_for_dynamic_cache(dynamic_shapes: dict) -> dict:
162+
"""Convert dynamic_shapes for past_key_values from nested list to DynamicCache pytree format.
163+
164+
The old format is: [[key_shape, val_shape], ...] (one pair per layer)
165+
The DynamicCache pytree is: {"cache": [(key0, val0), (key1, val1), ...]}
166+
matching the structure from _register_dynamic_cache_export_support().
167+
"""
168+
pkv_shapes = dynamic_shapes.get("past_key_values")
169+
if pkv_shapes is None or not isinstance(pkv_shapes, (list, tuple)):
170+
return dynamic_shapes
171+
172+
if not pkv_shapes or not isinstance(pkv_shapes[0], (list, tuple)) or len(pkv_shapes[0]) != 2:
173+
return dynamic_shapes
174+
175+
# Convert [[key0, val0], [key1, val1], ...] -> {"cache": [(key0, val0), (key1, val1), ...]}
176+
# matching DynamicCache pytree: _dict_flatten({"cache": [(keys, values), ...]})
177+
dynamic_shapes["past_key_values"] = {
178+
"cache": [tuple(layer) for layer in pkv_shapes],
179+
}
180+
logger.debug("Converted dynamic_shapes for past_key_values to DynamicCache pytree format.")
181+
return dynamic_shapes
182+
183+
60184
def _patch_model_if_necessary(pytorch_model: torch.nn.Module):
61185
if not isinstance(pytorch_model, PreTrainedModel):
62186
return
@@ -179,9 +303,6 @@ def _export_pytorch_model(
179303
if torch_dtype:
180304
pytorch_model = pytorch_model.to(torch_dtype)
181305

182-
# Apply any necessary patches
183-
_patch_model_if_necessary(pytorch_model)
184-
185306
# get input and output names, and dynamic axes
186307
assert io_config is not None, "Cannot get io_config for the model."
187308
io_config = validate_config(io_config, IoConfig)
@@ -194,8 +315,6 @@ def _export_pytorch_model(
194315
# is taken, the old export always writes a model to the disk. When that happens we need to
195316
# load the model back into IR and load all the external tensor to memory
196317
with tempfile.TemporaryDirectory(prefix="olive_tmp") as tmp_dir:
197-
tmp_model_path = resolve_onnx_path(tmp_dir)
198-
199318
if dynamo:
200319
# Take the "release" version so that dev builds like 2.5.0dev1234 are treated as 2.5.0
201320
if _torch_is_older_than("2.7.0") and (
@@ -212,24 +331,39 @@ def _export_pytorch_model(
212331
"Please upgrade PyTorch to 2.6.0 or above."
213332
)
214333

215-
# Register DynamicCache export support
216-
from transformers.integrations.executorch import register_dynamic_cache_export_support
217-
218-
register_dynamic_cache_export_support()
219-
220334
if isinstance(dummy_inputs, dict):
221335
dummy_kwargs = dummy_inputs
222336
dummy_inputs = ()
223337
else:
224338
dummy_kwargs = {}
225339
dummy_inputs = tuple(dummy_inputs)
226340

341+
# Apply patches for DynamicCache / past_key_values compatibility
342+
if version.parse(transformers.__version__) >= version.parse("5.0"):
343+
# transformers >= 5.0: DynamicCache refactored to use DynamicLayer
344+
345+
_register_dynamic_cache_export_support()
346+
_patch_dynamic_layer_for_export()
347+
model_config = getattr(pytorch_model, "config", None)
348+
dummy_kwargs = _convert_past_key_values_to_dynamic_cache(dummy_kwargs, config=model_config)
349+
if io_config.dynamic_shapes:
350+
io_config.dynamic_shapes = _convert_dynamic_shapes_for_dynamic_cache(io_config.dynamic_shapes)
351+
else:
352+
# transformers < 5.0: patch forward to convert list <-> DynamicCache
353+
_patch_model_if_necessary(pytorch_model)
354+
227355
# NOTE: Usually validation is done in io_config.py, but because
228356
# dynamic_shapes has nested complexity, and it can't be validated multiple
229357
# times like others, we validate it here.
230358
io_config.dynamic_shapes, dummy_inputs, dummy_kwargs = _validate_dynamic_shapes(
231359
io_config.dynamic_shapes, dummy_inputs, dummy_kwargs, pytorch_model
232360
)
361+
# torch.export requires strict type match between inputs and dynamic_shapes;
362+
# _validate_dynamic_shapes may return OrderedDict, so convert back to plain dict
363+
if isinstance(io_config.dynamic_shapes, collections.OrderedDict):
364+
io_config.dynamic_shapes = dict(io_config.dynamic_shapes)
365+
if isinstance(dummy_kwargs, collections.OrderedDict):
366+
dummy_kwargs = dict(dummy_kwargs)
233367

234368
# When dynamo=True, PyTorch prefers dynamic_shapes over dynamic_axes.
235369
# If dynamic_shapes is None and fallback is enabled, don't pass dynamic_axes
@@ -239,15 +373,13 @@ def _export_pytorch_model(
239373
onnx_program = torch.onnx.export( # pylint: disable=unexpected-keyword-arg,no-value-for-parameter
240374
pytorch_model,
241375
dummy_inputs,
242-
tmp_model_path, # needed for fallback=True
243376
kwargs=dummy_kwargs,
244377
opset_version=config.target_opset,
245378
input_names=io_config.input_names,
246379
output_names=io_config.output_names,
247380
dynamic_axes=dynamic_axes_for_export,
248381
dynamic_shapes=io_config.dynamic_shapes,
249382
dynamo=True,
250-
fallback=False,
251383
optimize=config.optimize,
252384
report=logger.isEnabledFor(logging.DEBUG),
253385
)
@@ -264,6 +396,8 @@ def _export_pytorch_model(
264396
# default is True in 2.9.0 and later
265397
dynamo_args["dynamo"] = False
266398

399+
tmp_model_path = resolve_onnx_path(tmp_dir)
400+
267401
torch.onnx.export(
268402
pytorch_model,
269403
dummy_inputs,

olive/passes/pytorch/train_utils.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,11 @@ def create_training_args(self) -> transformers.TrainingArguments:
8383
if version.parse(transformers_version) < version.parse("4.41") and "eval_strategy" in args:
8484
args["evaluation_strategy"] = args.pop("eval_strategy")
8585
extra_args = args.pop("extra_args")
86+
# Filter out fields that are not valid TrainingArguments parameters (e.g. overwrite_output_dir
87+
# was removed in transformers 5.0 but is still used by Olive's own logic) and None values
88+
# so that transformers uses its own defaults
89+
training_args_fields = {f.name for f in dataclasses.fields(transformers.TrainingArguments) if f.init}
90+
args = {k: v for k, v in args.items() if k in training_args_fields and v is not None}
8691
return transformers.TrainingArguments(**args, **extra_args)
8792

8893

test/model/test_hf_model.py

Lines changed: 11 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -27,23 +27,16 @@ def setup(self):
2727
self.local_path = huggingface_hub.snapshot_download(self.model_name, revision=self.revision)
2828

2929
@pytest.mark.parametrize("local", [True, False])
30-
@pytest.mark.parametrize("trust_remote_code", [True, False])
31-
def test_load_model(self, local, trust_remote_code):
30+
def test_load_model(self, local):
3231
olive_model = HfModelHandler(
3332
model_path=self.local_path if local else self.model_name,
3433
task=self.task,
35-
load_kwargs={"trust_remote_code": trust_remote_code, "revision": self.revision},
34+
load_kwargs={"revision": self.revision},
3635
)
3736

3837
pytorch_model = olive_model.load_model()
3938
actual_class_path = f"{pytorch_model.__module__}.{pytorch_model.__class__.__name__}"
40-
if trust_remote_code:
41-
# When using remote code, the model is loaded from transformers_modules
42-
assert actual_class_path.startswith("transformers_modules.")
43-
assert actual_class_path.endswith(".modeling_phi3.Phi3ForCausalLM")
44-
else:
45-
# When not using remote code, the model is loaded from transformers
46-
assert actual_class_path == "transformers.models.phi3.modeling_phi3.Phi3ForCausalLM"
39+
assert actual_class_path == "transformers.models.phi3.modeling_phi3.Phi3ForCausalLM"
4740

4841
@pytest.mark.parametrize("local", [True, False])
4942
def test_load_model_with_kwargs(self, local):
@@ -73,19 +66,18 @@ def test_save_metadata(self, local, trust_remote_code, tokenizer_exists, tmp_pat
7366
if tokenizer_exists:
7467
olive_model.get_hf_tokenizer().save_pretrained(tmp_path)
7568
saved_filepaths = olive_model.save_metadata(tmp_path)
76-
# transformers>=4.53.x
77-
assert len(saved_filepaths) == (4 if tokenizer_exists else 10)
69+
# transformers>=5.0.0
70+
assert len(saved_filepaths) == (4 if tokenizer_exists else 7)
7871
assert all(Path(fp).exists() for fp in saved_filepaths)
7972
assert isinstance(transformers.AutoConfig.from_pretrained(tmp_path), transformers.Phi3Config)
80-
assert isinstance(transformers.AutoTokenizer.from_pretrained(tmp_path), transformers.LlamaTokenizerFast)
73+
assert isinstance(transformers.AutoTokenizer.from_pretrained(tmp_path), transformers.PreTrainedTokenizerBase)
8174

8275
@pytest.mark.parametrize("local", [True, False])
83-
@pytest.mark.parametrize("trust_remote_code", [True, False])
84-
def test_save_pretrained_metadata(self, local, trust_remote_code, tmp_path):
76+
def test_save_pretrained_metadata(self, local, tmp_path):
8577
olive_model = HfModelHandler(
8678
model_path=self.local_path if local else self.model_name,
8779
task=self.task,
88-
load_kwargs={"trust_remote_code": trust_remote_code, "revision": self.revision},
80+
load_kwargs={"revision": self.revision},
8981
)
9082

9183
# modify the config and save the model
@@ -94,8 +86,8 @@ def test_save_pretrained_metadata(self, local, trust_remote_code, tmp_path):
9486
loaded_model.save_pretrained(tmp_path)
9587

9688
saved_filepaths = olive_model.save_metadata(tmp_path)
97-
# generation config is also saved, transformers>=4.53.x
98-
assert len(saved_filepaths) == 9
89+
# generation config is also saved, transformers>=5.0.0
90+
assert len(saved_filepaths) == 6
9991

10092
with open(tmp_path / "config.json") as f:
10193
config = json.load(f)
@@ -126,7 +118,7 @@ def test_save_metadata_with_module_files(trust_remote_code, tmp_path):
126118
assert f"{config.__module__}.{config.__class__.__name__}" == expected_class_name
127119
assert isinstance(
128120
transformers.AutoTokenizer.from_pretrained(tmp_path, **load_kwargs),
129-
transformers.LlamaTokenizerFast,
121+
transformers.PreTrainedTokenizerBase,
130122
)
131123

132124

test/passes/onnx/test_conversion.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
# Licensed under the MIT License.
44
# --------------------------------------------------------------------------
55
import platform
6-
import shutil
76
from itertools import chain
87
from pathlib import Path
98
from unittest.mock import patch
@@ -211,9 +210,7 @@ def mock_onnx_export_func(*args, **kwargs):
211210
nonlocal dummy_kwargs
212211
# For dynamo export, inputs are passed via kwargs parameter
213212
dummy_kwargs = kwargs.get("kwargs", {})
214-
_, _, output_path = args
215-
shutil.copyfile(ONNX_MODEL_PATH, output_path)
216-
return MockOnnxProgram(output_path)
213+
return MockOnnxProgram(ONNX_MODEL_PATH)
217214

218215
output_folder = tmp_path / "onnx"
219216
output_folder.mkdir(parents=True, exist_ok=True)

test/passes/pytorch/test_rotate.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,8 @@ def common_test_rotate(rotate_pass, tmp_path, model_path, rotate_mode, atol, **c
3535
with torch.no_grad():
3636
original_output = original_model(i)
3737
rotated_output = rotated_model(i)
38-
assert torch.allclose(original_output.logits, rotated_output.logits, atol=atol)
38+
# Cast to same dtype before comparison since rotated model may be saved/loaded in a different dtype
39+
assert torch.allclose(original_output.logits.float(), rotated_output.logits.float(), atol=atol)
3940

4041

4142
@pytest.mark.parametrize("model_path", ["tiny-phi3", "tiny-llama"])

test/requirements-test.txt

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,5 +37,3 @@ sentencepiece
3737
soundfile
3838
tabulate
3939
torchvision
40-
# Remove version pin when the tests are fixed
41-
transformers<5.0.0

test/utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -77,8 +77,8 @@ def get_pytorch_model(batch_size=1):
7777
)
7878

7979

80-
def get_hf_model(model_path="hf-internal-testing/tiny-random-gptj"):
81-
return HfModelHandler(model_path=model_path)
80+
def get_hf_model(model_path="hf-internal-testing/tiny-random-LlamaForCausalLM"):
81+
return HfModelHandler(model_path=model_path, task="text-generation")
8282

8383

8484
def get_hf_model_config():

0 commit comments

Comments
 (0)