|
8 | 8 |
|
9 | 9 | import torch |
10 | 10 | import deepspeed.comm as dist |
11 | | -from torch._subclasses.fake_tensor import FakeTensorMode |
| 11 | +from torch._subclasses.fake_tensor import FakeTensorMode, maybe_get_fake_mode |
12 | 12 | from torch.fx import GraphModule, Node |
13 | 13 | from torch.fx.passes.fake_tensor_prop import FakeTensorProp |
14 | 14 | from torch.fx.experimental.symbolic_shapes import ShapeEnv |
@@ -80,7 +80,7 @@ def pass_shard_seq_dim(gm: GraphModule, example_inputs): |
80 | 80 | seq_symint = val.shape[1] |
81 | 81 | assert isinstance( |
82 | 82 | seq_symint, |
83 | | - torch.SymInt), f"expected sequence dimension to be of type `torch.SymInt` but found `{type(seq_symint)}`" |
| 83 | + torch.SymInt), f"expected sequence dimension to be of type {torch.SymInt!r} but found {type(seq_symint)!r}" |
84 | 84 |
|
85 | 85 | sym_seq_dim_node = find_node_by_name(gm, str(seq_symint)) |
86 | 86 | if sym_seq_dim_node is None: |
@@ -184,15 +184,52 @@ def pass_canonicalize(gm: GraphModule, real_inputs): |
184 | 184 |
|
185 | 185 |
|
def pass_propagate_shapes(gm: torch.fx.GraphModule, real_inputs):
    """Run fake-tensor shape propagation over ``gm``.

    Converts ``real_inputs`` into fake tensors under the graph's own fake
    mode (when one is recoverable from node metadata) and runs
    ``FakeTensorProp`` so every node's ``meta["val"]`` carries output
    shape/dtype information.

    Args:
        gm: FX graph module to annotate with shape metadata.
        real_inputs: Flat sequence of example inputs; tensors are converted
            to fake tensors, non-tensor values are passed through unchanged.
    """

    def _fake_mode_from_node(node):
        # Placeholders with a `val` entry are authoritative; for any other
        # node (or a placeholder missing `val`) fall back to Dynamo's
        # `example_value` key, then `val` — mirroring the original lookup.
        if node.op == "placeholder" and "val" in node.meta:
            candidate = node.meta["val"]
        else:
            candidate = node.meta.get("example_value", node.meta.get("val"))
        # isinstance already rejects None, so no separate None check needed.
        if isinstance(candidate, torch.Tensor):
            return maybe_get_fake_mode(candidate)
        return None

    # Reuse the graph's existing fake mode when metadata is already present.
    # Its ShapeEnv owns the symbolic dims captured during tracing, so using a
    # fresh mode here can desynchronize fake inputs from graph metadata.
    fake_mode = next(
        (mode for mode in map(_fake_mode_from_node, gm.graph.nodes) if mode is not None),
        None,
    )
    if fake_mode is None:
        # Some graphs do not carry fake tensor metadata yet; create a fallback
        # mode so FakeTensorProp can still run shape-only execution.
        fake_mode = FakeTensorMode(shape_env=ShapeEnv())

    fake_inputs = [fake_mode.from_tensor(t) if isinstance(t, torch.Tensor) else t for t in real_inputs]

    # Torch 2.9 can fail fake propagation through SDPA's masked fake-CUDA path,
    # even though this pass only needs output metadata. Temporarily clear
    # attn_mask so shape propagation can proceed, then restore it immediately;
    # SDPA output shapes are still determined by Q/K/V shapes, not mask values.
    saved_sdpa_masks = []
    for attn_node in get_sdpa_nodes(gm):
        attn_mask = attn_node.kwargs.get("attn_mask")
        if attn_mask is not None:
            saved_sdpa_masks.append((attn_node, attn_mask))
            attn_node.update_kwarg("attn_mask", None)

    try:
        # fake_inputs are already created under fake_mode above, so run
        # propagation without reconverting them into a different fake mode.
        FakeTensorProp(gm, mode=fake_mode).propagate_dont_convert_inputs(*fake_inputs)
    finally:
        for attn_node, attn_mask in saved_sdpa_masks:
            attn_node.update_kwarg("attn_mask", attn_mask)
196 | 233 |
|
197 | 234 |
|
198 | 235 | def apply_autosp(gm: GraphModule, |
|
0 commit comments