@@ -167,6 +167,7 @@ def forward(
             num_activated_expert_per_token_offset,
         )

+        ctx.has_num_activated_expert_per_token_offset = num_activated_expert_per_token_offset is not None
         ctx.mark_non_differentiable(y1)
         ctx.set_materialize_grads(False)

@@ -260,7 +261,10 @@ def backward(ctx, _: None, dz: torch.Tensor):
         grads.extend([dx_reduced, dw1])
         if db1 is not None:
             grads.append(db1)
-        grads.extend([None] * 5)
+        if ctx.has_num_activated_expert_per_token_offset:
+            grads.extend([None] * 5)
+        else:
+            grads.extend([None] * 4)
         return tuple(grads)


@@ -280,7 +284,7 @@ def forward(
         x_gather_idx: torch.Tensor,
         s_scatter_idx: torch.Tensor,
         s_reverse_scatter_idx: torch.Tensor,
-        num_activated_expert_per_token_offset: torch.Tensor,
+        num_activated_expert_per_token_offset: torch.Tensor | None,
         is_varlen_K: bool,
         activation_type: ActivationType,
     ) -> torch.Tensor:
@@ -335,6 +339,7 @@ def forward(
             s_scatter_idx,
             s_reverse_scatter_idx,
         )
+        ctx.has_num_activated_expert_per_token_offset = num_activated_expert_per_token_offset is not None

         return o

@@ -436,7 +441,12 @@ def backward(ctx, dout: torch.Tensor):
         grads.extend([None, dz, dw2])
         if db2 is not None:
             grads.append(db2)
-        grads.extend([ds, *[None] * 5])
+
+        if ctx.has_num_activated_expert_per_token_offset:
+            grads.extend([ds, *[None] * 5])
+        else:
+            grads.extend([ds, *[None] * 4])
+
         return tuple(grads)

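The variable-length `None` padding above follows from the `torch.autograd.Function` contract: `backward` must return one gradient entry per argument passed to `apply()` (with `None` in the slots of non-differentiable inputs), so a trailing argument that the caller sometimes omits changes the expected tuple length. Below is a minimal sketch of that pattern with hypothetical names (`MulAddOptional`, `offset`), not this PR's MoE kernel:

```python
import torch


class MulAddOptional(torch.autograd.Function):
    """Toy illustration: an optional trailing tensor whose presence is
    recorded on ctx so backward can size its gradient tuple to match."""

    @staticmethod
    def forward(ctx, x, offset=None):
        # Record presence, mirroring ctx.has_num_activated_expert_per_token_offset.
        ctx.has_offset = offset is not None
        return x * 2 + offset if offset is not None else x * 2

    @staticmethod
    def backward(ctx, dout):
        grads = [dout * 2]  # gradient w.r.t. x
        if ctx.has_offset:
            # Extra None slot for the non-differentiable offset argument;
            # autograd errors out if the gradient count does not match
            # the number of forward inputs.
            grads.append(None)
        return tuple(grads)


x = torch.randn(4, requires_grad=True)
MulAddOptional.apply(x).sum().backward()                  # backward returns 1 grad
MulAddOptional.apply(x, torch.zeros(4)).sum().backward()  # backward returns 2 grads
```

With `offset` supplied, autograd expects two gradients; without it, one. This is the same reason the diff extends `grads` by five or four trailing `None`s depending on the flag stashed in `forward`.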