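Apply clang-format to the a8w8_moe_blk_gemm1 split-K change; pass the added `false` template argument in the existing moe_gemm1/moe_gemm2 blockscale examples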

huaiguxu · huaiguxu · commit d13213163893 · 2025-12-12T16:24:23.000+08:00
diff --git a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale.cpp b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale.cpp
@@ -171,7 +171,7 @@ using DeviceOpInstance                   = ck::tensor_operation::device::DeviceM
                //    MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
                 //  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
                 CShuffleMXDLPerWave,    CShuffleNXDLPerWave,   S<1, 32, 1, 8>, S<EVec, D0Vec, D1Vec, 1>,
-               ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, ActOP, Nswizzle, true, MulRoutedWeight, int32_t, A0DataType>;
+               ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, ActOP, Nswizzle, true, false, MulRoutedWeight, int32_t, A0DataType>;
 #else
 static constexpr ck::index_t MPerBlock = 64; using DeviceOpInstance = ck::tensor_operation::device::DeviceMoeGemmBlockScale<
                Row, Col, DsLayout, ELayout,
@@ -185,7 +185,7 @@ static constexpr ck::index_t MPerBlock = 64; using DeviceOpInstance = ck::tensor
                S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0,
                S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0,
                4,    2,   S<1, 32, 1, 8>, S<2, 1, 1, 1>,
-               ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3, ActOP, Nswizzle, true, MulRoutedWeight, int32_t, A0DataType>;
+               ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3, ActOP, Nswizzle, true, false, MulRoutedWeight, int32_t, A0DataType>;
 #endif
 // clang-format on
 
diff --git a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale_splitk.cpp b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale_splitk.cpp
@@ -38,10 +38,10 @@ using Row    = ck::tensor_layout::gemm::RowMajor;
 using Col    = ck::tensor_layout::gemm::ColumnMajor;
 using Bypass = ck::tensor_layout::BypassLayoutVerification;
 
-using A0DataType = F8;
-using A1DataType = F32;
-using B0DataType = F8;
-using B1DataType = F32;
+using A0DataType       = F8;
+using A1DataType       = F32;
+using B0DataType       = F8;
+using B1DataType       = F32;
 using EDataType        = F32;
 using AccDataType      = F32;
 using CShuffleDataType = EDataType;
@@ -56,7 +56,6 @@ using D1Layout = Col;
 using D2Layout = ELayout;
 using DsLayout = ck::Tuple<D2Layout>;
 
-
 struct MulABScaleExpertWeight
 {
     template <typename E, typename C, typename D2>
@@ -113,30 +112,30 @@ static constexpr ck::index_t Scale_Block_M = 1;
 static constexpr ck::index_t Scale_Block_N = 128;
 static constexpr ck::index_t Scale_Block_K = 128;
 
-static constexpr ck::index_t Nswizzle = false;
-static constexpr ck::index_t IsInputGemm= true; //splitk gemm1 goes to gemm2 pipeline.
-static constexpr ck::index_t IsSplitK = true; //splitk gemm1 
-static constexpr ck::index_t ActOP    = 0; // 0: gelu_and_mul, 1: silu_and_mul
-static constexpr bool MulRoutedWeight = false; //splitk gemm1 does not do routedWeight.
+static constexpr ck::index_t Nswizzle    = false;
+static constexpr ck::index_t IsInputGemm = true;  // splitk gemm1 goes to gemm2 pipeline.
+static constexpr ck::index_t IsSplitK    = true;  // splitk gemm1
+static constexpr ck::index_t ActOP       = 0;     // 0: gelu_and_mul, 1: silu_and_mul
+static constexpr bool MulRoutedWeight    = false; // splitk gemm1 does not do routedWeight.
 
 #if 1
-static constexpr ck::index_t MPerBlock = 32;
-static constexpr ck::index_t NPerBlock   = 128;
-static constexpr ck::index_t MNPerXDL    = 16;
-static constexpr ck::index_t MXDLPerWave = MPerBlock / (MNPerXDL * 1);
-static constexpr ck::index_t NXDLPerWave = NPerBlock / (MNPerXDL * 4);
+static constexpr ck::index_t MPerBlock           = 32;
+static constexpr ck::index_t NPerBlock           = 128;
+static constexpr ck::index_t MNPerXDL            = 16;
+static constexpr ck::index_t MXDLPerWave         = MPerBlock / (MNPerXDL * 1);
+static constexpr ck::index_t NXDLPerWave         = NPerBlock / (MNPerXDL * 4);
 static constexpr ck::index_t CShuffleMXDLPerWave = MXDLPerWave;
 static constexpr ck::index_t CShuffleNXDLPerWave = NXDLPerWave;
-static constexpr ck::index_t BLOCKSIZE   = 256;
+static constexpr ck::index_t BLOCKSIZE           = 256;
 
-static constexpr ck::index_t KPerBlock   = 128 / sizeof(A0DataType);
-static constexpr ck::index_t AK1         = 16 / sizeof(A0DataType);
-static constexpr ck::index_t BK1         = 16 / sizeof(B0DataType);
-static constexpr ck::index_t EVec        = 16 / sizeof(EDataType);
-static constexpr ck::index_t D0Vec       = 1;
-static constexpr ck::index_t D1Vec       = 1;
+static constexpr ck::index_t KPerBlock = 128 / sizeof(A0DataType);
+static constexpr ck::index_t AK1       = 16 / sizeof(A0DataType);
+static constexpr ck::index_t BK1       = 16 / sizeof(B0DataType);
+static constexpr ck::index_t EVec      = 16 / sizeof(EDataType);
+static constexpr ck::index_t D0Vec     = 1;
+static constexpr ck::index_t D1Vec     = 1;
 
-using DeviceOpInstance                   = ck::tensor_operation::device::DeviceMoeGemmBlockScale
+using DeviceOpInstance = ck::tensor_operation::device::DeviceMoeGemmBlockScale
     // clang-format off
         <      Row, Col, DsLayout, ELayout,
                A0DataType, A1DataType, B0DataType, B1DataType, DsDataType, EDataType, AccDataType, CShuffleDataType,
@@ -183,8 +182,8 @@ int main(int argc, char* argv[])
     bool time_kernel     = true;
 #if 1
     // GEMM shape
-    ck::index_t N       = 4096;
-    ck::index_t K       = 6144;
+    ck::index_t N = 4096;
+    ck::index_t K = 6144;
     // ck::index_t N       = 128;
     // ck::index_t K       = 512;
     ck::index_t experts = 8;
@@ -397,30 +396,29 @@ int main(int argc, char* argv[])
 
     b0_device_buf.ToDevice(b0_preshuffled.mData.data());
 
-    auto invoker = device_op.MakeInvoker();
-    auto argument =
-        device_op.MakeArgument(sorted_token_ids_dev.GetDeviceBuffer(),
-                               expert_ids_dev.GetDeviceBuffer(),
-                               max_token_id_dev.GetDeviceBuffer(),
-                               a0_device_buf.GetDeviceBuffer(),
-                               b0_device_buf.GetDeviceBuffer(),
-                               std::array<const void*, NumDTensor>{nullptr},
-                               e_device_buf.GetDeviceBuffer(),
-                               tokens,
-                               topk,
-                               sorted_size,
-                               N,
-                               K,
-                               StrideA,
-                               StrideB,
-                               StrideDs,
-                               StrideE,
-                               a1_device_buf.GetDeviceBuffer(),
-                               b1_device_buf.GetDeviceBuffer(),
-                               KBatch,
-                               a_element_op,
-                               b_element_op,
-                               cde_element_op);
+    auto invoker  = device_op.MakeInvoker();
+    auto argument = device_op.MakeArgument(sorted_token_ids_dev.GetDeviceBuffer(),
+                                           expert_ids_dev.GetDeviceBuffer(),
+                                           max_token_id_dev.GetDeviceBuffer(),
+                                           a0_device_buf.GetDeviceBuffer(),
+                                           b0_device_buf.GetDeviceBuffer(),
+                                           std::array<const void*, NumDTensor>{nullptr},
+                                           e_device_buf.GetDeviceBuffer(),
+                                           tokens,
+                                           topk,
+                                           sorted_size,
+                                           N,
+                                           K,
+                                           StrideA,
+                                           StrideB,
+                                           StrideDs,
+                                           StrideE,
+                                           a1_device_buf.GetDeviceBuffer(),
+                                           b1_device_buf.GetDeviceBuffer(),
+                                           KBatch,
+                                           a_element_op,
+                                           b_element_op,
+                                           cde_element_op);
 
     if(!device_op.IsSupportedArgument(argument))
     {
diff --git a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8_blockscale.cpp b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8_blockscale.cpp
@@ -165,7 +165,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceMoeGemmBlockScale<
                S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, AK1, AK1, 0,
                S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, AK1, AK1, 0,
                2,        2,         S<1, CShuffleMLane, 1, CShuffleNLane>, S<EVec, D0Vec, D1Vec, D2Vec>,
-               ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, 0, false, false, MulRoutedWeight, int32_t, A0DataType>;
+               ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, 0, false, false, false, MulRoutedWeight, int32_t, A0DataType>;
 
 #else
 static constexpr ck::index_t MPerBlock = 64; using DeviceOpInstance = ck::tensor_operation::device::DeviceMoeGemmBlockScale<
@@ -180,7 +180,7 @@ static constexpr ck::index_t MPerBlock = 64; using DeviceOpInstance = ck::tensor
                S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0,
                S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0,
                2,    2,   S<1, 32, 1, 8>, S<2, 1, 1, 1>,
-               ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3, 0, false, false, MulRoutedWeight, int32_t, A0DataType>;
+               ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3, 0, false, false, false, MulRoutedWeight, int32_t, A0DataType>;
 #endif
 // clang-format on
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm_blockscale.hpp b/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm_blockscale.hpp
@@ -203,7 +203,8 @@ struct DeviceMoeGemmBlockScale
             }
 
             index_t gdx, gdy, gdz;
-            std::tie(gdx, gdy, gdz) = GridwiseGemm::CalculateGridSize(arg.M, arg.N * (IsInputGemm && IsSplitK ? 2 : 1), arg.K, arg.KBatch);
+            std::tie(gdx, gdy, gdz) = GridwiseGemm::CalculateGridSize(
+                arg.M, arg.N * (IsInputGemm && IsSplitK ? 2 : 1), arg.K, arg.KBatch);
 
             float ave_time = 0;
 
@@ -236,7 +237,7 @@ struct DeviceMoeGemmBlockScale
                         DsSize[i] = ds_grid_desc_m_n[i].GetElementSpaceSize() * sizeof(DDataType);
                     });
                     ck::utility::RotatingMemWrapperMultiD<typename GridwiseGemm::Argument,
-                                                                      DsDataType>
+                                                          DsDataType>
                         rotating_mem(arg_,
                                      stream_config.rotating_count,
                                      size_a_buffer,
@@ -253,7 +254,8 @@ struct DeviceMoeGemmBlockScale
                         // if(arg_.KBatch > 1)
                         //     hipGetErrorString(hipMemsetAsync(arg_.p_c_grid,
                         //                                      0,
-                        //                                      arg_.M * arg_.N * sizeof(CDataType) * (IsInputGemm && IsSplitK ? 2 : 1),
+                        //                                      arg_.M * arg_.N * sizeof(CDataType)
+                        //                                      * (IsInputGemm && IsSplitK ? 2 : 1),
                         //                                      stream_config.stream_id_));
                     };
 
@@ -271,7 +273,8 @@ struct DeviceMoeGemmBlockScale
                     // if(arg.KBatch > 1)
                     //     hipGetErrorString(hipMemsetAsync(arg.p_c_grid,
                     //                                      0,
-                    //                                      arg.M * arg.N * sizeof(CDataType) * (IsInputGemm && IsSplitK ? 2 : 1),
+                    //                                      arg.M * arg.N * sizeof(CDataType) *
+                    //                                      (IsInputGemm && IsSplitK ? 2 : 1),
                     //                                      stream_config.stream_id_));
 
                     ave_time = launch_and_time_kernel(
@@ -290,8 +293,9 @@ struct DeviceMoeGemmBlockScale
 
             constexpr index_t minimum_occupancy = (estimated_reg_total >= 256) ? 1 : 2;
 
-            constexpr auto MemoryDataOp =
-                (IsInputGemm && !IsSplitK) ? InMemoryDataOperationEnum::Set : InMemoryDataOperationEnum::AtomicAdd;
+            constexpr auto MemoryDataOp = (IsInputGemm && !IsSplitK)
+                                              ? InMemoryDataOperationEnum::Set
+                                              : InMemoryDataOperationEnum::AtomicAdd;
 
             if(has_main_k_block_loop)
             {
@@ -442,7 +446,7 @@ struct DeviceMoeGemmBlockScale
         {
             return false;
         }
-        if (arg.KBatch > 1 && arg.K % (KPerBlock * arg.KBatch) != 0)
+        if(arg.KBatch > 1 && arg.K % (KPerBlock * arg.KBatch) != 0)
         {
             // Not support Kpadding with KBatch > 1
             return false;
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm_blockscale.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm_blockscale.hpp
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm1_blockscale_splitk.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm1_blockscale_splitk.hpp