From 9f6c3c0bc852de5156d9e40a6278ae59cb4fe5d4 Mon Sep 17 00:00:00 2001 From: lonelygsh <490991455@qq.com> Date: Wed, 15 Apr 2026 15:15:21 +0800 Subject: [PATCH 01/28] support eb5 fp4 cuda_graph --- fastdeploy/model_executor/forward_meta.py | 2 ++ fastdeploy/model_executor/utils.py | 3 +++ 2 files changed, 5 insertions(+) diff --git a/fastdeploy/model_executor/forward_meta.py b/fastdeploy/model_executor/forward_meta.py index 44cf528bed3..30bf0ef0c18 100644 --- a/fastdeploy/model_executor/forward_meta.py +++ b/fastdeploy/model_executor/forward_meta.py @@ -158,6 +158,8 @@ class ForwardMeta: # for prefill exist_prefill: bool = False + audio_token_num: int = 0 + # for mla & dsa position_ids: Optional[paddle.Tensor] = None mask_encoder_batch: Optional[paddle.Tensor] = None diff --git a/fastdeploy/model_executor/utils.py b/fastdeploy/model_executor/utils.py index e63603047be..abe09a082ec 100644 --- a/fastdeploy/model_executor/utils.py +++ b/fastdeploy/model_executor/utils.py @@ -131,6 +131,9 @@ def slice_fn(weight_or_paramter, output_dim, start, end, step=1): def process_weight_transpose(layer, weight_name): weight = getattr(layer, weight_name) + if not weight._is_initialized(): + logger.info("权重没初始化啊!") + return if len(weight.shape) == 2: weight_shape = weight.shape[::-1] elif len(weight.shape) == 3: From 55d1a0537c6a499a7641c925955ee7576e829ddb Mon Sep 17 00:00:00 2001 From: lonelygsh <490991455@qq.com> Date: Wed, 15 Apr 2026 16:01:44 +0800 Subject: [PATCH 02/28] update --- fastdeploy/model_executor/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastdeploy/model_executor/utils.py b/fastdeploy/model_executor/utils.py index abe09a082ec..7d1ac37ae28 100644 --- a/fastdeploy/model_executor/utils.py +++ b/fastdeploy/model_executor/utils.py @@ -132,7 +132,7 @@ def slice_fn(weight_or_paramter, output_dim, start, end, step=1): def process_weight_transpose(layer, weight_name): weight = getattr(layer, weight_name) if not weight._is_initialized(): - logger.info("权重没初始化啊!") + logger.warning(f"Weight {weight_name} not initialized, skipping transpose.") return if len(weight.shape) == 2: weight_shape = weight.shape[::-1] From 3509714e929320a111c9a8b42a8b681e3d2b8674 Mon Sep 17 00:00:00 2001 From: lonelygsh <490991455@qq.com> Date: Sat, 18 Apr 2026 15:56:11 +0800 Subject: [PATCH 03/28] merge develop --- .../model_executor/layers/moe/flashinfer_cutedsl_moe.py | 3 +++ fastdeploy/model_executor/layers/quantization/nvfp4.py | 6 ++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/fastdeploy/model_executor/layers/moe/flashinfer_cutedsl_moe.py b/fastdeploy/model_executor/layers/moe/flashinfer_cutedsl_moe.py index 654b9090ec6..48126a4ccca 100644 --- a/fastdeploy/model_executor/layers/moe/flashinfer_cutedsl_moe.py +++ b/fastdeploy/model_executor/layers/moe/flashinfer_cutedsl_moe.py @@ -159,7 +159,10 @@ def flashinfer_cutedsl_moe_masked( # === GEMM1: gate+up projection === # grouped_gemm_nt_masked requires output in [m, 2*n, l] layout + # logger.info(f"num_experts:{num_experts}") + # logger.info(f"m:{m}") gateup_output = paddle.empty([num_experts, m, n * 2], dtype=paddle.bfloat16) + gateup_output = gateup_output.transpose([1, 2, 0]) # [m, 2*n, num_experts] # w1: [E, 2*n, k//2] → _perm(., 1, 2, 0) → [2*n, k//2, E] diff --git a/fastdeploy/model_executor/layers/quantization/nvfp4.py b/fastdeploy/model_executor/layers/quantization/nvfp4.py index 7a22b3e1260..ff65b5cada4 100644 --- a/fastdeploy/model_executor/layers/quantization/nvfp4.py +++ b/fastdeploy/model_executor/layers/quantization/nvfp4.py @@ -670,6 +670,8 @@ def apply_ep_prefill( # 1. top experts and weights gate_out = gate(x.cast("float32")) + logger.info(f"gate_out.shape:{gate_out.shape}") + gate_out = paddle.randn(gate_out.shape, dtype="float32") topk_idx, topk_weights = self.ep_prefill_runner.moe_select(layer, gate_out) hidden_size = x.shape[1] @@ -735,7 +737,7 @@ def apply_ep_prefill( if self.ep_prefill_runner.num_worst_tokens > 0: use_tbo = os.getenv("USE_TBO", "0") - token_split_factor = 2 if int(use_tbo) == 1 else 1 + token_split_factor = 8 if int(use_tbo) == 1 else 1 max_tokens_per_rank = ( layer.fd_config.scheduler_config.max_num_batched_tokens // layer.fd_config.parallel_config.tensor_parallel_size @@ -754,7 +756,7 @@ def apply_ep_prefill( max_token_num = layer.ep_size * max_tokens_per_rank permute_input = permute_input.reshape([layer.num_local_experts, max_token_num, recv_x_value.shape[-1]]) - + # logger.info(f"permute_input.shaoe:{permute_input.shape}") # ffn_out: [num_local_experts, m, hidden_size] # NVFP4 dispatch returns BF16 (no pre-quantized scale), so permute_scale is empty. # Use per-expert 1/input_scale (up_gate_proj_input_scale_quant) as input_global_scale, From dd4118d10e53ee2377c9c43d07b5845b0acdb433 Mon Sep 17 00:00:00 2001 From: lonelygsh <490991455@qq.com> Date: Sun, 19 Apr 2026 23:27:32 +0800 Subject: [PATCH 04/28] Support FP4 communication quantization --- .../moe/prefill_permute_to_masked_gemm.cu | 27 +++++- fastdeploy/envs.py | 2 + .../layers/quantization/nvfp4.py | 84 ++++++++++++++----- 3 files changed, 89 insertions(+), 24 deletions(-) diff --git a/custom_ops/gpu_ops/moe/prefill_permute_to_masked_gemm.cu b/custom_ops/gpu_ops/moe/prefill_permute_to_masked_gemm.cu index a5fb2494605..09ec220e34d 100644 --- a/custom_ops/gpu_ops/moe/prefill_permute_to_masked_gemm.cu +++ b/custom_ops/gpu_ops/moe/prefill_permute_to_masked_gemm.cu @@ -211,6 +211,21 @@ std::vector PrefillPermuteToMaskedGemm( } } } + case paddle::DataType::UINT8: { + switch (scale.dtype()) { + case paddle::DataType::FLOAT32: { + switch (topk) { + DISPATCH_TOPK(paddle::DataType::UINT8, paddle::DataType::FLOAT32, 4) + DISPATCH_TOPK(paddle::DataType::UINT8, paddle::DataType::FLOAT32, 6) + DISPATCH_TOPK(paddle::DataType::UINT8, paddle::DataType::FLOAT32, 8) + default: + PD_THROW("Unsupported topk value, must be 4 or 6 or 8"); + } + } + default: + PD_THROW("Unsupported scale dtype for UINT8 x, must be float32"); + } + } case paddle::DataType::BFLOAT16: { switch (scale.dtype()) { case paddle::DataType::FLOAT32: { @@ -235,10 +250,20 @@ std::vector PrefillPermuteToMaskedGemm( PD_THROW("Unsupported topk value, must be 4 or 8"); } } + case paddle::DataType::UINT8: { + switch (topk) { + DISPATCH_TOPK( + paddle::DataType::BFLOAT16, paddle::DataType::UINT8, 4) + DISPATCH_TOPK( + paddle::DataType::BFLOAT16, paddle::DataType::UINT8, 8) + default: + PD_THROW("Unsupported topk value, must be 4 or 8"); + } + } } } default: - PD_THROW("Unsupported dtype, must be float8_e4m3fn or bfloat16"); + PD_THROW("Unsupported dtype, must be uint8, float8_e4m3fn or bfloat16"); } #undef DISPATCH_TOPK diff --git a/fastdeploy/envs.py b/fastdeploy/envs.py index e52604f2ff0..e5ae65ffc84 100644 --- a/fastdeploy/envs.py +++ b/fastdeploy/envs.py @@ -256,6 +256,8 @@ def _validate_split_kv_size(value: int) -> int: # train-infer consistency, used in RL # Whether to align RoPE and moe gate precision with training "FD_ENABLE_RL": lambda: int(os.getenv("FD_ENABLE_RL", "0")), + # Whether to enable FP4 communication quantization for DeepEP prefill dispatch + "FD_USE_NVFP4_COMM_QUANT": lambda: bool(int(os.getenv("FD_USE_NVFP4_COMM_QUANT", "0"))), # Whether to use phi FP8 quantization,if 1,use paddle default. "FD_USE_PHI_FP8_QUANT": lambda: bool(int(os.getenv("FD_USE_PHI_FP8_QUANT", "1"))), # Enables the Paddle/phi combined TopK operator only when topk_method == noaux_tc, diff --git a/fastdeploy/model_executor/layers/quantization/nvfp4.py b/fastdeploy/model_executor/layers/quantization/nvfp4.py index 24f7983adb9..f5723b3ddde 100644 --- a/fastdeploy/model_executor/layers/quantization/nvfp4.py +++ b/fastdeploy/model_executor/layers/quantization/nvfp4.py @@ -670,14 +670,28 @@ def apply_ep_prefill( # 1. top experts and weights gate_out = gate(x.cast("float32")) - logger.info(f"gate_out.shape:{gate_out.shape}") - gate_out = paddle.randn(gate_out.shape, dtype="float32") topk_idx, topk_weights = self.ep_prefill_runner.moe_select(layer, gate_out) hidden_size = x.shape[1] if topk_ids_hookfunc is not None: topk_ids_hookfunc(topk_ids=topk_idx) + use_fp4_comm_quant = envs.FD_USE_NVFP4_COMM_QUANT + + if use_fp4_comm_quant: + # FP4 communication quantization: quantize to FP4 before dispatch, + # reducing communication volume by ~2x vs BF16. + x_fp4, x_fp4_scale = fp4_quantize( + x, layer.up_gate_proj_input_scale_quant, sf_vec_size=16, is_sf_swizzled_layout=False + ) + x_fp4_scale = x_fp4_scale.view(paddle.float32) # float8_e4m3fn -> float32 + dispatch_input = x_fp4 + dispatch_scale = x_fp4_scale + else: + # BF16 communication: dispatch BF16 data without pre-quantization. + dispatch_input = x + dispatch_scale = None + event = deep_ep.Buffer.capture() if self.ep_prefill_runner.num_worst_tokens <= 0: @@ -692,11 +706,12 @@ def apply_ep_prefill( handle, event, ) = self.ep_prefill_runner.dispatch( - x, + dispatch_input, topk_idx, topk_weights, expert_alignment=128, previous_event=event, + x_scale_tensor=dispatch_scale, ) if self.ep_prefill_runner.num_worst_tokens > 0: @@ -737,7 +752,7 @@ def apply_ep_prefill( if self.ep_prefill_runner.num_worst_tokens > 0: use_tbo = os.getenv("USE_TBO", "0") - token_split_factor = 8 if int(use_tbo) == 1 else 1 + token_split_factor = 2 if int(use_tbo) == 1 else 1 max_tokens_per_rank = ( layer.fd_config.scheduler_config.max_num_batched_tokens // layer.fd_config.parallel_config.tensor_parallel_size @@ -754,25 +769,48 @@ def apply_ep_prefill( ) ) - max_token_num = layer.ep_size * max_tokens_per_rank - permute_input = permute_input.reshape([layer.num_local_experts, max_token_num, recv_x_value.shape[-1]]) - # logger.info(f"permute_input.shaoe:{permute_input.shape}") - # ffn_out: [num_local_experts, m, hidden_size] - # NVFP4 dispatch returns BF16 (no pre-quantized scale), so permute_scale is empty. - # Use per-expert 1/input_scale (up_gate_proj_input_scale_quant) as input_global_scale, - # consistent with apply_ep_decode which also uses this value directly. - ffn_out = flashinfer_cutedsl_moe_masked( - hidden_states=(permute_input, None), - input_global_scale=layer.up_gate_proj_input_scale_quant.expand([layer.num_local_experts]), - w1=layer.up_gate_proj_weight, - w1_blockscale=layer.up_gate_proj_blockscale_swizzled, - w1_alpha=layer.g1_alphas, - w2=layer.down_proj_weight, - a2_global_scale=layer.down_proj_input_scale_quant.expand([layer.num_local_experts]), - w2_blockscale=layer.down_proj_blockscale_swizzled, - w2_alpha=layer.g2_alphas, - masked_m=token_nums_per_expert.squeeze(-1), - ) + if recv_x_scale is not None: + # FP4 pre-quantized dispatch path: + # permute_input is uint8 [E, M, hidden//2] (FP4 packed) + # permute_scale is float32 [E, M, hidden//64] with custom strides + # from C++ kernel (physical layout [E, S, M], non-contiguous). + # Convert scale to float8_e4m3fn, then apply swizzle for + # grouped_gemm_nt_masked which expects SFA in swizzled layout + # (32, 4, rm, 4, rk, l) logical / (l, rm, rk, 32, 4, 4) physical. + # This is the same _process_scale_interleaved used for weight + # blockscale, converting flat [E, M, K] to swizzled layout. + permute_scale_fp8 = permute_scale.contiguous().view(paddle.float8_e4m3fn) + permute_scale_swizzled = _process_scale_interleaved(permute_scale_fp8) + permute_input_t = permute_input.transpose([1, 2, 0]) + permute_scale_swizzled_t = permute_scale_swizzled.transpose([1, 2, 0]) + + ffn_out = flashinfer_cutedsl_moe_masked( + hidden_states=(permute_input_t, permute_scale_swizzled_t), + input_global_scale=None, + w1=layer.up_gate_proj_weight, + w1_blockscale=layer.up_gate_proj_blockscale_swizzled, + w1_alpha=layer.g1_alphas, + w2=layer.down_proj_weight, + a2_global_scale=layer.down_proj_input_scale_quant.expand([layer.num_local_experts]), + w2_blockscale=layer.down_proj_blockscale_swizzled, + w2_alpha=layer.g2_alphas, + masked_m=token_nums_per_expert.squeeze(-1), + ) + else: + # BF16 dispatch path: permute_input is BF16, quantize to FP4 + # inside flashinfer_cutedsl_moe_masked + ffn_out = flashinfer_cutedsl_moe_masked( + hidden_states=(permute_input, None), + input_global_scale=layer.up_gate_proj_input_scale_quant.expand([layer.num_local_experts]), + w1=layer.up_gate_proj_weight, + w1_blockscale=layer.up_gate_proj_blockscale_swizzled, + w1_alpha=layer.g1_alphas, + w2=layer.down_proj_weight, + a2_global_scale=layer.down_proj_input_scale_quant.expand([layer.num_local_experts]), + w2_blockscale=layer.down_proj_blockscale_swizzled, + w2_alpha=layer.g2_alphas, + masked_m=token_nums_per_expert.squeeze(-1), + ) tmp_ffn_out = call_depermute_prefill_combine( x=ffn_out, From 3fdbc08bee29200da8d5ad53940e9d44a6dad8f7 Mon Sep 17 00:00:00 2001 From: lonelygsh <490991455@qq.com> Date: Sun, 19 Apr 2026 23:30:32 +0800 Subject: [PATCH 05/28] fix --- fastdeploy/model_executor/layers/moe/flashinfer_cutedsl_moe.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/fastdeploy/model_executor/layers/moe/flashinfer_cutedsl_moe.py b/fastdeploy/model_executor/layers/moe/flashinfer_cutedsl_moe.py index 48126a4ccca..654b9090ec6 100644 --- a/fastdeploy/model_executor/layers/moe/flashinfer_cutedsl_moe.py +++ b/fastdeploy/model_executor/layers/moe/flashinfer_cutedsl_moe.py @@ -159,10 +159,7 @@ def flashinfer_cutedsl_moe_masked( # === GEMM1: gate+up projection === # grouped_gemm_nt_masked requires output in [m, 2*n, l] layout - # logger.info(f"num_experts:{num_experts}") - # logger.info(f"m:{m}") gateup_output = paddle.empty([num_experts, m, n * 2], dtype=paddle.bfloat16) - gateup_output = gateup_output.transpose([1, 2, 0]) # [m, 2*n, num_experts] # w1: [E, 2*n, k//2] → _perm(., 1, 2, 0) → [2*n, k//2, E] From 6c3cc4b3de4911dc029ba26784c78b00939eda89 Mon Sep 17 00:00:00 2001 From: lonelygsh <490991455@qq.com> Date: Tue, 21 Apr 2026 11:49:14 +0800 Subject: [PATCH 06/28] update --- .../layers/moe/flashinfer_cutedsl_moe.py | 58 +++++++++++---- .../layers/quantization/nvfp4.py | 70 ++++++++++++------- 2 files changed, 91 insertions(+), 37 deletions(-) diff --git a/fastdeploy/model_executor/layers/moe/flashinfer_cutedsl_moe.py b/fastdeploy/model_executor/layers/moe/flashinfer_cutedsl_moe.py index 654b9090ec6..74310e33924 100644 --- a/fastdeploy/model_executor/layers/moe/flashinfer_cutedsl_moe.py +++ b/fastdeploy/model_executor/layers/moe/flashinfer_cutedsl_moe.py @@ -78,6 +78,7 @@ def flashinfer_cutedsl_moe_masked( down_sm_count: Optional[int] = None, down_signals: Optional[paddle.Tensor] = None, down_start_event: Optional[Any] = None, + pre_permuted: bool = False, ): """ Perform masked Mixture-of-Experts computation with FlashInfer's CuteDSL kernels. @@ -113,7 +114,12 @@ def flashinfer_cutedsl_moe_masked( assert len(hidden_states) == 2, f"hidden_states must be a tuple of length 2, got {len(hidden_states)}" # intermediate_size derived from w2 last dimension - n = w2.shape[-1] * 2 + # - normal: w2 [E, k, n//2] -> n = shape[-1] * 2 + # - pre_permuted: w2 [k, n//2, E] -> n = shape[-2] * 2 + if pre_permuted: + n = w2.shape[-2] * 2 + else: + n = w2.shape[-1] * 2 if hidden_states[1] is not None: # Pre-quantized path: tokens already FP4-packed by dispatch @@ -140,14 +146,26 @@ def flashinfer_cutedsl_moe_masked( input_global_scale, ) - assert w1.shape[-2] == 2 * n, f"w1 last-2 dim must be 2*n={2*n}, got {w1.shape[-2]}" - assert w1.shape[-1] * 2 == k, f"w1 last dim * 2 must equal k={k}, got {w1.shape[-1] * 2}" - assert ( - w2.shape[-2] == k and w2.shape[-1] == n // 2 - ), f"w2 shape mismatch, got {list(w2.shape[-2:])}, expected [{k}, {n // 2}]" - assert list(w1_alpha.shape) == [num_experts], f"w1_alpha must be (l,), got {w1_alpha.shape}" + if pre_permuted: + # w1 [2n, k//2, E], w2 [k, n//2, E] + assert w1.shape[0] == 2 * n, f"w1 dim0 must be 2*n={2*n}, got {w1.shape[0]}" + assert w1.shape[1] * 2 == k, f"w1 dim1 * 2 must equal k={k}, got {w1.shape[1] * 2}" + assert ( + w2.shape[0] == k and w2.shape[1] == n // 2 + ), f"w2 shape mismatch, got {list(w2.shape[:2])}, expected [{k}, {n // 2}]" + else: + assert w1.shape[-2] == 2 * n, f"w1 last-2 dim must be 2*n={2*n}, got {w1.shape[-2]}" + assert w1.shape[-1] * 2 == k, f"w1 last dim * 2 must equal k={k}, got {w1.shape[-1] * 2}" + assert ( + w2.shape[-2] == k and w2.shape[-1] == n // 2 + ), f"w2 shape mismatch, got {list(w2.shape[-2:])}, expected [{k}, {n // 2}]" + if pre_permuted: + assert list(w1_alpha.shape) == [1, 1, num_experts], f"w1_alpha must be (1,1,l), got {w1_alpha.shape}" + assert list(w2_alpha.shape) == [1, 1, num_experts], f"w2_alpha must be (1,1,l), got {w2_alpha.shape}" + else: + assert list(w1_alpha.shape) == [num_experts], f"w1_alpha must be (l,), got {w1_alpha.shape}" + assert list(w2_alpha.shape) == [num_experts], f"w2_alpha must be (l,), got {w2_alpha.shape}" assert list(a2_global_scale.shape) == [num_experts], f"a2_global_scale must be (l,), got {a2_global_scale.shape}" - assert list(w2_alpha.shape) == [num_experts], f"w2_alpha must be (l,), got {w2_alpha.shape}" assert _is_dtype(a_q, "uint8") assert _is_dtype(a_q_sf, "float8_e4m3fn") @@ -165,16 +183,24 @@ def flashinfer_cutedsl_moe_masked( # w1: [E, 2*n, k//2] → _perm(., 1, 2, 0) → [2*n, k//2, E] # w1_blockscale:[E, 2*n, k//G] → _perm(., 1, 2, 0) → [2*n, k//G, E] # Both must share the same expert-last layout for grouped_gemm_nt_masked. + if pre_permuted: + w1_p = w1 + w1_bs_p = w1_blockscale + w1_alpha_r = w1_alpha + else: + w1_p = _perm(w1, 1, 2, 0) + w1_bs_p = _perm(w1_blockscale, 1, 2, 0) + w1_alpha_r = w1_alpha.reshape([1, 1, num_experts]) grouped_gemm_nt_masked( (a_q, a_q_sf), - (_perm(w1, 1, 2, 0), _perm(w1_blockscale, 1, 2, 0)), + (w1_p, w1_bs_p), gateup_output, masked_m, ab_dtype=ab_dtype, sf_dtype=sf_dtype, c_dtype=c_dtype, sf_vec_size=sf_vec_size, - alpha=w1_alpha.reshape([1, 1, num_experts]), + alpha=w1_alpha_r, alpha_dtype=get_cute_dtype(w1_alpha), ) # fills gateup_output in logical [m, 2*n, l] @@ -197,16 +223,24 @@ def flashinfer_cutedsl_moe_masked( # w2: [E, k, n//2] → _perm(., 1, 2, 0) → [k, n//2, E] # w2_blockscale:[E, k, n//G] → _perm(., 1, 2, 0) → [k, n//G, E] # Both must share the same expert-last layout for grouped_gemm_nt_masked. + if pre_permuted: + w2_p = w2 + w2_bs_p = w2_blockscale + w2_alpha_r = w2_alpha + else: + w2_p = _perm(w2, 1, 2, 0) + w2_bs_p = _perm(w2_blockscale, 1, 2, 0) + w2_alpha_r = w2_alpha.reshape([1, 1, num_experts]) grouped_gemm_nt_masked( (diq, diq_sf), - (_perm(w2, 1, 2, 0), _perm(w2_blockscale, 1, 2, 0)), + (w2_p, w2_bs_p), out, masked_m, ab_dtype=ab_dtype, sf_dtype=sf_dtype, c_dtype=c_dtype, sf_vec_size=sf_vec_size, - alpha=w2_alpha.reshape([1, 1, num_experts]), + alpha=w2_alpha_r, alpha_dtype=get_cute_dtype(w2_alpha), **( dict( diff --git a/fastdeploy/model_executor/layers/quantization/nvfp4.py b/fastdeploy/model_executor/layers/quantization/nvfp4.py index f5723b3ddde..18f5770c41c 100644 --- a/fastdeploy/model_executor/layers/quantization/nvfp4.py +++ b/fastdeploy/model_executor/layers/quantization/nvfp4.py @@ -659,6 +659,22 @@ def process_weights_after_loading(self, layer): free_tensor(layer.down_proj_weight_scale) layer.down_proj_weight_scale = None + # Cache permuted / reshaped views once to avoid per-forward Python overhead + # inside flashinfer_cutedsl_moe_masked. These are plain attributes (not + # parameters) — they share storage with the originals via a transpose view. + E = layer.num_local_experts + # w1: [E, 2n, k//2] -> [2n, k//2, E]; w2: [E, k, n//2] -> [k, n//2, E] + layer.up_gate_proj_weight_t = layer.up_gate_proj_weight.transpose([1, 2, 0]) + layer.down_proj_weight_t = layer.down_proj_weight.transpose([1, 2, 0]) + layer.up_gate_proj_blockscale_swizzled_t = layer.up_gate_proj_blockscale_swizzled.transpose([1, 2, 0]) + layer.down_proj_blockscale_swizzled_t = layer.down_proj_blockscale_swizzled.transpose([1, 2, 0]) + # alpha: (E,) -> (1, 1, E) for grouped GEMM broadcast + layer.g1_alphas_r = layer.g1_alphas.reshape([1, 1, E]) + layer.g2_alphas_r = layer.g2_alphas.reshape([1, 1, E]) + # a2_global_scale is always expanded to (E,); pre-expand once + layer.down_proj_input_scale_quant_expand = layer.down_proj_input_scale_quant.expand([E]) + layer.up_gate_proj_input_scale_quant_expand = layer.up_gate_proj_input_scale_quant.expand([E]) + def apply_ep_prefill( self, layer: nn.Layer, @@ -670,6 +686,8 @@ def apply_ep_prefill( # 1. top experts and weights gate_out = gate(x.cast("float32")) + gate_out = paddle.randn(gate_out.shape, dtype="float32") + # logger.info(f"gate_out.shape:{gate_out.shape}") topk_idx, topk_weights = self.ep_prefill_runner.moe_select(layer, gate_out) hidden_size = x.shape[1] @@ -752,7 +770,7 @@ def apply_ep_prefill( if self.ep_prefill_runner.num_worst_tokens > 0: use_tbo = os.getenv("USE_TBO", "0") - token_split_factor = 2 if int(use_tbo) == 1 else 1 + token_split_factor = 8 if int(use_tbo) == 1 else 1 max_tokens_per_rank = ( layer.fd_config.scheduler_config.max_num_batched_tokens // layer.fd_config.parallel_config.tensor_parallel_size @@ -787,29 +805,31 @@ def apply_ep_prefill( ffn_out = flashinfer_cutedsl_moe_masked( hidden_states=(permute_input_t, permute_scale_swizzled_t), input_global_scale=None, - w1=layer.up_gate_proj_weight, - w1_blockscale=layer.up_gate_proj_blockscale_swizzled, - w1_alpha=layer.g1_alphas, - w2=layer.down_proj_weight, - a2_global_scale=layer.down_proj_input_scale_quant.expand([layer.num_local_experts]), - w2_blockscale=layer.down_proj_blockscale_swizzled, - w2_alpha=layer.g2_alphas, + w1=layer.up_gate_proj_weight_t, + w1_blockscale=layer.up_gate_proj_blockscale_swizzled_t, + w1_alpha=layer.g1_alphas_r, + w2=layer.down_proj_weight_t, + a2_global_scale=layer.down_proj_input_scale_quant_expand, + w2_blockscale=layer.down_proj_blockscale_swizzled_t, + w2_alpha=layer.g2_alphas_r, masked_m=token_nums_per_expert.squeeze(-1), + pre_permuted=True, ) else: # BF16 dispatch path: permute_input is BF16, quantize to FP4 # inside flashinfer_cutedsl_moe_masked ffn_out = flashinfer_cutedsl_moe_masked( hidden_states=(permute_input, None), - input_global_scale=layer.up_gate_proj_input_scale_quant.expand([layer.num_local_experts]), - w1=layer.up_gate_proj_weight, - w1_blockscale=layer.up_gate_proj_blockscale_swizzled, - w1_alpha=layer.g1_alphas, - w2=layer.down_proj_weight, - a2_global_scale=layer.down_proj_input_scale_quant.expand([layer.num_local_experts]), - w2_blockscale=layer.down_proj_blockscale_swizzled, - w2_alpha=layer.g2_alphas, + input_global_scale=layer.up_gate_proj_input_scale_quant_expand, + w1=layer.up_gate_proj_weight_t, + w1_blockscale=layer.up_gate_proj_blockscale_swizzled_t, + w1_alpha=layer.g1_alphas_r, + w2=layer.down_proj_weight_t, + a2_global_scale=layer.down_proj_input_scale_quant_expand, + w2_blockscale=layer.down_proj_blockscale_swizzled_t, + w2_alpha=layer.g2_alphas_r, masked_m=token_nums_per_expert.squeeze(-1), + pre_permuted=True, ) tmp_ffn_out = call_depermute_prefill_combine( @@ -873,18 +893,18 @@ def apply_ep_decode( ) # Compute FFN via CuteDSL masked grouped GEMM - num_experts = layer.num_local_experts ffn_out = flashinfer_cutedsl_moe_masked( hidden_states=(recv_x, None), - input_global_scale=layer.up_gate_proj_input_scale_quant.expand([num_experts]), - w1=layer.up_gate_proj_weight, - w1_blockscale=layer.up_gate_proj_blockscale_swizzled, - w1_alpha=layer.g1_alphas, - w2=layer.down_proj_weight, - a2_global_scale=layer.down_proj_input_scale_quant.expand([num_experts]), - w2_blockscale=layer.down_proj_blockscale_swizzled, - w2_alpha=layer.g2_alphas, + input_global_scale=layer.up_gate_proj_input_scale_quant_expand, + w1=layer.up_gate_proj_weight_t, + w1_blockscale=layer.up_gate_proj_blockscale_swizzled_t, + w1_alpha=layer.g1_alphas_r, + w2=layer.down_proj_weight_t, + a2_global_scale=layer.down_proj_input_scale_quant_expand, + w2_blockscale=layer.down_proj_blockscale_swizzled_t, + w2_alpha=layer.g2_alphas_r, masked_m=token_nums_per_expert, + pre_permuted=True, ) if shared_experts is not None: From e89dff79c11c97f0875badf5792c8394a40d4fea Mon Sep 17 00:00:00 2001 From: lonelygsh <490991455@qq.com> Date: Tue, 21 Apr 2026 16:02:56 +0800 Subject: [PATCH 07/28] fix --- .../layers/quantization/nvfp4.py | 149 +++++++----------- 1 file changed, 58 insertions(+), 91 deletions(-) diff --git a/fastdeploy/model_executor/layers/quantization/nvfp4.py b/fastdeploy/model_executor/layers/quantization/nvfp4.py index 18f5770c41c..515f26ea363 100644 --- a/fastdeploy/model_executor/layers/quantization/nvfp4.py +++ b/fastdeploy/model_executor/layers/quantization/nvfp4.py @@ -659,21 +659,42 @@ def process_weights_after_loading(self, layer): free_tensor(layer.down_proj_weight_scale) layer.down_proj_weight_scale = None - # Cache permuted / reshaped views once to avoid per-forward Python overhead - # inside flashinfer_cutedsl_moe_masked. These are plain attributes (not - # parameters) — they share storage with the originals via a transpose view. + # === Pre-permute weights for flashinfer cutedsl grouped GEMM === + # flashinfer_cutedsl_moe_masked requires weights in [2n, k//2, E] / [k, n//2, E]. + # Paddle's allocator is cached (freed memory stays in the pool), so + # going through create_parameter_and_copy would leave 2x allocations + # lingering. Instead we hold the transposed tensor as a plain attribute + # (paddle.transpose may return a view; if it's a view, storage is + # shared with the original and stays alive after free_tensor clears the + # original tensor handle, since DenseTensor uses shared_ptr). E = layer.num_local_experts - # w1: [E, 2n, k//2] -> [2n, k//2, E]; w2: [E, k, n//2] -> [k, n//2, E] - layer.up_gate_proj_weight_t = layer.up_gate_proj_weight.transpose([1, 2, 0]) - layer.down_proj_weight_t = layer.down_proj_weight.transpose([1, 2, 0]) - layer.up_gate_proj_blockscale_swizzled_t = layer.up_gate_proj_blockscale_swizzled.transpose([1, 2, 0]) - layer.down_proj_blockscale_swizzled_t = layer.down_proj_blockscale_swizzled.transpose([1, 2, 0]) - # alpha: (E,) -> (1, 1, E) for grouped GEMM broadcast - layer.g1_alphas_r = layer.g1_alphas.reshape([1, 1, E]) - layer.g2_alphas_r = layer.g2_alphas.reshape([1, 1, E]) - # a2_global_scale is always expanded to (E,); pre-expand once - layer.down_proj_input_scale_quant_expand = layer.down_proj_input_scale_quant.expand([E]) - layer.up_gate_proj_input_scale_quant_expand = layer.up_gate_proj_input_scale_quant.expand([E]) + if envs.FD_MOE_BACKEND == "flashinfer-cutedsl": + # w1: [E, 2n, k//2] -> [2n, k//2, E] + layer.up_gate_proj_weight_t = layer.up_gate_proj_weight.transpose([1, 2, 0]) + free_tensor(layer.up_gate_proj_weight) + layer.up_gate_proj_weight = None + + # w2: [E, k, n//2] -> [k, n//2, E] + layer.down_proj_weight_t = layer.down_proj_weight.transpose([1, 2, 0]) + free_tensor(layer.down_proj_weight) + layer.down_proj_weight = None + + # blockscale w1: [E, 2n, k//G] -> [2n, k//G, E] + layer.up_gate_proj_blockscale_swizzled_t = layer.up_gate_proj_blockscale_swizzled.transpose([1, 2, 0]) + free_tensor(layer.up_gate_proj_blockscale_swizzled) + layer.up_gate_proj_blockscale_swizzled = None + + # blockscale w2: [E, k, n//G] -> [k, n//G, E] + layer.down_proj_blockscale_swizzled_t = layer.down_proj_blockscale_swizzled.transpose([1, 2, 0]) + free_tensor(layer.down_proj_blockscale_swizzled) + layer.down_proj_blockscale_swizzled = None + + # alpha: (E,) -> (1, 1, E) broadcast-ready. reshape is a view, no copy. + layer.g1_alphas_r = layer.g1_alphas.reshape([1, 1, E]) + layer.g2_alphas_r = layer.g2_alphas.reshape([1, 1, E]) + + layer.down_proj_input_scale_quant_expand = layer.down_proj_input_scale_quant.expand([E]) + layer.up_gate_proj_input_scale_quant_expand = layer.up_gate_proj_input_scale_quant.expand([E]) def apply_ep_prefill( self, @@ -686,30 +707,13 @@ def apply_ep_prefill( # 1. top experts and weights gate_out = gate(x.cast("float32")) - gate_out = paddle.randn(gate_out.shape, dtype="float32") - # logger.info(f"gate_out.shape:{gate_out.shape}") + # gate_out = paddle.randn(gate_out.shape, dtype="float32") topk_idx, topk_weights = self.ep_prefill_runner.moe_select(layer, gate_out) hidden_size = x.shape[1] if topk_ids_hookfunc is not None: topk_ids_hookfunc(topk_ids=topk_idx) - use_fp4_comm_quant = envs.FD_USE_NVFP4_COMM_QUANT - - if use_fp4_comm_quant: - # FP4 communication quantization: quantize to FP4 before dispatch, - # reducing communication volume by ~2x vs BF16. - x_fp4, x_fp4_scale = fp4_quantize( - x, layer.up_gate_proj_input_scale_quant, sf_vec_size=16, is_sf_swizzled_layout=False - ) - x_fp4_scale = x_fp4_scale.view(paddle.float32) # float8_e4m3fn -> float32 - dispatch_input = x_fp4 - dispatch_scale = x_fp4_scale - else: - # BF16 communication: dispatch BF16 data without pre-quantization. - dispatch_input = x - dispatch_scale = None - event = deep_ep.Buffer.capture() if self.ep_prefill_runner.num_worst_tokens <= 0: @@ -724,12 +728,12 @@ def apply_ep_prefill( handle, event, ) = self.ep_prefill_runner.dispatch( - dispatch_input, + x, topk_idx, topk_weights, expert_alignment=128, previous_event=event, - x_scale_tensor=dispatch_scale, + x_scale_tensor=None, ) if self.ep_prefill_runner.num_worst_tokens > 0: @@ -745,21 +749,13 @@ def apply_ep_prefill( if thread_name not in global_values: global_values[thread_name] = {} - # nvfp4 dispatch returns a plain BF16 tensor (no fp8 scale), unlike deepgemm which returns (value, scale) tuple - if isinstance(recv_x, tuple): - (recv_x_value, recv_x_scale) = recv_x - else: - recv_x_value = recv_x - recv_x_scale = None - global_values[thread_name]["x"] = x global_values[thread_name]["topk_idx"] = topk_idx global_values[thread_name]["topk_weights"] = topk_weights global_values[thread_name]["x_scale_tensor"] = None - global_values[thread_name]["recv_x_value"] = recv_x_value - global_values[thread_name]["recv_x_scale"] = recv_x_scale + global_values[thread_name]["recv_x"] = recv_x global_values[thread_name]["recv_topk_idx"] = recv_topk_idx global_values[thread_name]["recv_topk_weights"] = recv_topk_weights global_values[thread_name]["handle"] = handle @@ -770,7 +766,7 @@ def apply_ep_prefill( if self.ep_prefill_runner.num_worst_tokens > 0: use_tbo = os.getenv("USE_TBO", "0") - token_split_factor = 8 if int(use_tbo) == 1 else 1 + token_split_factor = 16 if int(use_tbo) == 1 else 1 max_tokens_per_rank = ( layer.fd_config.scheduler_config.max_num_batched_tokens // layer.fd_config.parallel_config.tensor_parallel_size @@ -779,64 +775,35 @@ def apply_ep_prefill( permute_input, permute_scale, permuted_indice_map, token_nums_per_expert = ( call_prefill_permute_to_masked_gemm( - x=recv_x_value, - scale=recv_x_scale, + x=recv_x, + scale=None, topk_ids=recv_topk_idx, num_local_experts=layer.num_local_experts, max_token_num=layer.ep_size * max_tokens_per_rank, ) ) - if recv_x_scale is not None: - # FP4 pre-quantized dispatch path: - # permute_input is uint8 [E, M, hidden//2] (FP4 packed) - # permute_scale is float32 [E, M, hidden//64] with custom strides - # from C++ kernel (physical layout [E, S, M], non-contiguous). - # Convert scale to float8_e4m3fn, then apply swizzle for - # grouped_gemm_nt_masked which expects SFA in swizzled layout - # (32, 4, rm, 4, rk, l) logical / (l, rm, rk, 32, 4, 4) physical. - # This is the same _process_scale_interleaved used for weight - # blockscale, converting flat [E, M, K] to swizzled layout. - permute_scale_fp8 = permute_scale.contiguous().view(paddle.float8_e4m3fn) - permute_scale_swizzled = _process_scale_interleaved(permute_scale_fp8) - permute_input_t = permute_input.transpose([1, 2, 0]) - permute_scale_swizzled_t = permute_scale_swizzled.transpose([1, 2, 0]) - - ffn_out = flashinfer_cutedsl_moe_masked( - hidden_states=(permute_input_t, permute_scale_swizzled_t), - input_global_scale=None, - w1=layer.up_gate_proj_weight_t, - w1_blockscale=layer.up_gate_proj_blockscale_swizzled_t, - w1_alpha=layer.g1_alphas_r, - w2=layer.down_proj_weight_t, - a2_global_scale=layer.down_proj_input_scale_quant_expand, - w2_blockscale=layer.down_proj_blockscale_swizzled_t, - w2_alpha=layer.g2_alphas_r, - masked_m=token_nums_per_expert.squeeze(-1), - pre_permuted=True, - ) - else: - # BF16 dispatch path: permute_input is BF16, quantize to FP4 - # inside flashinfer_cutedsl_moe_masked - ffn_out = flashinfer_cutedsl_moe_masked( - hidden_states=(permute_input, None), - input_global_scale=layer.up_gate_proj_input_scale_quant_expand, - w1=layer.up_gate_proj_weight_t, - w1_blockscale=layer.up_gate_proj_blockscale_swizzled_t, - w1_alpha=layer.g1_alphas_r, - w2=layer.down_proj_weight_t, - a2_global_scale=layer.down_proj_input_scale_quant_expand, - w2_blockscale=layer.down_proj_blockscale_swizzled_t, - w2_alpha=layer.g2_alphas_r, - masked_m=token_nums_per_expert.squeeze(-1), - pre_permuted=True, - ) + # BF16 dispatch path: permute_input is BF16, quantize to FP4 + # inside flashinfer_cutedsl_moe_masked + ffn_out = flashinfer_cutedsl_moe_masked( + hidden_states=(permute_input, None), + input_global_scale=layer.up_gate_proj_input_scale_quant_expand, + w1=layer.up_gate_proj_weight_t, + w1_blockscale=layer.up_gate_proj_blockscale_swizzled_t, + w1_alpha=layer.g1_alphas_r, + w2=layer.down_proj_weight_t, + a2_global_scale=layer.down_proj_input_scale_quant_expand, + w2_blockscale=layer.down_proj_blockscale_swizzled_t, + w2_alpha=layer.g2_alphas_r, + masked_m=token_nums_per_expert.squeeze(-1), + pre_permuted=True, + ) tmp_ffn_out = call_depermute_prefill_combine( x=ffn_out, indice_map=permuted_indice_map, topk_weights=recv_topk_weights, - num_worst_tokens=recv_x_value.shape[0], + num_worst_tokens=recv_x.shape[0], ) elif token_all_num > 0: From 24d07c69f44bca05eee82a233227e162e5cfd52c Mon Sep 17 00:00:00 2001 From: lonelygsh <490991455@qq.com> Date: Wed, 22 Apr 2026 19:44:00 +0800 Subject: [PATCH 08/28] support mix_quant and nvfp4 --- .../layers/moe/flashinfer_cutedsl_moe.py | 58 ++-------- .../layers/quantization/__init__.py | 64 ++++++++++- .../layers/quantization/mix_quant.py | 42 ++++--- .../layers/quantization/nvfp4.py | 108 +++++++----------- fastdeploy/model_executor/utils.py | 18 ++- 5 files changed, 160 insertions(+), 130 deletions(-) diff --git a/fastdeploy/model_executor/layers/moe/flashinfer_cutedsl_moe.py b/fastdeploy/model_executor/layers/moe/flashinfer_cutedsl_moe.py index 74310e33924..654b9090ec6 100644 --- a/fastdeploy/model_executor/layers/moe/flashinfer_cutedsl_moe.py +++ b/fastdeploy/model_executor/layers/moe/flashinfer_cutedsl_moe.py @@ -78,7 +78,6 @@ def flashinfer_cutedsl_moe_masked( down_sm_count: Optional[int] = None, down_signals: Optional[paddle.Tensor] = None, down_start_event: Optional[Any] = None, - pre_permuted: bool = False, ): """ Perform masked Mixture-of-Experts computation with FlashInfer's CuteDSL kernels. @@ -114,12 +113,7 @@ def flashinfer_cutedsl_moe_masked( assert len(hidden_states) == 2, f"hidden_states must be a tuple of length 2, got {len(hidden_states)}" # intermediate_size derived from w2 last dimension - # - normal: w2 [E, k, n//2] -> n = shape[-1] * 2 - # - pre_permuted: w2 [k, n//2, E] -> n = shape[-2] * 2 - if pre_permuted: - n = w2.shape[-2] * 2 - else: - n = w2.shape[-1] * 2 + n = w2.shape[-1] * 2 if hidden_states[1] is not None: # Pre-quantized path: tokens already FP4-packed by dispatch @@ -146,26 +140,14 @@ def flashinfer_cutedsl_moe_masked( input_global_scale, ) - if pre_permuted: - # w1 [2n, k//2, E], w2 [k, n//2, E] - assert w1.shape[0] == 2 * n, f"w1 dim0 must be 2*n={2*n}, got {w1.shape[0]}" - assert w1.shape[1] * 2 == k, f"w1 dim1 * 2 must equal k={k}, got {w1.shape[1] * 2}" - assert ( - w2.shape[0] == k and w2.shape[1] == n // 2 - ), f"w2 shape mismatch, got {list(w2.shape[:2])}, expected [{k}, {n // 2}]" - else: - assert w1.shape[-2] == 2 * n, f"w1 last-2 dim must be 2*n={2*n}, got {w1.shape[-2]}" - assert w1.shape[-1] * 2 == k, f"w1 last dim * 2 must equal k={k}, got {w1.shape[-1] * 2}" - assert ( - w2.shape[-2] == k and w2.shape[-1] == n // 2 - ), f"w2 shape mismatch, got {list(w2.shape[-2:])}, expected [{k}, {n // 2}]" - if pre_permuted: - assert list(w1_alpha.shape) == [1, 1, num_experts], f"w1_alpha must be (1,1,l), got {w1_alpha.shape}" - assert list(w2_alpha.shape) == [1, 1, num_experts], f"w2_alpha must be (1,1,l), got {w2_alpha.shape}" - else: - assert list(w1_alpha.shape) == [num_experts], f"w1_alpha must be (l,), got {w1_alpha.shape}" - assert list(w2_alpha.shape) == [num_experts], f"w2_alpha must be (l,), got {w2_alpha.shape}" + assert w1.shape[-2] == 2 * n, f"w1 last-2 dim must be 2*n={2*n}, got {w1.shape[-2]}" + assert w1.shape[-1] * 2 == k, f"w1 last dim * 2 must equal k={k}, got {w1.shape[-1] * 2}" + assert ( + w2.shape[-2] == k and w2.shape[-1] == n // 2 + ), f"w2 shape mismatch, got {list(w2.shape[-2:])}, expected [{k}, {n // 2}]" + assert list(w1_alpha.shape) == [num_experts], f"w1_alpha must be (l,), got {w1_alpha.shape}" assert list(a2_global_scale.shape) == [num_experts], f"a2_global_scale must be (l,), got {a2_global_scale.shape}" + assert list(w2_alpha.shape) == [num_experts], f"w2_alpha must be (l,), got {w2_alpha.shape}" assert _is_dtype(a_q, "uint8") assert _is_dtype(a_q_sf, "float8_e4m3fn") @@ -183,24 +165,16 @@ def flashinfer_cutedsl_moe_masked( # w1: [E, 2*n, k//2] → _perm(., 1, 2, 0) → [2*n, k//2, E] # w1_blockscale:[E, 2*n, k//G] → _perm(., 1, 2, 0) → [2*n, k//G, E] # Both must share the same expert-last layout for grouped_gemm_nt_masked. - if pre_permuted: - w1_p = w1 - w1_bs_p = w1_blockscale - w1_alpha_r = w1_alpha - else: - w1_p = _perm(w1, 1, 2, 0) - w1_bs_p = _perm(w1_blockscale, 1, 2, 0) - w1_alpha_r = w1_alpha.reshape([1, 1, num_experts]) grouped_gemm_nt_masked( (a_q, a_q_sf), - (w1_p, w1_bs_p), + (_perm(w1, 1, 2, 0), _perm(w1_blockscale, 1, 2, 0)), gateup_output, masked_m, ab_dtype=ab_dtype, sf_dtype=sf_dtype, c_dtype=c_dtype, sf_vec_size=sf_vec_size, - alpha=w1_alpha_r, + alpha=w1_alpha.reshape([1, 1, num_experts]), alpha_dtype=get_cute_dtype(w1_alpha), ) # fills gateup_output in logical [m, 2*n, l] @@ -223,24 +197,16 @@ def flashinfer_cutedsl_moe_masked( # w2: [E, k, n//2] → _perm(., 1, 2, 0) → [k, n//2, E] # w2_blockscale:[E, k, n//G] → _perm(., 1, 2, 0) → [k, n//G, E] # Both must share the same expert-last layout for grouped_gemm_nt_masked. - if pre_permuted: - w2_p = w2 - w2_bs_p = w2_blockscale - w2_alpha_r = w2_alpha - else: - w2_p = _perm(w2, 1, 2, 0) - w2_bs_p = _perm(w2_blockscale, 1, 2, 0) - w2_alpha_r = w2_alpha.reshape([1, 1, num_experts]) grouped_gemm_nt_masked( (diq, diq_sf), - (w2_p, w2_bs_p), + (_perm(w2, 1, 2, 0), _perm(w2_blockscale, 1, 2, 0)), out, masked_m, ab_dtype=ab_dtype, sf_dtype=sf_dtype, c_dtype=c_dtype, sf_vec_size=sf_vec_size, - alpha=w2_alpha_r, + alpha=w2_alpha.reshape([1, 1, num_experts]), alpha_dtype=get_cute_dtype(w2_alpha), **( dict( diff --git a/fastdeploy/model_executor/layers/quantization/__init__.py b/fastdeploy/model_executor/layers/quantization/__init__.py index 8e8197ba1de..d7aa043ca25 100644 --- a/fastdeploy/model_executor/layers/quantization/__init__.py +++ b/fastdeploy/model_executor/layers/quantization/__init__.py @@ -83,8 +83,63 @@ def parse_quant_config(args, model_config, is_ernie, is_v1_loader): model_quantization_config = model_config.quantization_config quantization_config = model_quantization_config + # Special case: user wants to override an offline NVFP4 (modelopt) + # checkpoint with a top-level mix_quant config so that MoE continues + # to load NVFP4 weights while dense layers fall back to another online + # quantization (e.g. block_wise_fp8). + mix_quant_overrides_nvfp4 = ( + cli_is_full_config + and isinstance(cli_quantization, dict) + and cli_quantization.get("quantization") == "mix_quant" + and cli_quantization.get("moe_quant_type") == "modelopt_fp4" + and isinstance(model_quantization_config, dict) + and model_quantization_config.get("quant_method") == "modelopt" + and model_quantization_config.get("quant_algo", "").upper() == "NVFP4" + ) + # If CLI provides a full quantization_config JSON, handle priority with config.json - if cli_is_full_config: + if mix_quant_overrides_nvfp4: + logger.warning( + "Using --quantization mix_quant to override model's NVFP4 config.json. " + "MoE will load NVFP4 weights from the checkpoint, dense layers will " + f"use '{cli_quantization.get('dense_quant_type')}' online quantization." + ) + merged = dict(cli_quantization) + # Pass the original NVFP4 dict through to MixQuantConfig so it can + # instantiate ModelOptNvFp4Config for MoE layers. + merged["moe_quant_config"] = dict(model_quantization_config) + # Only MoE is offline-quantized in the checkpoint; dense Linear + # weights are still bf16 and should be quantized online. + merged["is_quantized"] = False + merged["is_moe_quantized"] = True + quantization_config = merged + # Only MoE weights are offline-quantized in the checkpoint. Do NOT + # set model_config.is_quantized=True here; if we did, every dense + # Linear would switch to using ".quant_weight/.weight_scale" keys + # (see linear.py self.is_quantized logic) and silently fail to + # load the bf16 attention / shared_experts weights, producing + # garbage output. MoE routing uses model_config.is_moe_quantized. + model_config.is_moe_quantized = True + # Prune ignore patterns that must now be covered by dense online + # quantization (attention / shared_experts). Keep embed / lm_head / + # mtp patterns as bf16 (they don't go through mix_quant dense path + # as expected quantized linears). + if isinstance(model_quantization_config.get("ignore"), list): + keep_patterns = [] + for p in model_quantization_config["ignore"]: + if any(kw in p for kw in ("self_attn", "shared_experts")): + continue + keep_patterns.append(p) + # Rewrite model_config.quantization_config so modules_to_convert + # no longer excludes attention / shared_experts Linear layers. + pruned = dict(model_quantization_config) + pruned["ignore"] = keep_patterns + model_config.quantization_config = pruned + # Important: stop treating this as an "offline" config for the + # downstream dispatch below; we want to go through the normal + # mix_quant cls path instead of _get_offline_quant_config_name. + model_quantization_config = None + elif cli_is_full_config: if model_quantization_config is not None: if model_quantization_config != cli_quantization: logger.warning( @@ -101,7 +156,12 @@ def parse_quant_config(args, model_config, is_ernie, is_v1_loader): # 1.model_config.is_quantized # TODO(bukejiyu) model_config.is_quantized is v0 only need to be removed in future if model_config.model_format == "torch": - if quantization_config is not None: + # In the mix_quant-override-NVFP4 hybrid case, only MoE weights are + # offline-quantized; dense Linear weights are still bf16 and must NOT + # be flagged as quantized (otherwise Linear.is_quantized becomes True + # and attention weight-loading looks for non-existent ".quant_weight" + # keys, leaving attention weights at init and producing garbage). + if quantization_config is not None and not mix_quant_overrides_nvfp4: model_config.is_quantized = True else: if not model_config.is_quantized: diff --git a/fastdeploy/model_executor/layers/quantization/mix_quant.py b/fastdeploy/model_executor/layers/quantization/mix_quant.py index 2956d506306..5c9c6eee85d 100644 --- a/fastdeploy/model_executor/layers/quantization/mix_quant.py +++ b/fastdeploy/model_executor/layers/quantization/mix_quant.py @@ -41,6 +41,7 @@ def __init__( hadamard_block_size: int = 128, moe_dynamic_quant: bool = False, is_moe_quantized: bool = False, + moe_quant_config: Optional[dict] = None, ) -> None: super().__init__() self.dense_quant_type = dense_quant_type @@ -61,6 +62,10 @@ def __init__( self.hadamard_block_size = hadamard_block_size self.moe_dynamic_quant = moe_dynamic_quant self.is_moe_quantized = is_moe_quantized + # When moe_quant_type is an "offline" method (e.g. modelopt_fp4), this + # holds the original offline quantization_config dict (from model's + # config.json) so we can instantiate the sub-config correctly. + self.moe_quant_config = moe_quant_config or {} def name(self) -> str: return "mix_quant" @@ -79,21 +84,36 @@ def from_config(cls, config: dict) -> "MixQuantConfig": config.get("hadamard_block_size", 128), config.get("moe_dynamic_quant", False), config.get("is_moe_quantized", False), + config.get("moe_quant_config", None), ) + def _build_moe_sub_config(self, moe_quant_type: str) -> dict: + """ + Build the dict passed to the sub quant-config's from_config(). + For offline formats like modelopt_fp4 we need to forward the + original model-side quantization_config (with quant_algo, ignore, + group_size, etc). For online formats like block_wise_fp8 / wint4, + the minimal online fields are enough. + """ + if moe_quant_type == "modelopt_fp4" and self.moe_quant_config: + sub = dict(self.moe_quant_config) + sub.setdefault("is_permuted", self.is_permuted) + sub.setdefault("is_quantized", True) + sub.setdefault("hadamard_block_size", self.hadamard_block_size) + return sub + return { + "is_permuted": self.is_permuted, + "is_quantized": not self.is_checkpoint_bf16 or self.is_moe_quantized, + "hadamard_block_size": self.hadamard_block_size, + } + def get_quant_method(self, layer) -> Optional[QuantMethodBase]: if isinstance(layer, FusedMoE): if layer.moe_tag == "Image": if self.image_moe_quant_type is not None: return ( get_quantization_config(self.image_moe_quant_type) - .from_config( - { - "is_permuted": self.is_permuted, - "is_quantized": not self.is_checkpoint_bf16, - "hadamard_block_size": self.hadamard_block_size, - } - ) + .from_config(self._build_moe_sub_config(self.image_moe_quant_type)) .get_quant_method(layer) ) else: @@ -102,13 +122,7 @@ def get_quant_method(self, layer) -> Optional[QuantMethodBase]: if self.moe_quant_type is not None: return ( get_quantization_config(self.moe_quant_type) - .from_config( - { - "is_permuted": self.is_permuted, - "is_quantized": not self.is_checkpoint_bf16 or self.is_moe_quantized, - "hadamard_block_size": self.hadamard_block_size, - } - ) + .from_config(self._build_moe_sub_config(self.moe_quant_type)) .get_quant_method(layer) ) else: diff --git a/fastdeploy/model_executor/layers/quantization/nvfp4.py b/fastdeploy/model_executor/layers/quantization/nvfp4.py index 515f26ea363..f34700e0bf8 100644 --- a/fastdeploy/model_executor/layers/quantization/nvfp4.py +++ b/fastdeploy/model_executor/layers/quantization/nvfp4.py @@ -366,7 +366,7 @@ def _create_weight_scales(self, layer, weight_scale_shape, weight_scale_2_shape, ) def process_weights_after_loading(self, layer) -> None: - + logger.info("跑了Nvfp4") input_scale_2 = layer.input_scale.max().to(paddle.float32) weight_scale_2 = layer.weight_scale_2.max().to(paddle.float32) alpha = input_scale_2 * weight_scale_2 @@ -606,7 +606,7 @@ def load_up_proj_weight_first(self) -> bool: def process_weights_after_loading(self, layer): """ """ # FlashInfer CUTLASS kernel assumes [Up, Gate] Proj as W13 - + # logger.info(f"跑了fp4 moe了") if self.backend == "flashinfer-cutlass": [a, b] = layer.up_gate_proj_weight.split(2, axis=1) layer.up_gate_proj_weight.set_value(paddle.concat([b, a], axis=1)) @@ -659,43 +659,6 @@ def process_weights_after_loading(self, layer): free_tensor(layer.down_proj_weight_scale) layer.down_proj_weight_scale = None - # === Pre-permute weights for flashinfer cutedsl grouped GEMM === - # flashinfer_cutedsl_moe_masked requires weights in [2n, k//2, E] / [k, n//2, E]. - # Paddle's allocator is cached (freed memory stays in the pool), so - # going through create_parameter_and_copy would leave 2x allocations - # lingering. Instead we hold the transposed tensor as a plain attribute - # (paddle.transpose may return a view; if it's a view, storage is - # shared with the original and stays alive after free_tensor clears the - # original tensor handle, since DenseTensor uses shared_ptr). - E = layer.num_local_experts - if envs.FD_MOE_BACKEND == "flashinfer-cutedsl": - # w1: [E, 2n, k//2] -> [2n, k//2, E] - layer.up_gate_proj_weight_t = layer.up_gate_proj_weight.transpose([1, 2, 0]) - free_tensor(layer.up_gate_proj_weight) - layer.up_gate_proj_weight = None - - # w2: [E, k, n//2] -> [k, n//2, E] - layer.down_proj_weight_t = layer.down_proj_weight.transpose([1, 2, 0]) - free_tensor(layer.down_proj_weight) - layer.down_proj_weight = None - - # blockscale w1: [E, 2n, k//G] -> [2n, k//G, E] - layer.up_gate_proj_blockscale_swizzled_t = layer.up_gate_proj_blockscale_swizzled.transpose([1, 2, 0]) - free_tensor(layer.up_gate_proj_blockscale_swizzled) - layer.up_gate_proj_blockscale_swizzled = None - - # blockscale w2: [E, k, n//G] -> [k, n//G, E] - layer.down_proj_blockscale_swizzled_t = layer.down_proj_blockscale_swizzled.transpose([1, 2, 0]) - free_tensor(layer.down_proj_blockscale_swizzled) - layer.down_proj_blockscale_swizzled = None - - # alpha: (E,) -> (1, 1, E) broadcast-ready. reshape is a view, no copy. - layer.g1_alphas_r = layer.g1_alphas.reshape([1, 1, E]) - layer.g2_alphas_r = layer.g2_alphas.reshape([1, 1, E]) - - layer.down_proj_input_scale_quant_expand = layer.down_proj_input_scale_quant.expand([E]) - layer.up_gate_proj_input_scale_quant_expand = layer.up_gate_proj_input_scale_quant.expand([E]) - def apply_ep_prefill( self, layer: nn.Layer, @@ -704,10 +667,10 @@ def apply_ep_prefill( topk_ids_hookfunc: Callable = None, shared_experts: nn.Layer = None, ) -> paddle.Tensor: - # 1. top experts and weights + # logger.info(f"prefill的nvfp4") + # logger.info(f"layer.up_gate_proj_input_scale_quant:{layer.up_gate_proj_input_scale_quant}") gate_out = gate(x.cast("float32")) - # gate_out = paddle.randn(gate_out.shape, dtype="float32") topk_idx, topk_weights = self.ep_prefill_runner.moe_select(layer, gate_out) hidden_size = x.shape[1] @@ -733,7 +696,6 @@ def apply_ep_prefill( topk_weights, expert_alignment=128, previous_event=event, - x_scale_tensor=None, ) if self.ep_prefill_runner.num_worst_tokens > 0: @@ -749,13 +711,21 @@ def apply_ep_prefill( if thread_name not in global_values: global_values[thread_name] = {} + # nvfp4 dispatch returns a plain BF16 tensor (no fp8 scale), unlike deepgemm which returns (value, scale) tuple + if isinstance(recv_x, tuple): + (recv_x_value, recv_x_scale) = recv_x + else: + recv_x_value = recv_x + recv_x_scale = None + global_values[thread_name]["x"] = x global_values[thread_name]["topk_idx"] = topk_idx global_values[thread_name]["topk_weights"] = topk_weights global_values[thread_name]["x_scale_tensor"] = None - global_values[thread_name]["recv_x"] = recv_x + global_values[thread_name]["recv_x_value"] = recv_x_value + global_values[thread_name]["recv_x_scale"] = recv_x_scale global_values[thread_name]["recv_topk_idx"] = recv_topk_idx global_values[thread_name]["recv_topk_weights"] = recv_topk_weights global_values[thread_name]["handle"] = handle @@ -766,7 +736,7 @@ def apply_ep_prefill( if self.ep_prefill_runner.num_worst_tokens > 0: use_tbo = os.getenv("USE_TBO", "0") - token_split_factor = 16 if int(use_tbo) == 1 else 1 + token_split_factor = 8 if int(use_tbo) == 1 else 1 max_tokens_per_rank = ( layer.fd_config.scheduler_config.max_num_batched_tokens // layer.fd_config.parallel_config.tensor_parallel_size @@ -775,35 +745,39 @@ def apply_ep_prefill( permute_input, permute_scale, permuted_indice_map, token_nums_per_expert = ( call_prefill_permute_to_masked_gemm( - x=recv_x, - scale=None, + x=recv_x_value, + scale=recv_x_scale, topk_ids=recv_topk_idx, num_local_experts=layer.num_local_experts, max_token_num=layer.ep_size * max_tokens_per_rank, ) ) - # BF16 dispatch path: permute_input is BF16, quantize to FP4 - # inside flashinfer_cutedsl_moe_masked + max_token_num = layer.ep_size * max_tokens_per_rank + permute_input = permute_input.reshape([layer.num_local_experts, max_token_num, recv_x_value.shape[-1]]) + + # ffn_out: [num_local_experts, m, hidden_size] + # NVFP4 dispatch returns BF16 (no pre-quantized scale), so permute_scale is empty. + # Use per-expert 1/input_scale (up_gate_proj_input_scale_quant) as input_global_scale, + # consistent with apply_ep_decode which also uses this value directly. ffn_out = flashinfer_cutedsl_moe_masked( hidden_states=(permute_input, None), - input_global_scale=layer.up_gate_proj_input_scale_quant_expand, - w1=layer.up_gate_proj_weight_t, - w1_blockscale=layer.up_gate_proj_blockscale_swizzled_t, - w1_alpha=layer.g1_alphas_r, - w2=layer.down_proj_weight_t, - a2_global_scale=layer.down_proj_input_scale_quant_expand, - w2_blockscale=layer.down_proj_blockscale_swizzled_t, - w2_alpha=layer.g2_alphas_r, + input_global_scale=layer.up_gate_proj_input_scale_quant.expand([layer.num_local_experts]), + w1=layer.up_gate_proj_weight, + w1_blockscale=layer.up_gate_proj_blockscale_swizzled, + w1_alpha=layer.g1_alphas, + w2=layer.down_proj_weight, + a2_global_scale=layer.down_proj_input_scale_quant.expand([layer.num_local_experts]), + w2_blockscale=layer.down_proj_blockscale_swizzled, + w2_alpha=layer.g2_alphas, masked_m=token_nums_per_expert.squeeze(-1), - pre_permuted=True, ) tmp_ffn_out = call_depermute_prefill_combine( x=ffn_out, indice_map=permuted_indice_map, topk_weights=recv_topk_weights, - num_worst_tokens=recv_x.shape[0], + num_worst_tokens=recv_x_value.shape[0], ) elif token_all_num > 0: @@ -860,18 +834,18 @@ def apply_ep_decode( ) # Compute FFN via CuteDSL masked grouped GEMM + num_experts = layer.num_local_experts ffn_out = flashinfer_cutedsl_moe_masked( hidden_states=(recv_x, None), - input_global_scale=layer.up_gate_proj_input_scale_quant_expand, - w1=layer.up_gate_proj_weight_t, - w1_blockscale=layer.up_gate_proj_blockscale_swizzled_t, - w1_alpha=layer.g1_alphas_r, - w2=layer.down_proj_weight_t, - a2_global_scale=layer.down_proj_input_scale_quant_expand, - w2_blockscale=layer.down_proj_blockscale_swizzled_t, - w2_alpha=layer.g2_alphas_r, + input_global_scale=layer.up_gate_proj_input_scale_quant.expand([num_experts]), + w1=layer.up_gate_proj_weight, + w1_blockscale=layer.up_gate_proj_blockscale_swizzled, + w1_alpha=layer.g1_alphas, + w2=layer.down_proj_weight, + a2_global_scale=layer.down_proj_input_scale_quant.expand([num_experts]), + w2_blockscale=layer.down_proj_blockscale_swizzled, + w2_alpha=layer.g2_alphas, masked_m=token_nums_per_expert, - pre_permuted=True, ) if shared_experts is not None: diff --git a/fastdeploy/model_executor/utils.py b/fastdeploy/model_executor/utils.py index 314fe83c5ff..8e661e41c7c 100644 --- a/fastdeploy/model_executor/utils.py +++ b/fastdeploy/model_executor/utils.py @@ -180,6 +180,16 @@ def fn(model_sublayer_name: str, param=None): if fd_config.quant_config and not fd_config.quant_config.is_checkpoint_bf16: # skip for offline quantization return + # Hybrid mix_quant case: the global quant_config may be bf16-checkpoint + # (for dense online quant), but individual sublayers (e.g. MoE with + # offline NVFP4) have an offline sub-config. Their per-weight-load + # hook would fire before all sibling tensors (weight_scale_2, etc.) + # are loaded and crash. Defer those to process_final_after_loading. + _sub_qm = getattr(model_sublayer, "quant_method", None) + if _sub_qm is not None: + _sub_qc = getattr(_sub_qm, "quant_config", None) + if _sub_qc is not None and getattr(_sub_qc, "is_checkpoint_bf16", True) is False: + return if hasattr(model_sublayer, "quant_method"): quant_method = getattr(model_sublayer, "quant_method", None) unquant_moe_layer = get_moe_method() @@ -267,7 +277,13 @@ def process_final_after_loading(model, fd_config: FDConfig): unquant_moe_cls = type(unquant_moe_layer) is_unquant_cls = type(quant_method) is UnquantizedLinearMethod or type(quant_method) is unquant_moe_cls is_offline_quantized_ckpt = not (fd_config.quant_config and fd_config.quant_config.is_checkpoint_bf16) - if is_unquant_cls or is_offline_quantized_ckpt: + # Hybrid mix_quant case: individual sublayer's sub-config may be + # offline even when global quant_config is bf16-checkpoint. Those + # sublayers were skipped by the incremental hook on purpose and + # need their final post-load here. + _sub_qc = getattr(quant_method, "quant_config", None) + sublayer_is_offline = _sub_qc is not None and getattr(_sub_qc, "is_checkpoint_bf16", True) is False + if is_unquant_cls or is_offline_quantized_ckpt or sublayer_is_offline: if hasattr(quant_method, "process_weights_after_loading"): quant_method.process_weights_after_loading(sublayer) continue From 19a7019eeac1c9ce9bfe8e3f253526ae8d84c5ee Mon Sep 17 00:00:00 2001 From: lonelygsh <490991455@qq.com> Date: Fri, 24 Apr 2026 12:16:40 +0800 Subject: [PATCH 09/28] support prefill cuda_graph --- custom_ops/gpu_ops/helper.h | 190 +++++----- fastdeploy/envs.py | 10 + .../graph_optimization/cuda_graph_op.py | 324 ++++++++++++++++++ fastdeploy/model_executor/layers/linear.py | 7 + .../model_executor/layers/normalization.py | 4 + .../layers/quantization/__init__.py | 33 +- .../layers/quantization/nvfp4.py | 3 +- fastdeploy/worker/gpu_model_runner.py | 53 +++ fastdeploy/worker/gpu_worker.py | 3 + 9 files changed, 514 insertions(+), 113 deletions(-) create mode 100644 fastdeploy/model_executor/graph_optimization/cuda_graph_op.py diff --git a/custom_ops/gpu_ops/helper.h b/custom_ops/gpu_ops/helper.h index 83f3ad1077d..fd5f3a27eb4 100644 --- a/custom_ops/gpu_ops/helper.h +++ b/custom_ops/gpu_ops/helper.h @@ -52,6 +52,8 @@ namespace cub = hipcub; #include "env.h" #include "paddle/extension.h" #include "paddle/phi/core/allocator.h" +#include "paddle/phi/core/memory/allocation/allocator_facade.h" +#include "paddle/phi/backends/gpu/cuda/cuda_graph.h" #ifdef PADDLE_WITH_CUSTOM_DEVICE #include "paddle/phi/backends/custom/custom_context.h" #else @@ -87,7 +89,7 @@ using json = nlohmann::json; #ifdef PADDLE_WITH_HIP template -inline hipError_t GetNumBlocks(int64_t n, int *num_blocks) { +inline hipError_t GetNumBlocks(int64_t n, int* num_blocks) { int dev; { hipError_t err = hipGetDevice(&dev); @@ -119,7 +121,7 @@ inline hipError_t GetNumBlocks(int64_t n, int *num_blocks) { } #else template -inline cudaError_t GetNumBlocks(int64_t n, int *num_blocks) { +inline cudaError_t GetNumBlocks(int64_t n, int* num_blocks) { int dev; { cudaError_t err = cudaGetDevice(&dev); @@ -267,41 +269,41 @@ template struct alignas(sizeof(T) * Size) AlignedVector { T val[Size]; - HOSTDEVICE inline const T &operator[](int i) const { return val[i]; } - HOSTDEVICE inline T &operator[](int i) { return val[i]; } + HOSTDEVICE inline const T& operator[](int i) const { return val[i]; } + HOSTDEVICE inline T& operator[](int i) { return val[i]; } }; template -HOSTDEVICE inline void Load(const T *addr, AlignedVector *vec) { - const AlignedVector *addr_vec = - reinterpret_cast *>(addr); +HOSTDEVICE inline void Load(const T* addr, AlignedVector* vec) { + const AlignedVector* addr_vec = + reinterpret_cast*>(addr); *vec = *addr_vec; } template -HOSTDEVICE inline void Store(const AlignedVector &vec, T *addr) { - AlignedVector *addr_vec = - reinterpret_cast *>(addr); +HOSTDEVICE inline void Store(const AlignedVector& vec, T* addr) { + AlignedVector* addr_vec = + reinterpret_cast*>(addr); *addr_vec = vec; } #ifdef PADDLE_WITH_HIP template -HOSTDEVICE inline void Store(const AlignedVector &vec, - int8_t *addr) { +HOSTDEVICE inline void Store(const AlignedVector& vec, + int8_t* addr) { printf("Error: Store hip_bfloat16 to int8_t is not supported!"); } #else template -HOSTDEVICE inline void Store(const AlignedVector<__nv_bfloat16, Size> &vec, - int8_t *addr) { +HOSTDEVICE inline void Store(const AlignedVector<__nv_bfloat16, Size>& vec, + int8_t* addr) { printf("Error: Store __nv_bfloat16 to int8_t is not supported!"); } #endif template -HOSTDEVICE inline void Store(const AlignedVector &vec, - int8_t *addr) { +HOSTDEVICE inline void Store(const AlignedVector& vec, + int8_t* addr) { printf("Error: Store half to int8_t is not supported!"); } @@ -314,7 +316,7 @@ __device__ T max_func(const T a, const T b) { template struct MaxOp { - __device__ __forceinline__ T operator()(const T &a, const T &b) const { + __device__ __forceinline__ T operator()(const T& a, const T& b) const { return max_func(a, b); } }; @@ -322,7 +324,7 @@ struct MaxOp { template <> struct MaxOp { // This is slightly faster - __device__ __forceinline__ float operator()(float const &x, float const &y) { + __device__ __forceinline__ float operator()(float const& x, float const& y) { return max(x, y); } }; @@ -342,7 +344,7 @@ inline int GetBlockSize(int vocab_size) { } #ifndef PADDLE_WITH_COREX -inline json readJsonFromFile(const std::string &filePath) { +inline json readJsonFromFile(const std::string& filePath) { std::ifstream file(filePath); if (!file.is_open()) { throw std::runtime_error("Unable to open file: " + filePath); @@ -368,10 +370,21 @@ inline json readJsonFromFile(const std::string &filePath) { // paddle::GPUPlace() #ifdef PADDLE_DEV -inline paddle::Tensor GetEmptyTensor(const common::DDim &dims, - const paddle::DataType &dtype, - const paddle::Place &place) { - auto *allocator = paddle::GetAllocator(place); +inline paddle::Tensor GetEmptyTensor(const common::DDim& dims, + const paddle::DataType& dtype, + const paddle::Place& place) { + phi::Allocator* allocator = nullptr; +#if defined(PADDLE_WITH_CUDA) + if (phi::backends::gpu::CUDAGraph::IsThisThreadCapturing()) { + allocator = paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place) + .get(); + } else { + allocator = paddle::GetAllocator(place); + } +#else + allocator = paddle::GetAllocator(place); +#endif phi::DenseTensor dense_tensor; dense_tensor.Resize(dims); dense_tensor.AllocateFrom( @@ -379,11 +392,22 @@ inline paddle::Tensor GetEmptyTensor(const common::DDim &dims, return paddle::Tensor(std::make_shared(dense_tensor)); } -inline paddle::Tensor GetEmptyTensor(const common::DDim &dims, - const common::DDim &strides, - const paddle::DataType &dtype, - const paddle::Place &place) { - auto *allocator = paddle::GetAllocator(place); +inline paddle::Tensor GetEmptyTensor(const common::DDim& dims, + const common::DDim& strides, + const paddle::DataType& dtype, + const paddle::Place& place) { + phi::Allocator* allocator = nullptr; +#if defined(PADDLE_WITH_CUDA) + if (phi::backends::gpu::CUDAGraph::IsThisThreadCapturing()) { + allocator = paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place) + .get(); + } else { + allocator = paddle::GetAllocator(place); + } +#else + allocator = paddle::GetAllocator(place); +#endif phi::DenseTensor dense_tensor; dense_tensor.Resize(dims); dense_tensor.AllocateFrom( @@ -393,67 +417,67 @@ inline paddle::Tensor GetEmptyTensor(const common::DDim &dims, } #endif -__global__ void free_and_dispatch_block(bool *stop_flags, - int *seq_lens_this_time, - int *seq_lens_decoder, - int *block_tables, - int *encoder_block_lens, - bool *is_block_step, - int *step_block_list, // [bsz] - int *step_len, - int *recover_block_list, - int *recover_len, - int *need_block_list, - int *need_block_len, - int *used_list_len, - int *free_list, - int *free_list_len, - int64_t *first_token_ids, +__global__ void free_and_dispatch_block(bool* stop_flags, + int* seq_lens_this_time, + int* seq_lens_decoder, + int* block_tables, + int* encoder_block_lens, + bool* is_block_step, + int* step_block_list, // [bsz] + int* step_len, + int* recover_block_list, + int* recover_len, + int* need_block_list, + int* need_block_len, + int* used_list_len, + int* free_list, + int* free_list_len, + int64_t* first_token_ids, const int bsz, const int block_size, const int block_num_per_seq, const int max_decoder_block_num); __global__ void speculate_free_and_dispatch_block( - bool *stop_flags, - int *seq_lens_this_time, - int *seq_lens_decoder, - int *block_tables, - int *encoder_block_lens, - bool *is_block_step, - int *step_block_list, // [bsz] - int *step_len, - int *recover_block_list, - int *recover_len, - int *need_block_list, - int *need_block_len, - int *used_list_len, - int *free_list, - int *free_list_len, - int64_t *first_token_ids, - int *accept_num, + bool* stop_flags, + int* seq_lens_this_time, + int* seq_lens_decoder, + int* block_tables, + int* encoder_block_lens, + bool* is_block_step, + int* step_block_list, // [bsz] + int* step_len, + int* recover_block_list, + int* recover_len, + int* need_block_list, + int* need_block_len, + int* used_list_len, + int* free_list, + int* free_list_len, + int64_t* first_token_ids, + int* accept_num, const int bsz, const int block_size, const int block_num_per_seq, const int max_decoder_block_num, const int max_draft_tokens); -__device__ bool speculate_free_and_dispatch_block(const int &qid, - int *need_block_list, - const int &need_block_len); +__device__ bool speculate_free_and_dispatch_block(const int& qid, + int* need_block_list, + const int& need_block_len); static std::string global_base64_chars = // NOLINT "Tokp9lA/BjimRVKx32edMPFftOzsbNQ8C15Xn+YUEGc4WD0uLIq7hyJ6vZaHSwrg"; // Base64 编码函数 -inline std::string base64_encode(const std::string &input) { +inline std::string base64_encode(const std::string& input) { std::string ret; int i = 0; int j = 0; unsigned char char_array_3[3]; unsigned char char_array_4[4]; - for (const auto &c : input) { + for (const auto& c : input) { char_array_3[i++] = c; if (i == 3) { char_array_4[0] = (char_array_3[0] & 0xfc) >> 2; @@ -495,7 +519,7 @@ inline std::string base64_encode(const std::string &input) { } // Base64 解码函数 -inline std::string base64_decode(const std::string &encoded_string) { +inline std::string base64_decode(const std::string& encoded_string) { int in_len = encoded_string.size(); int i = 0; int j = 0; @@ -550,9 +574,9 @@ inline std::string base64_decode(const std::string &encoded_string) { #ifndef PADDLE_WITH_COREX template -inline T get_relative_best(nlohmann::json *json_data, - const std::string &target_key, - const T &default_value) { +inline T get_relative_best(nlohmann::json* json_data, + const std::string& target_key, + const T& default_value) { if (json_data->contains(target_key)) { return json_data->at(target_key); } else { @@ -564,7 +588,7 @@ inline T get_relative_best(nlohmann::json *json_data, #endif __device__ inline bool is_in_end(const int64_t id, - const int64_t *end_ids, + const int64_t* end_ids, int length) { bool flag = false; for (int i = 0; i < length; i++) { @@ -588,7 +612,7 @@ __device__ __inline__ T ClipFunc(const T v, const T min, const T max) { } template -static void PrintMatrix3(const T *mat_d, int num, std::string name) { +static void PrintMatrix3(const T* mat_d, int num, std::string name) { std::vector tmp(num); #ifdef PADDLE_WITH_HIP hipMemcpy(tmp.data(), mat_d, sizeof(T) * num, hipMemcpyDeviceToHost); @@ -613,7 +637,7 @@ static void PrintMatrix3(const T *mat_d, int num, std::string name) { #ifndef PADDLE_WITH_HIP #ifndef PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU -__forceinline__ __device__ uint32_t ld_flag_acquire(uint32_t *flag_addr, +__forceinline__ __device__ uint32_t ld_flag_acquire(uint32_t* flag_addr, int mode = 0) { uint32_t flag; if (mode == 0) { @@ -632,7 +656,7 @@ __forceinline__ __device__ uint32_t ld_flag_acquire(uint32_t *flag_addr, return flag; } -__forceinline__ __device__ void st_flag_release(uint32_t *flag_addr, +__forceinline__ __device__ void st_flag_release(uint32_t* flag_addr, uint32_t flag, int mode = 0) { if (mode == 0) { @@ -674,12 +698,12 @@ inline bool GetMlaUseTensorcore() { return mla_use_tensorcore; } -inline const char *getEnvVar(const char *varName) { +inline const char* getEnvVar(const char* varName) { return std::getenv(varName); } inline bool checkAttentionBackend() { - const char *backend = getEnvVar("FD_ATTENTION_BACKEND"); + const char* backend = getEnvVar("FD_ATTENTION_BACKEND"); if (backend && (std::strcmp(backend, "MLA_ATTN") == 0 || std::strcmp(backend, "DSA_ATTN") == 0)) { return true; @@ -691,17 +715,17 @@ inline bool checkAttentionBackend() { #define GPU_MEMORY_CHECKER_H class GPUMemoryChecker { public: - static GPUMemoryChecker *getInstance() { + static GPUMemoryChecker* getInstance() { static GPUMemoryChecker instance; return &instance; } - void addCheckPoint(const char *call_file, int call_line); + void addCheckPoint(const char* call_file, int call_line); unsigned int getGPUCount() const { return deviceCount_; } void getCUDAVisibleDevice(); - GPUMemoryChecker(const GPUMemoryChecker &) = delete; - void operator=(const GPUMemoryChecker &) = delete; + GPUMemoryChecker(const GPUMemoryChecker&) = delete; + void operator=(const GPUMemoryChecker&) = delete; private: GPUMemoryChecker(); @@ -737,8 +761,8 @@ __device__ __forceinline__ float blockReduceMax(float value) { return value; } -inline bool getBoolEnv(char const *name) { - char const *env = std::getenv(name); +inline bool getBoolEnv(char const* name) { + char const* env = std::getenv(name); return env && env[0] == '1' && env[1] == '\0'; } @@ -761,7 +785,7 @@ inline void launchWithPdlWhenEnabled(KernelFn kernelFn, dim3 block, size_t dynamicShmSize, cudaStream_t stream, - Args &&...args) { + Args&&... args) { #ifdef PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU (*kernelFn)<<>>( std::forward(args)...); diff --git a/fastdeploy/envs.py b/fastdeploy/envs.py index e5ae65ffc84..9e3e1394be2 100644 --- a/fastdeploy/envs.py +++ b/fastdeploy/envs.py @@ -271,6 +271,16 @@ def _validate_split_kv_size(value: int) -> int: "FD_SiluAndMul_USE_PHI_SWIGLU": lambda: bool(int(os.getenv("FD_SiluAndMul_USE_PHI_SWIGLU", "0"))), # Whether to enable FP8 quantization with pow2scale. "FD_FP8_QUANT_WITH_POW2SCALE": lambda: bool(int(os.getenv("FD_FP8_QUANT_WITH_POW2SCALE", "0"))), + # Whether to enable block-wise CUDA Graph capture/replay. + # When enabled, individual layer forward methods decorated with @block_wise_cuda_graph_wrap + # will be captured and replayed as CUDA Graphs for improved performance. + # Set to 1 to enable; defaults to 0 (disabled). + "FD_USE_BLOCK_WISE_CUDA_GRAPH": lambda: bool(int(os.getenv("FD_USE_BLOCK_WISE_CUDA_GRAPH", "0"))), + # Comma-separated list of token counts to pre-capture for block-wise CUDA Graphs. + # Used during the warmup phase to pre-capture graphs for these specific sizes. + # At runtime, token counts not in this list fall back to eager execution. + # Example: "1,2,4,8,16,32,64,128,256,512" + "FD_BLOCK_WISE_CUDA_GRAPH_SIZES": lambda: os.getenv("FD_BLOCK_WISE_CUDA_GRAPH_SIZES", "128,256,512,1024,2048"), } diff --git a/fastdeploy/model_executor/graph_optimization/cuda_graph_op.py b/fastdeploy/model_executor/graph_optimization/cuda_graph_op.py new file mode 100644 index 00000000000..4e409058a43 --- /dev/null +++ b/fastdeploy/model_executor/graph_optimization/cuda_graph_op.py @@ -0,0 +1,324 @@ +""" +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +import functools +import inspect +import logging +from typing import Callable, Optional, Sequence + +import paddle + +import fastdeploy + +# ---- Module-level state for pre-captured block-wise CUDA graphs ---- + +# When True, the wrapper is in the capture phase (during dummy_run) and +# will capture new graphs. When False, uncached keys fall back to eager. +_BLOCK_WISE_CAPTURING: bool = False + +# Registry of all shared-mode graph caches, for bulk clearing. +_ALL_SHARED_CACHES: list = [] + +# Global counter / registry of all captured block-wise graphs (for logging). +# Each entry: (qualname, key, shared_mode) +_CAPTURED_GRAPH_LOG: list = [] +_LOGGER = logging.getLogger("fastdeploy.block_wise_cuda_graph") +if not _LOGGER.handlers: + _LOGGER.setLevel(logging.INFO) + + +def get_captured_graph_log(): + """Return the list of all captured (qualname, key, shared) triples.""" + return list(_CAPTURED_GRAPH_LOG) + + +def dump_captured_graph_summary(): + """Print a summary of all captured block-wise CUDA graphs.""" + from collections import Counter + + if not _CAPTURED_GRAPH_LOG: + _LOGGER.info("[block_wise_cuda_graph] no graph captured") + return + counter = Counter(q for q, _, _ in _CAPTURED_GRAPH_LOG) + _LOGGER.info( + "[block_wise_cuda_graph] total captured graphs=%d across %d distinct methods:", + len(_CAPTURED_GRAPH_LOG), + len(counter), + ) + for qname, cnt in sorted(counter.items(), key=lambda x: -x[1]): + _LOGGER.info(" - %s : %d graph(s)", qname, cnt) + + +def set_block_wise_capturing(capturing: bool): + """Toggle the capture phase flag. Only capture graphs when this is True.""" + global _BLOCK_WISE_CAPTURING + _BLOCK_WISE_CAPTURING = capturing + + +def clear_all_block_wise_graphs(): + """Clear all shared block-wise graph caches (e.g. for RL weight updates).""" + for graphs, cinputs, coutputs in _ALL_SHARED_CACHES: + graphs.clear() + cinputs.clear() + coutputs.clear() + + +def block_wise_cuda_graph_wrap( + inputs: Sequence[str], + self_attrs: Sequence[str] = (), + key_fn: Optional[Callable[..., tuple]] = None, +): + """ + Method decorator that wraps a forward method with CUDA Graph capture/replay. + + On the first call for a given cache key (derived from tensor shapes/dtypes), + the decorated method is captured into a CUDA Graph. Subsequent calls with the + same key will replay the graph after updating input data pointers. + + When ``_BLOCK_WISE_CAPTURING`` is managed via ``set_block_wise_capturing``, + new graphs are only captured during the capture phase (dummy_run). At runtime, + uncached keys fall back to eager execution, avoiding expensive on-the-fly captures. + + When ``self_attrs`` is provided, the named tensor attributes of ``self`` + (e.g. ``weight``) are also tracked for pointer replacement, and the graph + cache is **shared across all instances** (closure-level). This allows layers + with identical computation but different weights to share a single captured + graph, dramatically reducing the total number of graphs from O(num_layers) + to O(num_unique_shapes). + + When ``self_attrs`` is empty (default), graphs are cached per instance. + + Output tensors from the capture phase are reused across replays — the graph + always writes to the same output memory. This avoids per-replay allocation + overhead. Callers must consume the output before the next replay of the same + graph (which is naturally satisfied in sequential layer-by-layer forward). + + Args: + inputs: Names of parameters that are input tensors to be tracked for + CUDA Graph pointer replacement. These must be parameter names of the + decorated method. Only non-None tensor arguments are tracked. + self_attrs: Attribute names on ``self`` that are tensor parameters to be + replaced via pointer replacement (e.g. ``["weight"]``). When non-empty, + enables cross-instance graph sharing. + key_fn: Optional callable to generate the cache key from method arguments. + Signature: key_fn(arg0, arg1, ...) with args in declaration order + (excluding self). Defaults to a key based on tensor shapes/dtypes. + + Example: + class MyNorm(nn.Layer): + @block_wise_cuda_graph_wrap( + inputs=["x", "residual"], + self_attrs=["weight"], # all layers share one graph + ) + def forward(self, x, residual=None): + return rms_norm(x, self.weight), residual + """ + + def decorator(method: Callable) -> Callable: + sig = inspect.signature(method) + params = list(sig.parameters.keys()) # ["self", "x", "residual_input", ...] + _qualname = method.__qualname__ + + for name in inputs: + if name not in params or name == "self": + raise ValueError( + f"cuda_graph_wrap: input '{name}' is not a parameter of " + f"{method.__qualname__}. Available: {[p for p in params if p != 'self']}" + ) + + # ---- Pre-compute at decoration time (runs once) ---- + + _EMPTY = inspect.Parameter.empty + _Tensor = paddle.Tensor + + # For each non-self param: (name, args_index, default_value) + # args_index is position in *args (0-based, since self is consumed by Python) + _param_info = tuple((p, i - 1, sig.parameters[p].default) for i, p in enumerate(params) if p != "self") + + # For each declared input tensor: (name, args_index) + _input_info = tuple((name, params.index(name) - 1) for name in inputs) + + _self_attr_names = tuple(self_attrs) + _shared = len(_self_attr_names) > 0 + + _use_custom_key = key_fn is not None + + # --- Cache storage --- + # When self_attrs is provided: closure-level (shared across all instances) + # When not: per-instance (stored in self.__dict__) + if _shared: + _shared_graphs = {} + _shared_cinputs = {} + _shared_coutputs = {} # stores actual result tensors (reused across replays) + _ALL_SHARED_CACHES.append((_shared_graphs, _shared_cinputs, _shared_coutputs)) + + # Per-instance attribute key names + _g = f"_cg_{method.__name__}_g" + _ci = f"_cg_{method.__name__}_ci" + _co = f"_cg_{method.__name__}_co" + + @functools.wraps(method) + def wrapper(self, *args, **kwargs): + if not fastdeploy.envs.FD_USE_BLOCK_WISE_CUDA_GRAPH: + return method(self, *args, **kwargs) + + nargs = len(args) + + # Skip CUDA graph if any input tensor has a 0 in its shape + for a in args: + if isinstance(a, _Tensor) and 0 in a.shape: + return method(self, *args, **kwargs) + for v in kwargs.values(): + if isinstance(v, _Tensor) and 0 in v.shape: + return method(self, *args, **kwargs) + + # === Key generation: inline, no sig.bind === + if _use_custom_key: + # Resolve all args for custom key_fn + resolved = [] + for pname, aidx, default in _param_info: + if pname in kwargs: + resolved.append(kwargs[pname]) + elif aidx < nargs: + resolved.append(args[aidx]) + elif default is not _EMPTY: + resolved.append(default) + else: + resolved.append(None) + key = key_fn(*resolved) + else: + # Default: fast inline key from shapes/dtypes + _kp = [] + for pname, aidx, default in _param_info: + if pname in kwargs: + v = kwargs[pname] + elif aidx < nargs: + v = args[aidx] + else: + v = default + if isinstance(v, _Tensor): + _kp.append((tuple(v.shape), v.dtype)) + elif v is None: + _kp.append(None) + elif callable(v): + _kp.append(True) + # Include self_attrs shapes/dtypes in key + for attr_name in _self_attr_names: + attr = getattr(self, attr_name, None) + if attr is not None and isinstance(attr, _Tensor): + _kp.append((attr_name, tuple(attr.shape), attr.dtype)) + else: + _kp.append((attr_name, None)) + key = tuple(_kp) + + # === Get cache (shared or per-instance) === + if _shared: + graphs = _shared_graphs + cinputs = _shared_cinputs + coutputs = _shared_coutputs + else: + _d = self.__dict__ + try: + graphs = _d[_g] + cinputs = _d[_ci] + coutputs = _d[_co] + except KeyError: + graphs = {} + cinputs = {} + coutputs = {} + _d[_g] = graphs + _d[_ci] = cinputs + _d[_co] = coutputs + + if key not in graphs: + # === First encounter: only capture during capture phase === + if not _BLOCK_WISE_CAPTURING: + # Not in capture phase -- fall back to eager + return method(self, *args, **kwargs) + + # === Capture === + graph = paddle.device.cuda.graphs.CUDAGraph(enable_replace=True) + graphs[key] = graph + ci = {} + for name, aidx in _input_info: + v = kwargs[name] if name in kwargs else (args[aidx] if aidx < nargs else None) + if v is not None and isinstance(v, _Tensor): + ci[name] = v.data_ptr() + + # Record self_attrs pointers for cross-instance replacement + for attr_name in _self_attr_names: + attr = getattr(self, attr_name, None) + if attr is not None and isinstance(attr, _Tensor): + ci[f"__attr_{attr_name}"] = attr.data_ptr() + + cinputs[key] = ci + + graph.capture_begin() + result = method(self, *args, **kwargs) + graph.capture_end() + + # --- Log which op just entered the CUDA graph --- + _CAPTURED_GRAPH_LOG.append((_qualname, key, _shared)) + _LOGGER.info( + "[block_wise_cuda_graph] captured #%d op=%s shared=%s key=%s", + len(_CAPTURED_GRAPH_LOG), + _qualname, + _shared, + key, + ) + + graph.replay() + + # Store the actual result for reuse. The graph always writes to + # the same output memory, so we return the same tensors on replay. + coutputs[key] = result + return result + else: + # === Replay path (HOT PATH) === + old_ptrs = [] + new_ptrs = [] + ci = cinputs[key] + + for name, aidx in _input_info: + v = kwargs[name] if name in kwargs else (args[aidx] if aidx < nargs else None) + if v is not None and name in ci: + old_ptrs.append(ci[name]) + new_ptr = v.data_ptr() + new_ptrs.append(new_ptr) + ci[name] = new_ptr + + # Replace self_attrs pointers (e.g. weight) + for attr_name in _self_attr_names: + attr_key = f"__attr_{attr_name}" + if attr_key in ci: + attr = getattr(self, attr_name, None) + if attr is not None: + old_ptrs.append(ci[attr_key]) + new_ptr = attr.data_ptr() + new_ptrs.append(new_ptr) + ci[attr_key] = new_ptr + + if old_ptrs: + graphs[key].replace_input_ptrs(old_ptrs, new_ptrs) + graphs[key].replay() + + # Reuse the output tensors from capture — graph wrote fresh + # data to the same memory, no allocation needed. + return coutputs[key] + + return wrapper + + return decorator diff --git a/fastdeploy/model_executor/layers/linear.py b/fastdeploy/model_executor/layers/linear.py index bea36b3e05a..987d83a58c5 100644 --- a/fastdeploy/model_executor/layers/linear.py +++ b/fastdeploy/model_executor/layers/linear.py @@ -25,6 +25,9 @@ decode_alltoall_transpose, tensor_model_parallel_all_reduce, ) +from fastdeploy.model_executor.graph_optimization.cuda_graph_op import ( + block_wise_cuda_graph_wrap, +) from fastdeploy.model_executor.layers.quantization.quant_base import QuantMethodBase from fastdeploy.model_executor.utils import ( default_weight_loader, @@ -253,6 +256,7 @@ def load_state_dict(self, state_dict: dict): bias_tensor = paddle.to_tensor(get_tensor(state_dict.pop(self.bias_key))) self.bias.set_value(bias_tensor) + @block_wise_cuda_graph_wrap(inputs=["x"], self_attrs=["weight", "weight_scale_inv", "bias"]) def forward_cuda(self, x: paddle.Tensor) -> paddle.Tensor: """ Forward function for Linear. @@ -948,6 +952,9 @@ def all2all_transpose(self, x: paddle.Tensor) -> paddle.Tensor: out.reshape_([x.shape[0] // self.tp_size, self.input_size]) return out + # NOTE: do NOT wrap with @block_wise_cuda_graph_wrap here. + # This forward contains collective comms (alltoall / all_reduce) which + # cannot be captured into a CUDA Graph. def forward_cuda(self, x: paddle.Tensor) -> paddle.Tensor: if self.split_token: x = self.all2all_transpose(x) diff --git a/fastdeploy/model_executor/layers/normalization.py b/fastdeploy/model_executor/layers/normalization.py index 4a2cf32d1ba..26e07c3d672 100644 --- a/fastdeploy/model_executor/layers/normalization.py +++ b/fastdeploy/model_executor/layers/normalization.py @@ -21,6 +21,9 @@ from paddle import nn from fastdeploy.model_executor.forward_meta import ForwardMeta +from fastdeploy.model_executor.graph_optimization.cuda_graph_op import ( + block_wise_cuda_graph_wrap, +) from fastdeploy.platforms import current_platform if current_platform.is_gcu(): @@ -208,6 +211,7 @@ def allgather(self, out, token_num): paddle.distributed.all_gather(multi_outs, out, self.tp_group) return multi_outs[:token_num, :] + @block_wise_cuda_graph_wrap(inputs=["x", "residual_input"], self_attrs=["weight"]) def forward( self, x, diff --git a/fastdeploy/model_executor/layers/quantization/__init__.py b/fastdeploy/model_executor/layers/quantization/__init__.py index d7aa043ca25..5780edc1d1d 100644 --- a/fastdeploy/model_executor/layers/quantization/__init__.py +++ b/fastdeploy/model_executor/layers/quantization/__init__.py @@ -83,10 +83,9 @@ def parse_quant_config(args, model_config, is_ernie, is_v1_loader): model_quantization_config = model_config.quantization_config quantization_config = model_quantization_config - # Special case: user wants to override an offline NVFP4 (modelopt) - # checkpoint with a top-level mix_quant config so that MoE continues + # override an offline NVFP4 (modelopt) checkpoint with a top-level mix_quant config so that MoE continues # to load NVFP4 weights while dense layers fall back to another online - # quantization (e.g. block_wise_fp8). + # quantization (e.g. block_wise_fp8). For example, eb5-800B-fp4 mix_quant_overrides_nvfp4 = ( cli_is_full_config and isinstance(cli_quantization, dict) @@ -113,32 +112,10 @@ def parse_quant_config(args, model_config, is_ernie, is_v1_loader): merged["is_quantized"] = False merged["is_moe_quantized"] = True quantization_config = merged - # Only MoE weights are offline-quantized in the checkpoint. Do NOT - # set model_config.is_quantized=True here; if we did, every dense - # Linear would switch to using ".quant_weight/.weight_scale" keys - # (see linear.py self.is_quantized logic) and silently fail to - # load the bf16 attention / shared_experts weights, producing - # garbage output. MoE routing uses model_config.is_moe_quantized. + # MoE routing uses model_config.is_moe_quantized. model_config.is_moe_quantized = True - # Prune ignore patterns that must now be covered by dense online - # quantization (attention / shared_experts). Keep embed / lm_head / - # mtp patterns as bf16 (they don't go through mix_quant dense path - # as expected quantized linears). - if isinstance(model_quantization_config.get("ignore"), list): - keep_patterns = [] - for p in model_quantization_config["ignore"]: - if any(kw in p for kw in ("self_attn", "shared_experts")): - continue - keep_patterns.append(p) - # Rewrite model_config.quantization_config so modules_to_convert - # no longer excludes attention / shared_experts Linear layers. - pruned = dict(model_quantization_config) - pruned["ignore"] = keep_patterns - model_config.quantization_config = pruned - # Important: stop treating this as an "offline" config for the - # downstream dispatch below; we want to go through the normal - # mix_quant cls path instead of _get_offline_quant_config_name. - model_quantization_config = None + # Skip _get_offline_quant_config_name; use mix_quant cls instead. + model_quantization_config = None elif cli_is_full_config: if model_quantization_config is not None: if model_quantization_config != cli_quantization: diff --git a/fastdeploy/model_executor/layers/quantization/nvfp4.py b/fastdeploy/model_executor/layers/quantization/nvfp4.py index f34700e0bf8..e4c2d21b7e2 100644 --- a/fastdeploy/model_executor/layers/quantization/nvfp4.py +++ b/fastdeploy/model_executor/layers/quantization/nvfp4.py @@ -668,9 +668,8 @@ def apply_ep_prefill( shared_experts: nn.Layer = None, ) -> paddle.Tensor: # 1. top experts and weights - # logger.info(f"prefill的nvfp4") - # logger.info(f"layer.up_gate_proj_input_scale_quant:{layer.up_gate_proj_input_scale_quant}") gate_out = gate(x.cast("float32")) + # gate_out = paddle.randn(gate_out.shape,dtype="float32") topk_idx, topk_weights = self.ep_prefill_runner.moe_select(layer, gate_out) hidden_size = x.shape[1] diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 8d6603b1275..5f24278890f 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -2771,6 +2771,14 @@ def clear_parameters(self, pid): # Clear CUDAGraph if self.use_cudagraph: self.model.clear_graph_opt_backend() + + # Clear block-wise CUDA graphs + if envs.FD_USE_BLOCK_WISE_CUDA_GRAPH: + from fastdeploy.model_executor.graph_optimization.cuda_graph_op import ( + clear_all_block_wise_graphs, + ) + + clear_all_block_wise_graphs() # Clear parameters and Send single self.dynamic_weight_manager.clear_parameters( pid, self.fd_config.parallel_config.shutdown_comm_group_if_worker_idle @@ -3194,3 +3202,48 @@ def initialize_routing_replay_manager(self): block_table=self.share_inputs["block_tables"], total_block_num=self.num_gpu_blocks, ) + + def capture_block_wise_graphs(self) -> None: + """ + Independent capture loop for block-wise CUDA graphs. + Pre-captures graphs for designated token counts so that at runtime, + matching sizes replay the graph while other sizes fall back to eager. + """ + if envs.FD_USE_BLOCK_WISE_CUDA_GRAPH: + return + + from fastdeploy.model_executor.graph_optimization.cuda_graph_op import ( # Parse capture sizes from env var + dump_captured_graph_summary, + set_block_wise_capturing, + ) + + sizes_str = envs.FD_BLOCK_WISE_CUDA_GRAPH_SIZES + capture_sizes = sorted([int(s.strip()) for s in sizes_str.split(",") if s.strip()], reverse=True) + if not capture_sizes: + logger.warning("FD_BLOCK_WISE_CUDA_GRAPH_SIZES is empty, skipping block-wise CUDA graph capture") + return + + logger.info(f"Block-wise CUDA graph capture starting for sizes: {sorted(capture_sizes)}") + time_before_capture = time.perf_counter() + + set_block_wise_capturing(True) + try: + for num_tokens in capture_sizes: + batch_size = min(num_tokens, self.scheduler_config.max_num_seqs) + if batch_size < 1: + batch_size = 1 + self._dummy_run( + num_tokens=num_tokens, + batch_size=batch_size, + in_capturing=False, + ) + logger.info(f"Block-wise CUDA graph captured for num_tokens={num_tokens}") + finally: + set_block_wise_capturing(False) + + time_after_capture = time.perf_counter() + logger.info( + f"Block-wise CUDA graph capturing took {time_after_capture - time_before_capture:.3f} seconds " + f"for {len(capture_sizes)} sizes" + ) + dump_captured_graph_summary() diff --git a/fastdeploy/worker/gpu_worker.py b/fastdeploy/worker/gpu_worker.py index 423d9fb54a5..13e1ba7e612 100644 --- a/fastdeploy/worker/gpu_worker.py +++ b/fastdeploy/worker/gpu_worker.py @@ -245,6 +245,9 @@ def graph_optimize_and_warm_up_model(self) -> None: # Capture CUDAGraph for decode phase (all modes) self.model_runner.capture_model() + # Block-wise CUDA graph capture (independent loop) + self.model_runner.capture_block_wise_graphs() + # Deterministic mode: reset RNG and share_inputs after warmup. # Warmup _dummy_run() calls consume CUDA RNG state and leave stale # data (infer_seed, stop_flags, seq_lens, etc.) in share_inputs. From 842feba93f10f9a29df5f6fb154decb2691d7291 Mon Sep 17 00:00:00 2001 From: lonelygsh <490991455@qq.com> Date: Tue, 28 Apr 2026 12:34:06 +0800 Subject: [PATCH 10/28] support fp4 communication quantization --- custom_ops/gpu_ops/cpp_extensions.cc | 4 +- .../moe/prefill_permute_to_masked_gemm.cu | 163 +++++++++++++----- fastdeploy/envs.py | 3 + .../graph_optimization/cuda_graph_op.py | 26 ++- .../layers/moe/fused_moe_blackwell_backend.py | 2 +- .../layers/moe/fused_moe_deepgemm_backend.py | 2 +- .../layers/quantization/nvfp4.py | 88 +++++++--- fastdeploy/worker/gpu_model_runner.py | 7 +- .../test_permute_prefill_masked_gemm.py | 2 +- 9 files changed, 207 insertions(+), 90 deletions(-) diff --git a/custom_ops/gpu_ops/cpp_extensions.cc b/custom_ops/gpu_ops/cpp_extensions.cc index eecc3590595..80b12b455a8 100644 --- a/custom_ops/gpu_ops/cpp_extensions.cc +++ b/custom_ops/gpu_ops/cpp_extensions.cc @@ -1194,7 +1194,8 @@ std::vector PrefillPermuteToMaskedGemm( const paddle::Tensor& scale, const paddle::Tensor& topk_ids, const int num_local_experts, - const int max_token_num); + const int max_token_num, + const bool swizzle_scale); std::vector DepermutePrefillCombine( const paddle::Tensor& x, @@ -1903,6 +1904,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) { py::arg("topk_ids"), py::arg("num_local_experts"), py::arg("max_token_num"), + py::arg("swizzle_scale") = false, "Prefill permute to masked GEMM for MoE"); m.def("depermute_prefill_combine", diff --git a/custom_ops/gpu_ops/moe/prefill_permute_to_masked_gemm.cu b/custom_ops/gpu_ops/moe/prefill_permute_to_masked_gemm.cu index 09ec220e34d..170286e0b0d 100644 --- a/custom_ops/gpu_ops/moe/prefill_permute_to_masked_gemm.cu +++ b/custom_ops/gpu_ops/moe/prefill_permute_to_masked_gemm.cu @@ -14,9 +14,15 @@ #include "helper.h" +#include + constexpr int BLOCK_THREADS = 512; -template +template __global__ void PrefillPermuteToMaskedGemmKernel( T* __restrict__ permute_x, ScaleT* __restrict__ permute_scale, @@ -34,8 +40,12 @@ __global__ void PrefillPermuteToMaskedGemmKernel( const int tidx = threadIdx.x; const int x_num_vecs = hidden / VecSize; - constexpr int ScaleVecSize = 16 / sizeof(float); // 4 - const int scale_num_vecs = hidden_scale / ScaleVecSize; + // Pre-compute swizzle constants outside the slot loop (compile-time dead if + // !SWIZZLE_SCALE) + const int scale_bytes_per_token = + hidden_scale * static_cast(sizeof(ScaleT)); + const int m_tiles = max_tokens_per_expert / 128; + const int k_tiles = scale_bytes_per_token / 4; for (int token_idx = blockIdx.x; token_idx < num_tokens; token_idx += gridDim.x) { @@ -71,20 +81,49 @@ __global__ void PrefillPermuteToMaskedGemmKernel( Store(vec_x, dst_x + v * VecSize); } - // Copy scale[token_idx, :] -> permute_scale with transposed layout - // Physical layout is [E, S, M], accessed as [E, M, S] via strides [S*M, - // 1, M] So permute_scale[expert_idx, offset, s] -> physical addr: - // expert_idx*(S*M) + offset + s*M const ScaleT* src_scale = scale + static_cast(token_idx) * hidden_scale; - ScaleT* dst_scale_base = permute_scale + - static_cast(expert_idx) * - hidden_scale * max_tokens_per_expert + - offset; - - for (int s = tidx; s < hidden_scale; s += BLOCK_THREADS) { - dst_scale_base[static_cast(s) * max_tokens_per_expert] = - src_scale[s]; + + if constexpr (SWIZZLE_SCALE) { + // Directly write packed FP8 scale bytes into the swizzled layout used + // by flashinfer cutedsl: [E, M/128, K/4, 32, 4, 4]. The tensor is + // exposed to Paddle as packed float32 [E, M, K/4]. + const uint8_t* src_scale_bytes = + reinterpret_cast(src_scale); + uint8_t* dst_scale_bytes = reinterpret_cast(permute_scale); + const int rm = offset >> 7; + const int m_in = offset & 127; + const int m_in2 = m_in >> 5; + const int m_in3 = m_in & 31; + + for (int s = tidx; s < scale_bytes_per_token; s += BLOCK_THREADS) { + const int rk = s >> 2; + const int k_in = s & 3; + const int64_t dst_idx = + (((((static_cast(expert_idx) * m_tiles + rm) * + k_tiles + + rk) * + 32 + + m_in3) * + 4 + + m_in2) * + 4 + + k_in); + dst_scale_bytes[dst_idx] = src_scale_bytes[s]; + } + } else { + // Copy scale[token_idx, :] -> permute_scale with transposed layout. + // Physical layout is [E, S, M], accessed as [E, M, S] via strides + // [S*M, 1, M]. + ScaleT* dst_scale_base = permute_scale + + static_cast(expert_idx) * + hidden_scale * max_tokens_per_expert + + offset; + + for (int s = tidx; s < hidden_scale; s += BLOCK_THREADS) { + dst_scale_base[static_cast(s) * max_tokens_per_expert] = + src_scale[s]; + } } __syncthreads(); @@ -102,7 +141,8 @@ std::vector PrefillPermuteToMaskedGemmDispatch( const paddle::Tensor& scale, const paddle::Tensor& topk_ids, const int num_local_experts, - const int max_token_num) { + const int max_token_num, + const bool swizzle_scale) { typedef PDTraits traits_; typedef PDTraits scale_traits_; typedef typename traits_::DataType DataType_; @@ -121,13 +161,26 @@ std::vector PrefillPermuteToMaskedGemmDispatch( auto permute_x = GetEmptyTensor( {num_local_experts, max_token_num, hidden}, x.dtype(), place); - auto permute_scale = - GetEmptyTensor({num_local_experts, max_token_num, hidden_scale}, - {static_cast(hidden_scale) * max_token_num, - 1, - static_cast(max_token_num)}, - ScaleD, - place); + paddle::Tensor permute_scale; + if (swizzle_scale) { + const int scale_bytes_per_token = + hidden_scale * static_cast(sizeof(ScaleDataType_)); + PD_CHECK(max_token_num % 128 == 0, + "swizzle_scale requires max_token_num to be divisible by 128"); + PD_CHECK(scale_bytes_per_token % 4 == 0, + "swizzle_scale requires the unpacked FP8 scale dimension to be " + "divisible by 4"); + permute_scale = GetEmptyTensor( + {num_local_experts, max_token_num, hidden_scale}, ScaleD, place); + } else { + permute_scale = + GetEmptyTensor({num_local_experts, max_token_num, hidden_scale}, + {static_cast(hidden_scale) * max_token_num, + 1, + static_cast(max_token_num)}, + ScaleD, + place); + } auto permuted_indice_map = GetEmptyTensor({num_tokens, topk}, paddle::DataType::INT32, place); @@ -154,21 +207,34 @@ std::vector PrefillPermuteToMaskedGemmDispatch( cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev); int num_blocks = sm_count * 2; - PrefillPermuteToMaskedGemmKernel - <<>>( - reinterpret_cast(permute_x.data()), - reinterpret_cast( - permute_scale.template data()), - permuted_indice_map.data(), - token_nums_per_expert.data(), - reinterpret_cast(x.data()), - reinterpret_cast( - scale.template data()), - topk_ids.data(), - num_tokens, - hidden, - hidden_scale, - max_token_num); +#define LAUNCH_PREFILL_PERMUTE(SWIZZLE) \ + PrefillPermuteToMaskedGemmKernel \ + <<>>( \ + reinterpret_cast(permute_x.data()), \ + reinterpret_cast( \ + permute_scale.template data()), \ + permuted_indice_map.data(), \ + token_nums_per_expert.data(), \ + reinterpret_cast(x.data()), \ + reinterpret_cast( \ + scale.template data()), \ + topk_ids.data(), \ + num_tokens, \ + hidden, \ + hidden_scale, \ + max_token_num) + + if (swizzle_scale) { + LAUNCH_PREFILL_PERMUTE(true); + } else { + LAUNCH_PREFILL_PERMUTE(false); + } + +#undef LAUNCH_PREFILL_PERMUTE return {permute_x, permute_scale, permuted_indice_map, token_nums_per_expert}; } @@ -178,13 +244,20 @@ std::vector PrefillPermuteToMaskedGemm( const paddle::Tensor& scale, const paddle::Tensor& topk_ids, const int num_local_experts, - const int max_token_num) { + const int max_token_num, + const bool swizzle_scale) { + if (swizzle_scale) { + PD_CHECK(x.dtype() == paddle::DataType::UINT8 && + scale.dtype() == paddle::DataType::FLOAT32, + "swizzle_scale=true is only valid for UINT8 x + FLOAT32 scale " + "(FP4 comm quant path)"); + } const int topk = topk_ids.shape()[1]; #define DISPATCH_TOPK(DTYPE, SCALE_DTYPE, TOPK_VAL) \ case TOPK_VAL: \ return PrefillPermuteToMaskedGemmDispatch( \ - x, scale, topk_ids, num_local_experts, max_token_num); + x, scale, topk_ids, num_local_experts, max_token_num, swizzle_scale); switch (x.dtype()) { case paddle::DataType::FLOAT8_E4M3FN: { @@ -274,7 +347,8 @@ std::vector> PrefillPermuteToMaskedGemmInferShape( const std::vector& scale_shape, const std::vector& topk_ids_shape, const int num_local_experts, - const int max_token_num) { + const int max_token_num, + const bool swizzle_scale) { int64_t num_tokens = x_shape[0]; int64_t hidden = x_shape[1]; int64_t hidden_scale = scale_shape[1]; @@ -293,7 +367,8 @@ std::vector PrefillPermuteToMaskedGemmInferDtype( const paddle::DataType& scale_dtype, const paddle::DataType& topk_ids_dtype, const int num_local_experts, - const int max_token_num) { + const int max_token_num, + const bool swizzle_scale) { return { x_dtype, scale_dtype, paddle::DataType::INT32, paddle::DataType::INT32}; } @@ -304,7 +379,9 @@ PD_BUILD_STATIC_OP(prefill_permute_to_masked_gemm) "permute_scale", "permuted_indice_map", "token_nums_per_expert"}) - .Attrs({"num_local_experts: int", "max_token_num: int"}) + .Attrs({"num_local_experts: int", + "max_token_num: int", + "swizzle_scale: bool"}) .SetKernelFn(PD_KERNEL(PrefillPermuteToMaskedGemm)) .SetInferShapeFn(PD_INFER_SHAPE(PrefillPermuteToMaskedGemmInferShape)) .SetInferDtypeFn(PD_INFER_DTYPE(PrefillPermuteToMaskedGemmInferDtype)); diff --git a/fastdeploy/envs.py b/fastdeploy/envs.py index 9e3e1394be2..1ed52cfc499 100644 --- a/fastdeploy/envs.py +++ b/fastdeploy/envs.py @@ -281,6 +281,9 @@ def _validate_split_kv_size(value: int) -> int: # At runtime, token counts not in this list fall back to eager execution. # Example: "1,2,4,8,16,32,64,128,256,512" "FD_BLOCK_WISE_CUDA_GRAPH_SIZES": lambda: os.getenv("FD_BLOCK_WISE_CUDA_GRAPH_SIZES", "128,256,512,1024,2048"), + # When set to 1, print which op / shape enters the block-wise CUDA Graph + # during the capture phase. Defaults to 0 (silent). + "FD_BLOCK_WISE_DEBUG": lambda: bool(int(os.getenv("FD_BLOCK_WISE_DEBUG", "0"))), } diff --git a/fastdeploy/model_executor/graph_optimization/cuda_graph_op.py b/fastdeploy/model_executor/graph_optimization/cuda_graph_op.py index 4e409058a43..3daecc87f69 100644 --- a/fastdeploy/model_executor/graph_optimization/cuda_graph_op.py +++ b/fastdeploy/model_executor/graph_optimization/cuda_graph_op.py @@ -16,10 +16,10 @@ import functools import inspect -import logging from typing import Callable, Optional, Sequence import paddle +from paddleformers.utils.log import logger as _LOGGER import fastdeploy @@ -35,9 +35,6 @@ # Global counter / registry of all captured block-wise graphs (for logging). # Each entry: (qualname, key, shared_mode) _CAPTURED_GRAPH_LOG: list = [] -_LOGGER = logging.getLogger("fastdeploy.block_wise_cuda_graph") -if not _LOGGER.handlers: - _LOGGER.setLevel(logging.INFO) def get_captured_graph_log(): @@ -49,17 +46,18 @@ def dump_captured_graph_summary(): """Print a summary of all captured block-wise CUDA graphs.""" from collections import Counter + if not fastdeploy.envs.FD_BLOCK_WISE_DEBUG: + return if not _CAPTURED_GRAPH_LOG: _LOGGER.info("[block_wise_cuda_graph] no graph captured") return counter = Counter(q for q, _, _ in _CAPTURED_GRAPH_LOG) _LOGGER.info( - "[block_wise_cuda_graph] total captured graphs=%d across %d distinct methods:", - len(_CAPTURED_GRAPH_LOG), - len(counter), + f"[block_wise_cuda_graph] total captured graphs={len(_CAPTURED_GRAPH_LOG)} " + f"across {len(counter)} distinct methods:" ) for qname, cnt in sorted(counter.items(), key=lambda x: -x[1]): - _LOGGER.info(" - %s : %d graph(s)", qname, cnt) + _LOGGER.info(f" - {qname} : {cnt} graph(s)") def set_block_wise_capturing(capturing: bool): @@ -272,13 +270,11 @@ def wrapper(self, *args, **kwargs): # --- Log which op just entered the CUDA graph --- _CAPTURED_GRAPH_LOG.append((_qualname, key, _shared)) - _LOGGER.info( - "[block_wise_cuda_graph] captured #%d op=%s shared=%s key=%s", - len(_CAPTURED_GRAPH_LOG), - _qualname, - _shared, - key, - ) + if fastdeploy.envs.FD_BLOCK_WISE_DEBUG: + _LOGGER.info( + f"[block_wise_cuda_graph] captured #{len(_CAPTURED_GRAPH_LOG)} " + f"op={_qualname} shared={_shared} key={key}" + ) graph.replay() diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_blackwell_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_blackwell_backend.py index eb3fd3ce3b0..b1b20a9e8ed 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_blackwell_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_blackwell_backend.py @@ -88,7 +88,7 @@ def call_prefill_permute_to_masked_gemm( if topk_ids.dtype != paddle.int64: topk_ids = topk_ids.cast(paddle.int64) - results = prefill_permute_to_masked_gemm(x, scale, topk_ids, num_local_experts, max_token_num) + results = prefill_permute_to_masked_gemm(x, scale, topk_ids, num_local_experts, max_token_num, False) return results[0], results[1], results[2], results[3] diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py index 9c6d174daa4..7d7240fe042 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py @@ -79,7 +79,7 @@ def call_prefill_permute_to_masked_gemm( if topk_ids.dtype != paddle.int64: topk_ids = topk_ids.cast(paddle.int64) - results = prefill_permute_to_masked_gemm(x, scale, topk_ids, num_local_experts, max_token_num) + results = prefill_permute_to_masked_gemm(x, scale, topk_ids, num_local_experts, max_token_num, False) return results[0], results[1], results[2], results[3] diff --git a/fastdeploy/model_executor/layers/quantization/nvfp4.py b/fastdeploy/model_executor/layers/quantization/nvfp4.py index e4c2d21b7e2..d3a34c429b6 100644 --- a/fastdeploy/model_executor/layers/quantization/nvfp4.py +++ b/fastdeploy/model_executor/layers/quantization/nvfp4.py @@ -81,6 +81,7 @@ def call_prefill_permute_to_masked_gemm( topk_ids: paddle.Tensor, num_local_experts: int, max_token_num: int, + swizzle_scale: bool = False, ): """ Permute input tokens and scales from token-major to expert-major layout @@ -92,6 +93,7 @@ def call_prefill_permute_to_masked_gemm( topk_ids: Expert routing indices [num_tokens, topk] (int64 or int32). num_local_experts: Number of local experts on this device. max_token_num: Maximum tokens per expert buffer. + swizzle_scale: Whether to directly write scale in flashinfer swizzled layout. Returns: tuple: (permute_x, permute_scale, permuted_indice_map, token_nums_per_expert) @@ -104,7 +106,7 @@ def call_prefill_permute_to_masked_gemm( if scale is None: scale = paddle.empty([0], dtype=paddle.float32) - results = prefill_permute_to_masked_gemm(x, scale, topk_ids, num_local_experts, max_token_num) + results = prefill_permute_to_masked_gemm(x, scale, topk_ids, num_local_experts, max_token_num, swizzle_scale) return results[0], results[1], results[2], results[3] @@ -366,7 +368,7 @@ def _create_weight_scales(self, layer, weight_scale_shape, weight_scale_2_shape, ) def process_weights_after_loading(self, layer) -> None: - logger.info("跑了Nvfp4") + input_scale_2 = layer.input_scale.max().to(paddle.float32) weight_scale_2 = layer.weight_scale_2.max().to(paddle.float32) alpha = input_scale_2 * weight_scale_2 @@ -606,7 +608,7 @@ def load_up_proj_weight_first(self) -> bool: def process_weights_after_loading(self, layer): """ """ # FlashInfer CUTLASS kernel assumes [Up, Gate] Proj as W13 - # logger.info(f"跑了fp4 moe了") + if self.backend == "flashinfer-cutlass": [a, b] = layer.up_gate_proj_weight.split(2, axis=1) layer.up_gate_proj_weight.set_value(paddle.concat([b, a], axis=1)) @@ -667,15 +669,31 @@ def apply_ep_prefill( topk_ids_hookfunc: Callable = None, shared_experts: nn.Layer = None, ) -> paddle.Tensor: + # 1. top experts and weights gate_out = gate(x.cast("float32")) - # gate_out = paddle.randn(gate_out.shape,dtype="float32") topk_idx, topk_weights = self.ep_prefill_runner.moe_select(layer, gate_out) hidden_size = x.shape[1] if topk_ids_hookfunc is not None: topk_ids_hookfunc(topk_ids=topk_idx) + use_fp4_comm_quant = envs.FD_USE_NVFP4_COMM_QUANT + + if use_fp4_comm_quant: + # FP4 communication quantization: quantize to FP4 before dispatch, + # reducing communication volume by ~2x vs BF16. + x_fp4, x_fp4_scale = fp4_quantize( + x, layer.up_gate_proj_input_scale_quant, sf_vec_size=16, is_sf_swizzled_layout=False + ) + x_fp4_scale = x_fp4_scale.view(paddle.float32) # float8_e4m3fn -> float32 + dispatch_input = x_fp4 + dispatch_scale = x_fp4_scale + else: + # BF16 communication: dispatch BF16 data without pre-quantization. + dispatch_input = x + dispatch_scale = None + event = deep_ep.Buffer.capture() if self.ep_prefill_runner.num_worst_tokens <= 0: @@ -690,11 +708,12 @@ def apply_ep_prefill( handle, event, ) = self.ep_prefill_runner.dispatch( - x, + dispatch_input, topk_idx, topk_weights, expert_alignment=128, previous_event=event, + x_scale_tensor=dispatch_scale, ) if self.ep_prefill_runner.num_worst_tokens > 0: @@ -735,7 +754,7 @@ def apply_ep_prefill( if self.ep_prefill_runner.num_worst_tokens > 0: use_tbo = os.getenv("USE_TBO", "0") - token_split_factor = 8 if int(use_tbo) == 1 else 1 + token_split_factor = 2 if int(use_tbo) == 1 else 1 max_tokens_per_rank = ( layer.fd_config.scheduler_config.max_num_batched_tokens // layer.fd_config.parallel_config.tensor_parallel_size @@ -749,28 +768,47 @@ def apply_ep_prefill( topk_ids=recv_topk_idx, num_local_experts=layer.num_local_experts, max_token_num=layer.ep_size * max_tokens_per_rank, + swizzle_scale=recv_x_scale is not None, ) ) - max_token_num = layer.ep_size * max_tokens_per_rank - permute_input = permute_input.reshape([layer.num_local_experts, max_token_num, recv_x_value.shape[-1]]) - - # ffn_out: [num_local_experts, m, hidden_size] - # NVFP4 dispatch returns BF16 (no pre-quantized scale), so permute_scale is empty. - # Use per-expert 1/input_scale (up_gate_proj_input_scale_quant) as input_global_scale, - # consistent with apply_ep_decode which also uses this value directly. - ffn_out = flashinfer_cutedsl_moe_masked( - hidden_states=(permute_input, None), - input_global_scale=layer.up_gate_proj_input_scale_quant.expand([layer.num_local_experts]), - w1=layer.up_gate_proj_weight, - w1_blockscale=layer.up_gate_proj_blockscale_swizzled, - w1_alpha=layer.g1_alphas, - w2=layer.down_proj_weight, - a2_global_scale=layer.down_proj_input_scale_quant.expand([layer.num_local_experts]), - w2_blockscale=layer.down_proj_blockscale_swizzled, - w2_alpha=layer.g2_alphas, - masked_m=token_nums_per_expert.squeeze(-1), - ) + if recv_x_scale is not None: + # FP4 pre-quantized dispatch path: + # permute_input is uint8 [E, M, hidden//2] (FP4 packed) + # permute_scale is packed float32 [E, M, hidden//64] whose + # underlying FP8 bytes were already written in swizzled layout + # by prefill_permute_to_masked_gemm. + permute_scale_swizzled = permute_scale.view(paddle.float8_e4m3fn) + permute_input_t = permute_input.transpose([1, 2, 0]) + permute_scale_swizzled_t = permute_scale_swizzled.transpose([1, 2, 0]) + + ffn_out = flashinfer_cutedsl_moe_masked( + hidden_states=(permute_input_t, permute_scale_swizzled_t), + input_global_scale=None, + w1=layer.up_gate_proj_weight, + w1_blockscale=layer.up_gate_proj_blockscale_swizzled, + w1_alpha=layer.g1_alphas, + w2=layer.down_proj_weight, + a2_global_scale=layer.down_proj_input_scale_quant.expand([layer.num_local_experts]), + w2_blockscale=layer.down_proj_blockscale_swizzled, + w2_alpha=layer.g2_alphas, + masked_m=token_nums_per_expert.squeeze(-1), + ) + else: + # BF16 dispatch path: permute_input is BF16, quantize to FP4 + # inside flashinfer_cutedsl_moe_masked + ffn_out = flashinfer_cutedsl_moe_masked( + hidden_states=(permute_input, None), + input_global_scale=layer.up_gate_proj_input_scale_quant.expand([layer.num_local_experts]), + w1=layer.up_gate_proj_weight, + w1_blockscale=layer.up_gate_proj_blockscale_swizzled, + w1_alpha=layer.g1_alphas, + w2=layer.down_proj_weight, + a2_global_scale=layer.down_proj_input_scale_quant.expand([layer.num_local_experts]), + w2_blockscale=layer.down_proj_blockscale_swizzled, + w2_alpha=layer.g2_alphas, + masked_m=token_nums_per_expert.squeeze(-1), + ) tmp_ffn_out = call_depermute_prefill_combine( x=ffn_out, diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 5f24278890f..c1167609862 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -1084,8 +1084,8 @@ def get_input_length_list( # NOTE(wanglongzhi): When the full length is too large, DeepEP's buffer size will not be enough to cause the result to appear nan. # TODO(wanglongzhi): Figure out the accurate buffer size of DeepEP. - if self.fd_config.parallel_config.enable_expert_parallel: - input_length = min(input_length, 32) + # if self.fd_config.parallel_config.enable_expert_parallel: + # input_length = min(input_length, 32) block_num = ( input_length + self.cache_config.block_size - 1 @@ -2648,6 +2648,7 @@ def profile_run(self) -> None: logger.info( f"Dummy run with {num_tokens} tokens, mm_max_tokens_per_item: {self.model_config.mm_max_tokens_per_item}" ) + self._dummy_run( num_tokens=num_tokens, batch_size=self.scheduler_config.max_num_seqs, @@ -3209,7 +3210,7 @@ def capture_block_wise_graphs(self) -> None: Pre-captures graphs for designated token counts so that at runtime, matching sizes replay the graph while other sizes fall back to eager. """ - if envs.FD_USE_BLOCK_WISE_CUDA_GRAPH: + if not envs.FD_USE_BLOCK_WISE_CUDA_GRAPH: return from fastdeploy.model_executor.graph_optimization.cuda_graph_op import ( # Parse capture sizes from env var diff --git a/tests/operators/test_permute_prefill_masked_gemm.py b/tests/operators/test_permute_prefill_masked_gemm.py index 89c28dab744..020815605d1 100644 --- a/tests/operators/test_permute_prefill_masked_gemm.py +++ b/tests/operators/test_permute_prefill_masked_gemm.py @@ -46,7 +46,7 @@ def call_prefill_permute_to_masked_gemm( if topk_ids.dtype != paddle.int64: topk_ids = topk_ids.cast(paddle.int64) - results = prefill_permute_to_masked_gemm(x, scale, topk_ids, num_local_experts, max_token_num) + results = prefill_permute_to_masked_gemm(x, scale, topk_ids, num_local_experts, max_token_num, False) return results[0], results[1], results[2], results[3] From b64368389fb820499fd93b340c523468ed7d07e6 Mon Sep 17 00:00:00 2001 From: lonelygsh <490991455@qq.com> Date: Fri, 15 May 2026 15:17:23 +0800 Subject: [PATCH 11/28] fix --- custom_ops/gpu_ops/helper.h | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/custom_ops/gpu_ops/helper.h b/custom_ops/gpu_ops/helper.h index bdc147c5534..fd5f3a27eb4 100644 --- a/custom_ops/gpu_ops/helper.h +++ b/custom_ops/gpu_ops/helper.h @@ -370,17 +370,10 @@ inline json readJsonFromFile(const std::string& filePath) { // paddle::GPUPlace() #ifdef PADDLE_DEV -<<<<<<< HEAD inline paddle::Tensor GetEmptyTensor(const common::DDim& dims, const paddle::DataType& dtype, const paddle::Place& place) { phi::Allocator* allocator = nullptr; -======= -inline paddle::Tensor GetEmptyTensor(const common::DDim &dims, - const paddle::DataType &dtype, - const paddle::Place &place) { - phi::Allocator *allocator = nullptr; ->>>>>>> 8790cec070ea2aa2ab19ea657f3707427cf46112 #if defined(PADDLE_WITH_CUDA) if (phi::backends::gpu::CUDAGraph::IsThisThreadCapturing()) { allocator = paddle::memory::allocation::AllocatorFacade::Instance() @@ -399,11 +392,11 @@ inline paddle::Tensor GetEmptyTensor(const common::DDim &dims, return paddle::Tensor(std::make_shared(dense_tensor)); } -inline paddle::Tensor GetEmptyTensor(const common::DDim &dims, - const common::DDim &strides, - const paddle::DataType &dtype, - const paddle::Place &place) { - phi::Allocator *allocator = nullptr; +inline paddle::Tensor GetEmptyTensor(const common::DDim& dims, + const common::DDim& strides, + const paddle::DataType& dtype, + const paddle::Place& place) { + phi::Allocator* allocator = nullptr; #if defined(PADDLE_WITH_CUDA) if (phi::backends::gpu::CUDAGraph::IsThisThreadCapturing()) { allocator = paddle::memory::allocation::AllocatorFacade::Instance() From 2f4151c693a0a8c8e4285cde5bbd9ececd506496 Mon Sep 17 00:00:00 2001 From: lonelygsh <490991455@qq.com> Date: Fri, 15 May 2026 16:30:28 +0800 Subject: [PATCH 12/28] add test --- tests/quantization/test_modelopt_nvfp4.py | 300 ++++++++++++++++++++++ 1 file changed, 300 insertions(+) diff --git a/tests/quantization/test_modelopt_nvfp4.py b/tests/quantization/test_modelopt_nvfp4.py index a68588d1779..d8c1995778a 100644 --- a/tests/quantization/test_modelopt_nvfp4.py +++ b/tests/quantization/test_modelopt_nvfp4.py @@ -570,5 +570,305 @@ def fake_cutlass_fused_moe(**kwargs): sys.modules["flashinfer.fused_moe"] = prev_fused +class TestFlashInferCuteDSLMoEHelpers(unittest.TestCase): + """Unit tests for helper functions in flashinfer_cutedsl_moe module.""" + + def setUp(self): + from fastdeploy.model_executor.layers.moe import ( + flashinfer_cutedsl_moe as cutedsl_mod, + ) + + self.mod = cutedsl_mod + + def test_dtype_str_normalizes_paddle_dtype(self): + """_dtype_str should strip the leading namespace from a paddle dtype.""" + t = paddle.zeros([1], dtype=paddle.float32) + self.assertEqual(self.mod._dtype_str(t.dtype), "float32") + + def test_is_dtype_matches_one_of_names(self): + """_is_dtype returns True when tensor dtype name is in the candidate list.""" + t = paddle.zeros([1], dtype=paddle.uint8) + self.assertTrue(self.mod._is_dtype(t, "uint8")) + self.assertTrue(self.mod._is_dtype(t, "float32", "uint8")) + self.assertFalse(self.mod._is_dtype(t, "float32", "bfloat16")) + + def test_perm_uses_transpose_for_paddle_tensor(self): + """_perm should perform a transpose (axis permutation) for paddle tensors.""" + t = paddle.zeros([2, 3, 4], dtype=paddle.float32) + out = self.mod._perm(t, 1, 2, 0) + self.assertEqual(list(out.shape), [3, 4, 2]) + + def test_get_cute_dtype_supported(self): + """get_cute_dtype maps supported paddle dtypes to flashinfer dtype strings.""" + bf = paddle.zeros([1], dtype=paddle.bfloat16) + f16 = paddle.zeros([1], dtype=paddle.float16) + f32 = paddle.zeros([1], dtype=paddle.float32) + self.assertEqual(self.mod.get_cute_dtype(bf), "bfloat16") + self.assertEqual(self.mod.get_cute_dtype(f16), "float16") + self.assertEqual(self.mod.get_cute_dtype(f32), "float32") + + def test_get_cute_dtype_unsupported_raises(self): + """get_cute_dtype raises ValueError for unsupported dtypes.""" + u8 = paddle.zeros([1], dtype=paddle.uint8) + with self.assertRaises(ValueError): + self.mod.get_cute_dtype(u8) + + +class TestFlashInferCuteDSLMoEMasked(unittest.TestCase): + """Unit tests for flashinfer_cutedsl_moe_masked covering both pre-quantized and standard paths. + + These tests fully mock the FlashInfer CuteDSL kernels (grouped_gemm_nt_masked, + silu_and_mul_scaled_nvfp4_experts_quantize, scaled_fp4_grouped_quantize) so the + suite runs on any GPU. The tests verify call ordering, tensor wiring (alpha + reshape, expert-last permutation), and the output shape contract. + """ + + def setUp(self): + from fastdeploy.model_executor.layers.moe import ( + flashinfer_cutedsl_moe as cutedsl_mod, + ) + + self.mod = cutedsl_mod + # Sizes chosen small but k must be a multiple of sf_vec_size (16). + self.num_experts = 2 + self.m = 4 + self.k = 32 # k//2 = 16, k//16 = 2 + self.n = 16 # intermediate_size; w2 last dim = n//2 = 8 + + def _make_weights(self): + E, m, k, n = self.num_experts, self.m, self.k, self.n + # FP4-packed weights as uint8 (k//2 bytes per row). + w1 = paddle.zeros([E, 2 * n, k // 2], dtype=paddle.uint8) + w2 = paddle.zeros([E, k, n // 2], dtype=paddle.uint8) + # Block scales (use uint8 stand-in for float8_e4m3fn; dtype check is mocked). + w1_bs = paddle.zeros([E, 2 * n, k // 16], dtype=paddle.uint8) + w2_bs = paddle.zeros([E, k, n // 16], dtype=paddle.uint8) + # Per-expert alphas / scales. + w1_alpha = paddle.ones([E], dtype=paddle.float32) + w2_alpha = paddle.ones([E], dtype=paddle.float32) + a2_gscale = paddle.ones([E], dtype=paddle.float32) + masked_m = paddle.full([E], m, dtype=paddle.int32) + return w1, w1_bs, w1_alpha, w2, w2_bs, w2_alpha, a2_gscale, masked_m + + def _patch_kernels(self): + """Install fake CuteDSL kernels that record calls and produce shape-correct outputs.""" + calls = {"gemm": [], "silu_quant": [], "fp4_quant": []} + + def fake_grouped_gemm_nt_masked(ab, w_pair, out, masked_m, **kwargs): + calls["gemm"].append({"ab_shapes": [t.shape for t in ab], "out_shape": list(out.shape), **kwargs}) + return out + + def fake_silu_and_mul_scaled_nvfp4_experts_quantize(gateup, masked_m, a2_gscale): + E, m, two_n = gateup.shape + n = two_n // 2 + diq = paddle.zeros([E, m, n // 2], dtype=paddle.uint8) + diq_sf = paddle.zeros([E, m, n // 16], dtype=paddle.uint8) + calls["silu_quant"].append({"in_shape": list(gateup.shape)}) + return diq, diq_sf + + def fake_scaled_fp4_grouped_quantize(x, masked_m, input_global_scale): + E, m, k = x.shape + a_q = paddle.zeros([E, m, k // 2], dtype=paddle.uint8) + a_q_sf = paddle.zeros([E, m, k // 16], dtype=paddle.uint8) + calls["fp4_quant"].append({"in_shape": list(x.shape)}) + return a_q, a_q_sf + + return calls, mock.patch.multiple( + self.mod, + grouped_gemm_nt_masked=fake_grouped_gemm_nt_masked, + silu_and_mul_scaled_nvfp4_experts_quantize=fake_silu_and_mul_scaled_nvfp4_experts_quantize, + scaled_fp4_grouped_quantize=fake_scaled_fp4_grouped_quantize, + _is_dtype=lambda *args, **kwargs: True, # bypass strict dtype assertions + ) + + def test_masked_moe_standard_bf16_path(self): + """Standard path: bf16 [E, m, k] activations are quantized internally via scaled_fp4_grouped_quantize.""" + w1, w1_bs, w1_alpha, w2, w2_bs, w2_alpha, a2_gscale, masked_m = self._make_weights() + E, m, k = self.num_experts, self.m, self.k + hidden = paddle.zeros([E, m, k], dtype=paddle.bfloat16) + input_gscale = paddle.ones([E], dtype=paddle.float32) + + calls, patcher = self._patch_kernels() + with patcher: + out = self.mod.flashinfer_cutedsl_moe_masked( + hidden_states=(hidden, None), + input_global_scale=input_gscale, + w1=w1, + w1_blockscale=w1_bs, + w1_alpha=w1_alpha, + w2=w2, + a2_global_scale=a2_gscale, + w2_blockscale=w2_bs, + w2_alpha=w2_alpha, + masked_m=masked_m, + ) + + # Standard path triggers scaled_fp4_grouped_quantize once. + self.assertEqual(len(calls["fp4_quant"]), 1) + # Two GEMMs (gate+up, then down) plus one silu+mul re-quant in between. + self.assertEqual(len(calls["gemm"]), 2) + self.assertEqual(len(calls["silu_quant"]), 1) + # Output is [num_experts, m, k] bf16. + self.assertEqual(list(out.shape), [E, m, k]) + self.assertEqual(self.mod._dtype_str(out.dtype), "bfloat16") + + def test_masked_moe_prequantized_fp4_path(self): + """Pre-quantized path: hidden_states[1] is not None — skips internal quantization.""" + w1, w1_bs, w1_alpha, w2, w2_bs, w2_alpha, a2_gscale, masked_m = self._make_weights() + E, m, k = self.num_experts, self.m, self.k + # Pre-quantized layout: [m, k//2, num_experts] uint8 + [m, k//16, num_experts] (fp8 stand-in). + a_q = paddle.zeros([m, k // 2, E], dtype=paddle.uint8) + a_q_sf = paddle.zeros([m, k // 16, E], dtype=paddle.uint8) + + calls, patcher = self._patch_kernels() + # input_global_scale is unused on this path; pass None to mirror the real call site. + with patcher: + out = self.mod.flashinfer_cutedsl_moe_masked( + hidden_states=(a_q, a_q_sf), + input_global_scale=None, + w1=w1, + w1_blockscale=w1_bs, + w1_alpha=w1_alpha, + w2=w2, + a2_global_scale=a2_gscale, + w2_blockscale=w2_bs, + w2_alpha=w2_alpha, + masked_m=masked_m, + ) + + # Pre-quantized path must NOT call scaled_fp4_grouped_quantize. + self.assertEqual(len(calls["fp4_quant"]), 0) + self.assertEqual(len(calls["gemm"]), 2) + self.assertEqual(len(calls["silu_quant"]), 1) + self.assertEqual(list(out.shape), [E, m, k]) + + def test_masked_moe_gemm_kwargs(self): + """Both GEMMs must be invoked with the expected FP4/FP8 dtype contract and per-expert alpha shape.""" + w1, w1_bs, w1_alpha, w2, w2_bs, w2_alpha, a2_gscale, masked_m = self._make_weights() + E, m, k = self.num_experts, self.m, self.k + hidden = paddle.zeros([E, m, k], dtype=paddle.bfloat16) + input_gscale = paddle.ones([E], dtype=paddle.float32) + + calls, patcher = self._patch_kernels() + with patcher: + self.mod.flashinfer_cutedsl_moe_masked( + hidden_states=(hidden, None), + input_global_scale=input_gscale, + w1=w1, + w1_blockscale=w1_bs, + w1_alpha=w1_alpha, + w2=w2, + a2_global_scale=a2_gscale, + w2_blockscale=w2_bs, + w2_alpha=w2_alpha, + masked_m=masked_m, + ) + + for gemm in calls["gemm"]: + self.assertEqual(gemm["ab_dtype"], "float4_e2m1fn") + self.assertEqual(gemm["sf_dtype"], "float8_e4m3fn") + self.assertEqual(gemm["c_dtype"], "bfloat16") + self.assertEqual(gemm["sf_vec_size"], 16) + # alpha is reshaped to [1, 1, num_experts] before being passed in. + self.assertEqual(list(gemm["alpha"].shape), [1, 1, E]) + + def test_masked_moe_assert_w1_uint8(self): + """If w1 is not uint8, the dtype assertion must trip (verifies the first dtype guard fires).""" + w1, w1_bs, w1_alpha, w2, w2_bs, w2_alpha, a2_gscale, masked_m = self._make_weights() + # Force w1 to a bad dtype. + w1_bad = paddle.zeros(w1.shape, dtype=paddle.float32) + E, m, k = self.num_experts, self.m, self.k + hidden = paddle.zeros([E, m, k], dtype=paddle.bfloat16) + input_gscale = paddle.ones([E], dtype=paddle.float32) + + # No _is_dtype patch here — let the real check run. + with self.assertRaises(AssertionError): + self.mod.flashinfer_cutedsl_moe_masked( + hidden_states=(hidden, None), + input_global_scale=input_gscale, + w1=w1_bad, + w1_blockscale=w1_bs, + w1_alpha=w1_alpha, + w2=w2, + a2_global_scale=a2_gscale, + w2_blockscale=w2_bs, + w2_alpha=w2_alpha, + masked_m=masked_m, + ) + + def test_masked_moe_assert_hidden_states_tuple_len(self): + """hidden_states must be a tuple of length 2.""" + w1, w1_bs, w1_alpha, w2, w2_bs, w2_alpha, a2_gscale, masked_m = self._make_weights() + with mock.patch.object(self.mod, "_is_dtype", lambda *a, **k: True): + with self.assertRaises(AssertionError): + self.mod.flashinfer_cutedsl_moe_masked( + hidden_states=(paddle.zeros([1, 1, self.k]),), # length 1, invalid + input_global_scale=paddle.ones([self.num_experts], dtype=paddle.float32), + w1=w1, + w1_blockscale=w1_bs, + w1_alpha=w1_alpha, + w2=w2, + a2_global_scale=a2_gscale, + w2_blockscale=w2_bs, + w2_alpha=w2_alpha, + masked_m=masked_m, + ) + + def test_masked_moe_assert_w1_shape_mismatch(self): + """A mismatched w1 last-2 dim must trigger the shape assertion.""" + _, w1_bs, w1_alpha, w2, w2_bs, w2_alpha, a2_gscale, masked_m = self._make_weights() + E, m, k, n = self.num_experts, self.m, self.k, self.n + # w1 with wrong middle dim (should be 2*n). + bad_w1 = paddle.zeros([E, 2 * n + 1, k // 2], dtype=paddle.uint8) + hidden = paddle.zeros([E, m, k], dtype=paddle.bfloat16) + input_gscale = paddle.ones([E], dtype=paddle.float32) + + _, patcher = self._patch_kernels() + with patcher: + with self.assertRaises(AssertionError): + self.mod.flashinfer_cutedsl_moe_masked( + hidden_states=(hidden, None), + input_global_scale=input_gscale, + w1=bad_w1, + w1_blockscale=w1_bs, + w1_alpha=w1_alpha, + w2=w2, + a2_global_scale=a2_gscale, + w2_blockscale=w2_bs, + w2_alpha=w2_alpha, + masked_m=masked_m, + ) + + def test_masked_moe_down_signals_kwargs_forwarded(self): + """When down_sm_count or down_signals are set, the second GEMM must receive them as kwargs.""" + w1, w1_bs, w1_alpha, w2, w2_bs, w2_alpha, a2_gscale, masked_m = self._make_weights() + E, m, k = self.num_experts, self.m, self.k + hidden = paddle.zeros([E, m, k], dtype=paddle.bfloat16) + input_gscale = paddle.ones([E], dtype=paddle.float32) + signals = paddle.zeros([1], dtype=paddle.int32) + + calls, patcher = self._patch_kernels() + with patcher: + self.mod.flashinfer_cutedsl_moe_masked( + hidden_states=(hidden, None), + input_global_scale=input_gscale, + w1=w1, + w1_blockscale=w1_bs, + w1_alpha=w1_alpha, + w2=w2, + a2_global_scale=a2_gscale, + w2_blockscale=w2_bs, + w2_alpha=w2_alpha, + masked_m=masked_m, + down_sm_count=8, + down_signals=signals, + ) + + # GEMM2 (second call) carries the down_* kwargs; GEMM1 does not. + self.assertNotIn("sm_count", calls["gemm"][0]) + self.assertEqual(calls["gemm"][1]["sm_count"], 8) + self.assertIs(calls["gemm"][1]["dst_signals"], signals) + + if __name__ == "__main__": unittest.main() From 8443d623d1a9e0ab13984bfe09d1b35cbff24717 Mon Sep 17 00:00:00 2001 From: lonelygsh <490991455@qq.com> Date: Fri, 15 May 2026 16:44:20 +0800 Subject: [PATCH 13/28] update develop --- custom_ops/gpu_ops/helper.h | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/custom_ops/gpu_ops/helper.h b/custom_ops/gpu_ops/helper.h index 232d01bc47f..47c9002a5da 100644 --- a/custom_ops/gpu_ops/helper.h +++ b/custom_ops/gpu_ops/helper.h @@ -376,10 +376,10 @@ inline json readJsonFromFile(const std::string& filePath) { // paddle::GPUPlace() #ifdef PADDLE_DEV -inline paddle::Tensor GetEmptyTensor(const common::DDim &dims, - const paddle::DataType &dtype, - const paddle::Place &place) { - phi::Allocator *allocator = nullptr; +inline paddle::Tensor GetEmptyTensor(const common::DDim& dims, + const paddle::DataType& dtype, + const paddle::Place& place) { + phi::Allocator* allocator = nullptr; // TODO (yuzhe): remove !defined(PADDLE_WITH_COREX) when // https://github.com/PaddlePaddle/Paddle/pull/78813 merged #if defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_COREX) @@ -400,23 +400,14 @@ inline paddle::Tensor GetEmptyTensor(const common::DDim &dims, return paddle::Tensor(std::make_shared(dense_tensor)); } -<<<<<<< HEAD inline paddle::Tensor GetEmptyTensor(const common::DDim& dims, const common::DDim& strides, const paddle::DataType& dtype, const paddle::Place& place) { phi::Allocator* allocator = nullptr; -#if defined(PADDLE_WITH_CUDA) -======= -inline paddle::Tensor GetEmptyTensor(const common::DDim &dims, - const common::DDim &strides, - const paddle::DataType &dtype, - const paddle::Place &place) { - phi::Allocator *allocator = nullptr; // TODO (yuzhe): remove !defined(PADDLE_WITH_COREX) when // https://github.com/PaddlePaddle/Paddle/pull/78813 merged #if defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_COREX) ->>>>>>> e3541c20c7251ce4ff201743331e2872d17fab2e if (phi::backends::gpu::CUDAGraph::IsThisThreadCapturing()) { allocator = paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(place) From d7f98f0ac773ab59cf6fc44502a3e145a40214c1 Mon Sep 17 00:00:00 2001 From: lonelygsh <490991455@qq.com> Date: Fri, 15 May 2026 17:29:21 +0800 Subject: [PATCH 14/28] fix --- fastdeploy/envs.py | 20 ++++++++++---------- fastdeploy/worker/gpu_model_runner.py | 3 --- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/fastdeploy/envs.py b/fastdeploy/envs.py index 24d07da2edf..350a3ce900b 100644 --- a/fastdeploy/envs.py +++ b/fastdeploy/envs.py @@ -261,6 +261,16 @@ def _validate_split_kv_size(value: int) -> int: "FD_SAVE_OUTPUT_CACHE_FOR_PREEMPTED_REQUEST": lambda: bool( int(os.getenv("FD_SAVE_OUTPUT_CACHE_FOR_PREEMPTED_REQUEST", "1")) ), + # Whether to enable block-wise CUDA Graph capture/replay. + # When enabled, individual layer forward methods decorated with @block_wise_cuda_graph_wrap + # will be captured and replayed as CUDA Graphs for improved performance. + # Set to 1 to enable; defaults to 0 (disabled). + "FD_USE_BLOCK_WISE_CUDA_GRAPH": lambda: bool(int(os.getenv("FD_USE_BLOCK_WISE_CUDA_GRAPH", "0"))), + # Comma-separated list of token counts to pre-capture for block-wise CUDA Graphs. + # Used during the warmup phase to pre-capture graphs for these specific sizes. + # At runtime, token counts not in this list fall back to eager execution. + # Example: "1,2,4,8,16,32,64,128,256,512" + "FD_BLOCK_WISE_CUDA_GRAPH_SIZES": lambda: os.getenv("FD_BLOCK_WISE_CUDA_GRAPH_SIZES", "128,256,512,1024,2048"), # Suspend rollouting routing replay "FD_SUSPEND_ROUTING_REPLAY": lambda: bool(int(os.getenv("FD_SUSPEND_ROUTING_REPLAY", "0"))), # train-infer consistency, used in RL @@ -283,16 +293,6 @@ def _validate_split_kv_size(value: int) -> int: "FD_FP8_QUANT_WITH_POW2SCALE": lambda: bool(int(os.getenv("FD_FP8_QUANT_WITH_POW2SCALE", "0"))), # enable kv cache manager v1 "ENABLE_V1_KVCACHE_MANAGER": lambda: int(os.getenv("ENABLE_V1_KVCACHE_MANAGER", "0")), - # Whether to enable block-wise CUDA Graph capture/replay. - # When enabled, individual layer forward methods decorated with @block_wise_cuda_graph_wrap - # will be captured and replayed as CUDA Graphs for improved performance. - # Set to 1 to enable; defaults to 0 (disabled). - "FD_USE_BLOCK_WISE_CUDA_GRAPH": lambda: bool(int(os.getenv("FD_USE_BLOCK_WISE_CUDA_GRAPH", "0"))), - # Comma-separated list of token counts to pre-capture for block-wise CUDA Graphs. - # Used during the warmup phase to pre-capture graphs for these specific sizes. - # At runtime, token counts not in this list fall back to eager execution. - # Example: "1,2,4,8,16,32,64,128,256,512" - "FD_BLOCK_WISE_CUDA_GRAPH_SIZES": lambda: os.getenv("FD_BLOCK_WISE_CUDA_GRAPH_SIZES", "128,256,512,1024,2048"), # When set to 1, print which op / shape enters the block-wise CUDA Graph # during the capture phase. Defaults to 0 (silent). "FD_BLOCK_WISE_DEBUG": lambda: bool(int(os.getenv("FD_BLOCK_WISE_DEBUG", "0"))), diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index f18774029ee..9e97d7f7817 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -1172,9 +1172,6 @@ def get_input_length_list( ) # NOTE(wanglongzhi): When the full length is too large, DeepEP's buffer size will not be enough to cause the result to appear nan. - # TODO(wanglongzhi): Figure out the accurate buffer size of DeepEP. - # if self.fd_config.parallel_config.enable_expert_parallel: - # input_length = min(input_length, 32) block_num = ( input_length + self.cache_config.block_size - 1 From 264dbd84ad2108a690d2da00e24a2ed81329fce1 Mon Sep 17 00:00:00 2001 From: lonelygsh <490991455@qq.com> Date: Fri, 15 May 2026 17:30:31 +0800 Subject: [PATCH 15/28] fix --- fastdeploy/worker/gpu_model_runner.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 9e97d7f7817..a8f94c6972b 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -1172,6 +1172,9 @@ def get_input_length_list( ) # NOTE(wanglongzhi): When the full length is too large, DeepEP's buffer size will not be enough to cause the result to appear nan. + # TODO(wanglongzhi): Figure out the accurate buffer size of DeepEP. + if self.fd_config.parallel_config.enable_expert_parallel: + input_length = min(input_length, 32) block_num = ( input_length + self.cache_config.block_size - 1 From eae9a559a72f1c1c4a5878de519c9defffe47c7e Mon Sep 17 00:00:00 2001 From: root Date: Mon, 18 May 2026 18:59:56 +0800 Subject: [PATCH 16/28] delete --- fastdeploy/model_executor/layers/linear.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/fastdeploy/model_executor/layers/linear.py b/fastdeploy/model_executor/layers/linear.py index 2f74f8c6f21..37989be0683 100644 --- a/fastdeploy/model_executor/layers/linear.py +++ b/fastdeploy/model_executor/layers/linear.py @@ -952,9 +952,6 @@ def all2all_transpose(self, x: paddle.Tensor) -> paddle.Tensor: out.reshape_([x.shape[0] // self.tp_size, self.input_size]) return out - # NOTE: do NOT wrap with @block_wise_cuda_graph_wrap here. - # This forward contains collective comms (alltoall / all_reduce) which - # cannot be captured into a CUDA Graph. def forward_cuda(self, x: paddle.Tensor) -> paddle.Tensor: if self.split_token: x = self.all2all_transpose(x) From 9205ac7798f7d2d33e874d1fcc7fa31be41bffb6 Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Mon, 18 May 2026 19:18:58 +0800 Subject: [PATCH 17/28] update --- fastdeploy/envs.py | 2 +- fastdeploy/model_executor/layers/quantization/nvfp4.py | 9 +++------ 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/fastdeploy/envs.py b/fastdeploy/envs.py index 350a3ce900b..4a0af2e15a0 100644 --- a/fastdeploy/envs.py +++ b/fastdeploy/envs.py @@ -277,7 +277,7 @@ def _validate_split_kv_size(value: int) -> int: # Whether to align RoPE and moe gate precision with training "FD_ENABLE_RL": lambda: int(os.getenv("FD_ENABLE_RL", "0")), # Whether to enable FP4 communication quantization for DeepEP prefill dispatch - "FD_USE_NVFP4_COMM_QUANT": lambda: bool(int(os.getenv("FD_USE_NVFP4_COMM_QUANT", "0"))), + "FD_DISPATCH_USE_FP4": lambda: bool(int(os.getenv("FD_DISPATCH_USE_FP4", "0"))), # Whether to use phi FP8 quantization,if 1,use paddle default. "FD_USE_PHI_FP8_QUANT": lambda: bool(int(os.getenv("FD_USE_PHI_FP8_QUANT", "1"))), # Enables the Paddle/phi combined TopK operator only when topk_method == noaux_tc, diff --git a/fastdeploy/model_executor/layers/quantization/nvfp4.py b/fastdeploy/model_executor/layers/quantization/nvfp4.py index 0546ab180b0..12a2c8f19fd 100644 --- a/fastdeploy/model_executor/layers/quantization/nvfp4.py +++ b/fastdeploy/model_executor/layers/quantization/nvfp4.py @@ -427,10 +427,6 @@ def apply( x_fp4, x_scale_interleaved = fp4_quantize(x, layer.input_scale_inv) assert x_fp4.dtype == paddle.uint8 - assert layer.weight.dtype == paddle.uint8 - assert layer.weight_scale_interleaved.dtype == paddle.float8_e4m3fn - assert layer.alpha.dtype == paddle.float32 - if self.backend.startswith("flashinfer-"): backend = self.backend[len("flashinfer-") :] else: @@ -680,14 +676,15 @@ def apply_ep_prefill( if topk_ids_hookfunc is not None: topk_ids_hookfunc(topk_ids=topk_idx) - use_fp4_comm_quant = envs.FD_USE_NVFP4_COMM_QUANT + dispatch_use_fp4 = envs.FD_DISPATCH_USE_FP4 - if use_fp4_comm_quant: + if dispatch_use_fp4: # FP4 communication quantization: quantize to FP4 before dispatch, # reducing communication volume by ~2x vs BF16. x_fp4, x_fp4_scale = fp4_quantize( x, layer.up_gate_proj_input_scale_quant, sf_vec_size=16, is_sf_swizzled_layout=False ) + assert x_fp4.dtype == paddle.uint8, f"x_fp4 must be packed as uint8, got {x_fp4.dtype}" x_fp4_scale = x_fp4_scale.view(paddle.float32) # float8_e4m3fn -> float32 dispatch_input = x_fp4 dispatch_scale = x_fp4_scale From 3d64926846dfdc3082e5125e273580350f9bc3a1 Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Mon, 18 May 2026 19:19:59 +0800 Subject: [PATCH 18/28] update --- custom_ops/gpu_ops/helper.h | 168 ++++++++++++++++++------------------ 1 file changed, 84 insertions(+), 84 deletions(-) diff --git a/custom_ops/gpu_ops/helper.h b/custom_ops/gpu_ops/helper.h index 47c9002a5da..a97efdd86a4 100644 --- a/custom_ops/gpu_ops/helper.h +++ b/custom_ops/gpu_ops/helper.h @@ -95,7 +95,7 @@ using json = nlohmann::json; #ifdef PADDLE_WITH_HIP template -inline hipError_t GetNumBlocks(int64_t n, int* num_blocks) { +inline hipError_t GetNumBlocks(int64_t n, int *num_blocks) { int dev; { hipError_t err = hipGetDevice(&dev); @@ -127,7 +127,7 @@ inline hipError_t GetNumBlocks(int64_t n, int* num_blocks) { } #else template -inline cudaError_t GetNumBlocks(int64_t n, int* num_blocks) { +inline cudaError_t GetNumBlocks(int64_t n, int *num_blocks) { int dev; { cudaError_t err = cudaGetDevice(&dev); @@ -275,41 +275,41 @@ template struct alignas(sizeof(T) * Size) AlignedVector { T val[Size]; - HOSTDEVICE inline const T& operator[](int i) const { return val[i]; } - HOSTDEVICE inline T& operator[](int i) { return val[i]; } + HOSTDEVICE inline const T &operator[](int i) const { return val[i]; } + HOSTDEVICE inline T &operator[](int i) { return val[i]; } }; template -HOSTDEVICE inline void Load(const T* addr, AlignedVector* vec) { - const AlignedVector* addr_vec = - reinterpret_cast*>(addr); +HOSTDEVICE inline void Load(const T *addr, AlignedVector *vec) { + const AlignedVector *addr_vec = + reinterpret_cast *>(addr); *vec = *addr_vec; } template -HOSTDEVICE inline void Store(const AlignedVector& vec, T* addr) { - AlignedVector* addr_vec = - reinterpret_cast*>(addr); +HOSTDEVICE inline void Store(const AlignedVector &vec, T *addr) { + AlignedVector *addr_vec = + reinterpret_cast *>(addr); *addr_vec = vec; } #ifdef PADDLE_WITH_HIP template -HOSTDEVICE inline void Store(const AlignedVector& vec, - int8_t* addr) { +HOSTDEVICE inline void Store(const AlignedVector &vec, + int8_t *addr) { printf("Error: Store hip_bfloat16 to int8_t is not supported!"); } #else template -HOSTDEVICE inline void Store(const AlignedVector<__nv_bfloat16, Size>& vec, - int8_t* addr) { +HOSTDEVICE inline void Store(const AlignedVector<__nv_bfloat16, Size> &vec, + int8_t *addr) { printf("Error: Store __nv_bfloat16 to int8_t is not supported!"); } #endif template -HOSTDEVICE inline void Store(const AlignedVector& vec, - int8_t* addr) { +HOSTDEVICE inline void Store(const AlignedVector &vec, + int8_t *addr) { printf("Error: Store half to int8_t is not supported!"); } @@ -322,7 +322,7 @@ __device__ T max_func(const T a, const T b) { template struct MaxOp { - __device__ __forceinline__ T operator()(const T& a, const T& b) const { + __device__ __forceinline__ T operator()(const T &a, const T &b) const { return max_func(a, b); } }; @@ -330,7 +330,7 @@ struct MaxOp { template <> struct MaxOp { // This is slightly faster - __device__ __forceinline__ float operator()(float const& x, float const& y) { + __device__ __forceinline__ float operator()(float const &x, float const &y) { return max(x, y); } }; @@ -350,7 +350,7 @@ inline int GetBlockSize(int vocab_size) { } #ifndef PADDLE_WITH_COREX -inline json readJsonFromFile(const std::string& filePath) { +inline json readJsonFromFile(const std::string &filePath) { std::ifstream file(filePath); if (!file.is_open()) { throw std::runtime_error("Unable to open file: " + filePath); @@ -376,10 +376,10 @@ inline json readJsonFromFile(const std::string& filePath) { // paddle::GPUPlace() #ifdef PADDLE_DEV -inline paddle::Tensor GetEmptyTensor(const common::DDim& dims, - const paddle::DataType& dtype, - const paddle::Place& place) { - phi::Allocator* allocator = nullptr; +inline paddle::Tensor GetEmptyTensor(const common::DDim &dims, + const paddle::DataType &dtype, + const paddle::Place &place) { + phi::Allocator *allocator = nullptr; // TODO (yuzhe): remove !defined(PADDLE_WITH_COREX) when // https://github.com/PaddlePaddle/Paddle/pull/78813 merged #if defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_COREX) @@ -400,11 +400,11 @@ inline paddle::Tensor GetEmptyTensor(const common::DDim& dims, return paddle::Tensor(std::make_shared(dense_tensor)); } -inline paddle::Tensor GetEmptyTensor(const common::DDim& dims, - const common::DDim& strides, - const paddle::DataType& dtype, - const paddle::Place& place) { - phi::Allocator* allocator = nullptr; +inline paddle::Tensor GetEmptyTensor(const common::DDim &dims, + const common::DDim &strides, + const paddle::DataType &dtype, + const paddle::Place &place) { + phi::Allocator *allocator = nullptr; // TODO (yuzhe): remove !defined(PADDLE_WITH_COREX) when // https://github.com/PaddlePaddle/Paddle/pull/78813 merged #if defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_COREX) @@ -427,67 +427,67 @@ inline paddle::Tensor GetEmptyTensor(const common::DDim& dims, } #endif -__global__ void free_and_dispatch_block(bool* stop_flags, - int* seq_lens_this_time, - int* seq_lens_decoder, - int* block_tables, - int* encoder_block_lens, - bool* is_block_step, - int* step_block_list, // [bsz] - int* step_len, - int* recover_block_list, - int* recover_len, - int* need_block_list, - int* need_block_len, - int* used_list_len, - int* free_list, - int* free_list_len, - int64_t* first_token_ids, +__global__ void free_and_dispatch_block(bool *stop_flags, + int *seq_lens_this_time, + int *seq_lens_decoder, + int *block_tables, + int *encoder_block_lens, + bool *is_block_step, + int *step_block_list, // [bsz] + int *step_len, + int *recover_block_list, + int *recover_len, + int *need_block_list, + int *need_block_len, + int *used_list_len, + int *free_list, + int *free_list_len, + int64_t *first_token_ids, const int bsz, const int block_size, const int block_num_per_seq, const int max_decoder_block_num); __global__ void speculate_free_and_dispatch_block( - bool* stop_flags, - int* seq_lens_this_time, - int* seq_lens_decoder, - int* block_tables, - int* encoder_block_lens, - bool* is_block_step, - int* step_block_list, // [bsz] - int* step_len, - int* recover_block_list, - int* recover_len, - int* need_block_list, - int* need_block_len, - int* used_list_len, - int* free_list, - int* free_list_len, - int64_t* first_token_ids, - int* accept_num, + bool *stop_flags, + int *seq_lens_this_time, + int *seq_lens_decoder, + int *block_tables, + int *encoder_block_lens, + bool *is_block_step, + int *step_block_list, // [bsz] + int *step_len, + int *recover_block_list, + int *recover_len, + int *need_block_list, + int *need_block_len, + int *used_list_len, + int *free_list, + int *free_list_len, + int64_t *first_token_ids, + int *accept_num, const int bsz, const int block_size, const int block_num_per_seq, const int max_decoder_block_num, const int max_draft_tokens); -__device__ bool speculate_free_and_dispatch_block(const int& qid, - int* need_block_list, - const int& need_block_len); +__device__ bool speculate_free_and_dispatch_block(const int &qid, + int *need_block_list, + const int &need_block_len); static std::string global_base64_chars = // NOLINT "Tokp9lA/BjimRVKx32edMPFftOzsbNQ8C15Xn+YUEGc4WD0uLIq7hyJ6vZaHSwrg"; // Base64 编码函数 -inline std::string base64_encode(const std::string& input) { +inline std::string base64_encode(const std::string &input) { std::string ret; int i = 0; int j = 0; unsigned char char_array_3[3]; unsigned char char_array_4[4]; - for (const auto& c : input) { + for (const auto &c : input) { char_array_3[i++] = c; if (i == 3) { char_array_4[0] = (char_array_3[0] & 0xfc) >> 2; @@ -529,7 +529,7 @@ inline std::string base64_encode(const std::string& input) { } // Base64 解码函数 -inline std::string base64_decode(const std::string& encoded_string) { +inline std::string base64_decode(const std::string &encoded_string) { int in_len = encoded_string.size(); int i = 0; int j = 0; @@ -584,9 +584,9 @@ inline std::string base64_decode(const std::string& encoded_string) { #ifndef PADDLE_WITH_COREX template -inline T get_relative_best(nlohmann::json* json_data, - const std::string& target_key, - const T& default_value) { +inline T get_relative_best(nlohmann::json *json_data, + const std::string &target_key, + const T &default_value) { if (json_data->contains(target_key)) { return json_data->at(target_key); } else { @@ -598,7 +598,7 @@ inline T get_relative_best(nlohmann::json* json_data, #endif __device__ inline bool is_in_end(const int64_t id, - const int64_t* end_ids, + const int64_t *end_ids, int length) { bool flag = false; for (int i = 0; i < length; i++) { @@ -622,7 +622,7 @@ __device__ __inline__ T ClipFunc(const T v, const T min, const T max) { } template -static void PrintMatrix3(const T* mat_d, int num, std::string name) { +static void PrintMatrix3(const T *mat_d, int num, std::string name) { std::vector tmp(num); #ifdef PADDLE_WITH_HIP hipMemcpy(tmp.data(), mat_d, sizeof(T) * num, hipMemcpyDeviceToHost); @@ -647,7 +647,7 @@ static void PrintMatrix3(const T* mat_d, int num, std::string name) { #ifndef PADDLE_WITH_HIP #ifndef PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU -__forceinline__ __device__ uint32_t ld_flag_acquire(uint32_t* flag_addr, +__forceinline__ __device__ uint32_t ld_flag_acquire(uint32_t *flag_addr, int mode = 0) { uint32_t flag; if (mode == 0) { @@ -666,7 +666,7 @@ __forceinline__ __device__ uint32_t ld_flag_acquire(uint32_t* flag_addr, return flag; } -__forceinline__ __device__ void st_flag_release(uint32_t* flag_addr, +__forceinline__ __device__ void st_flag_release(uint32_t *flag_addr, uint32_t flag, int mode = 0) { if (mode == 0) { @@ -708,12 +708,12 @@ inline bool GetMlaUseTensorcore() { return mla_use_tensorcore; } -inline const char* getEnvVar(const char* varName) { +inline const char *getEnvVar(const char *varName) { return std::getenv(varName); } inline bool checkAttentionBackend() { - const char* backend = getEnvVar("FD_ATTENTION_BACKEND"); + const char *backend = getEnvVar("FD_ATTENTION_BACKEND"); if (backend && (std::strcmp(backend, "MLA_ATTN") == 0 || std::strcmp(backend, "DSA_ATTN") == 0)) { return true; @@ -725,17 +725,17 @@ inline bool checkAttentionBackend() { #define GPU_MEMORY_CHECKER_H class GPUMemoryChecker { public: - static GPUMemoryChecker* getInstance() { + static GPUMemoryChecker *getInstance() { static GPUMemoryChecker instance; return &instance; } - void addCheckPoint(const char* call_file, int call_line); + void addCheckPoint(const char *call_file, int call_line); unsigned int getGPUCount() const { return deviceCount_; } void getCUDAVisibleDevice(); - GPUMemoryChecker(const GPUMemoryChecker&) = delete; - void operator=(const GPUMemoryChecker&) = delete; + GPUMemoryChecker(const GPUMemoryChecker &) = delete; + void operator=(const GPUMemoryChecker &) = delete; private: GPUMemoryChecker(); @@ -771,8 +771,8 @@ __device__ __forceinline__ float blockReduceMax(float value) { return value; } -inline bool getBoolEnv(char const* name) { - char const* env = std::getenv(name); +inline bool getBoolEnv(char const *name) { + char const *env = std::getenv(name); return env && env[0] == '1' && env[1] == '\0'; } @@ -795,7 +795,7 @@ inline void launchWithPdlWhenEnabled(KernelFn kernelFn, dim3 block, size_t dynamicShmSize, cudaStream_t stream, - Args&&... args) { + Args &&...args) { #ifdef PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU (*kernelFn)<<>>( std::forward(args)...); @@ -815,4 +815,4 @@ inline void launchWithPdlWhenEnabled(KernelFn kernelFn, cudaLaunchKernelEx(&kernelConfig, kernelFn, std::forward(args)...); #endif } -#endif +#endif \ No newline at end of file From d2545c165b02ba3cc1417979eb0b11955a2a91c8 Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Mon, 18 May 2026 19:34:34 +0800 Subject: [PATCH 19/28] add document --- docs/quantization/nvfp4.md | 14 +++++++++++++- docs/zh/quantization/nvfp4.md | 8 +++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/docs/quantization/nvfp4.md b/docs/quantization/nvfp4.md index 954517b45d0..f7bb6650a2a 100644 --- a/docs/quantization/nvfp4.md +++ b/docs/quantization/nvfp4.md @@ -64,6 +64,18 @@ python -m fastdeploy.entrypoints.openai.api_server \ flashinfer-cutedsl backend: ```bash +export FD_MOE_BACKEND="flashinfer-cutedsl" +export FD_USE_PFCC_DEEP_EP=1 +export CUDA_VISIBLE_DEVICES=4,5,6,7 + +# Enable block-wise CUDA graph +export FD_USE_BLOCK_WISE_CUDA_GRAPH=1 +# Customize pre-captured token counts (optional) +export FD_BLOCK_WISE_CUDA_GRAPH_SIZES="1,2,4,8,16,32,64,128,256,512,1024,2048,4096" + +# Enable FP4 communication quantization +export FD_DISPATCH_USE_FP4=1 + python -m fastdeploy.entrypoints.openai.multi_api_server \ --ports "9811,9812,9813,9814" \ --num-servers 4 \ @@ -79,7 +91,7 @@ python -m fastdeploy.entrypoints.openai.multi_api_server \ --gpu-memory-utilization 0.9 \ --max-num-batched-tokens 512 \ --ep-prefill-use-worst-num-tokens \ - --graph-optimization-config '{"use_cudagraph":false}' + --quantization '{"quantization": "mix_quant", "dense_quant_type":"block_wise_fp8", "is_moe_quantized":true,"moe_quant_type":"modelopt_fp4"}' ``` ### API Access diff --git a/docs/zh/quantization/nvfp4.md b/docs/zh/quantization/nvfp4.md index f9cd6451719..e6bd0404617 100644 --- a/docs/zh/quantization/nvfp4.md +++ b/docs/zh/quantization/nvfp4.md @@ -68,7 +68,13 @@ export FD_MOE_BACKEND="flashinfer-cutedsl" export FD_USE_PFCC_DEEP_EP=1 export CUDA_VISIBLE_DEVICES=4,5,6,7 +开启block_wise_cuda_graph +export FD_USE_BLOCK_WISE_CUDA_GRAPH=1 +自定义预捕获的token数(可选) +export FD_BLOCK_WISE_CUDA_GRAPH_SIZES="1,2,4,8,16,32,64,128,256,512,1024,2048,4096" +使用fp4通信量化 +export FD_DISPATCH_USE_FP4=1 python -m fastdeploy.entrypoints.openai.multi_api_server \ --ports "9811,9812,9813,9814" \ @@ -85,7 +91,7 @@ python -m fastdeploy.entrypoints.openai.multi_api_server \ --gpu-memory-utilization 0.9 \ --max-num-batched-tokens 512 \ --ep-prefill-use-worst-num-tokens \ - --graph-optimization-config '{"use_cudagraph":false}' + --quantization '{"quantization": "mix_quant", "dense_quant_type":"block_wise_fp8", "is_moe_quantized":true,"moe_quant_type":"modelopt_fp4"}' \ ``` ### 接口访问 From 7a58d12ce8e50f6495fd4829f4c4b00343df9a1e Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Tue, 19 May 2026 11:46:03 +0800 Subject: [PATCH 20/28] fix --- custom_ops/gpu_ops/helper.h | 168 ++++++++++++++++++------------------ 1 file changed, 84 insertions(+), 84 deletions(-) diff --git a/custom_ops/gpu_ops/helper.h b/custom_ops/gpu_ops/helper.h index a97efdd86a4..47c9002a5da 100644 --- a/custom_ops/gpu_ops/helper.h +++ b/custom_ops/gpu_ops/helper.h @@ -95,7 +95,7 @@ using json = nlohmann::json; #ifdef PADDLE_WITH_HIP template -inline hipError_t GetNumBlocks(int64_t n, int *num_blocks) { +inline hipError_t GetNumBlocks(int64_t n, int* num_blocks) { int dev; { hipError_t err = hipGetDevice(&dev); @@ -127,7 +127,7 @@ inline hipError_t GetNumBlocks(int64_t n, int *num_blocks) { } #else template -inline cudaError_t GetNumBlocks(int64_t n, int *num_blocks) { +inline cudaError_t GetNumBlocks(int64_t n, int* num_blocks) { int dev; { cudaError_t err = cudaGetDevice(&dev); @@ -275,41 +275,41 @@ template struct alignas(sizeof(T) * Size) AlignedVector { T val[Size]; - HOSTDEVICE inline const T &operator[](int i) const { return val[i]; } - HOSTDEVICE inline T &operator[](int i) { return val[i]; } + HOSTDEVICE inline const T& operator[](int i) const { return val[i]; } + HOSTDEVICE inline T& operator[](int i) { return val[i]; } }; template -HOSTDEVICE inline void Load(const T *addr, AlignedVector *vec) { - const AlignedVector *addr_vec = - reinterpret_cast *>(addr); +HOSTDEVICE inline void Load(const T* addr, AlignedVector* vec) { + const AlignedVector* addr_vec = + reinterpret_cast*>(addr); *vec = *addr_vec; } template -HOSTDEVICE inline void Store(const AlignedVector &vec, T *addr) { - AlignedVector *addr_vec = - reinterpret_cast *>(addr); +HOSTDEVICE inline void Store(const AlignedVector& vec, T* addr) { + AlignedVector* addr_vec = + reinterpret_cast*>(addr); *addr_vec = vec; } #ifdef PADDLE_WITH_HIP template -HOSTDEVICE inline void Store(const AlignedVector &vec, - int8_t *addr) { +HOSTDEVICE inline void Store(const AlignedVector& vec, + int8_t* addr) { printf("Error: Store hip_bfloat16 to int8_t is not supported!"); } #else template -HOSTDEVICE inline void Store(const AlignedVector<__nv_bfloat16, Size> &vec, - int8_t *addr) { +HOSTDEVICE inline void Store(const AlignedVector<__nv_bfloat16, Size>& vec, + int8_t* addr) { printf("Error: Store __nv_bfloat16 to int8_t is not supported!"); } #endif template -HOSTDEVICE inline void Store(const AlignedVector &vec, - int8_t *addr) { +HOSTDEVICE inline void Store(const AlignedVector& vec, + int8_t* addr) { printf("Error: Store half to int8_t is not supported!"); } @@ -322,7 +322,7 @@ __device__ T max_func(const T a, const T b) { template struct MaxOp { - __device__ __forceinline__ T operator()(const T &a, const T &b) const { + __device__ __forceinline__ T operator()(const T& a, const T& b) const { return max_func(a, b); } }; @@ -330,7 +330,7 @@ struct MaxOp { template <> struct MaxOp { // This is slightly faster - __device__ __forceinline__ float operator()(float const &x, float const &y) { + __device__ __forceinline__ float operator()(float const& x, float const& y) { return max(x, y); } }; @@ -350,7 +350,7 @@ inline int GetBlockSize(int vocab_size) { } #ifndef PADDLE_WITH_COREX -inline json readJsonFromFile(const std::string &filePath) { +inline json readJsonFromFile(const std::string& filePath) { std::ifstream file(filePath); if (!file.is_open()) { throw std::runtime_error("Unable to open file: " + filePath); @@ -376,10 +376,10 @@ inline json readJsonFromFile(const std::string &filePath) { // paddle::GPUPlace() #ifdef PADDLE_DEV -inline paddle::Tensor GetEmptyTensor(const common::DDim &dims, - const paddle::DataType &dtype, - const paddle::Place &place) { - phi::Allocator *allocator = nullptr; +inline paddle::Tensor GetEmptyTensor(const common::DDim& dims, + const paddle::DataType& dtype, + const paddle::Place& place) { + phi::Allocator* allocator = nullptr; // TODO (yuzhe): remove !defined(PADDLE_WITH_COREX) when // https://github.com/PaddlePaddle/Paddle/pull/78813 merged #if defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_COREX) @@ -400,11 +400,11 @@ inline paddle::Tensor GetEmptyTensor(const common::DDim &dims, return paddle::Tensor(std::make_shared(dense_tensor)); } -inline paddle::Tensor GetEmptyTensor(const common::DDim &dims, - const common::DDim &strides, - const paddle::DataType &dtype, - const paddle::Place &place) { - phi::Allocator *allocator = nullptr; +inline paddle::Tensor GetEmptyTensor(const common::DDim& dims, + const common::DDim& strides, + const paddle::DataType& dtype, + const paddle::Place& place) { + phi::Allocator* allocator = nullptr; // TODO (yuzhe): remove !defined(PADDLE_WITH_COREX) when // https://github.com/PaddlePaddle/Paddle/pull/78813 merged #if defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_COREX) @@ -427,67 +427,67 @@ inline paddle::Tensor GetEmptyTensor(const common::DDim &dims, } #endif -__global__ void free_and_dispatch_block(bool *stop_flags, - int *seq_lens_this_time, - int *seq_lens_decoder, - int *block_tables, - int *encoder_block_lens, - bool *is_block_step, - int *step_block_list, // [bsz] - int *step_len, - int *recover_block_list, - int *recover_len, - int *need_block_list, - int *need_block_len, - int *used_list_len, - int *free_list, - int *free_list_len, - int64_t *first_token_ids, +__global__ void free_and_dispatch_block(bool* stop_flags, + int* seq_lens_this_time, + int* seq_lens_decoder, + int* block_tables, + int* encoder_block_lens, + bool* is_block_step, + int* step_block_list, // [bsz] + int* step_len, + int* recover_block_list, + int* recover_len, + int* need_block_list, + int* need_block_len, + int* used_list_len, + int* free_list, + int* free_list_len, + int64_t* first_token_ids, const int bsz, const int block_size, const int block_num_per_seq, const int max_decoder_block_num); __global__ void speculate_free_and_dispatch_block( - bool *stop_flags, - int *seq_lens_this_time, - int *seq_lens_decoder, - int *block_tables, - int *encoder_block_lens, - bool *is_block_step, - int *step_block_list, // [bsz] - int *step_len, - int *recover_block_list, - int *recover_len, - int *need_block_list, - int *need_block_len, - int *used_list_len, - int *free_list, - int *free_list_len, - int64_t *first_token_ids, - int *accept_num, + bool* stop_flags, + int* seq_lens_this_time, + int* seq_lens_decoder, + int* block_tables, + int* encoder_block_lens, + bool* is_block_step, + int* step_block_list, // [bsz] + int* step_len, + int* recover_block_list, + int* recover_len, + int* need_block_list, + int* need_block_len, + int* used_list_len, + int* free_list, + int* free_list_len, + int64_t* first_token_ids, + int* accept_num, const int bsz, const int block_size, const int block_num_per_seq, const int max_decoder_block_num, const int max_draft_tokens); -__device__ bool speculate_free_and_dispatch_block(const int &qid, - int *need_block_list, - const int &need_block_len); +__device__ bool speculate_free_and_dispatch_block(const int& qid, + int* need_block_list, + const int& need_block_len); static std::string global_base64_chars = // NOLINT "Tokp9lA/BjimRVKx32edMPFftOzsbNQ8C15Xn+YUEGc4WD0uLIq7hyJ6vZaHSwrg"; // Base64 编码函数 -inline std::string base64_encode(const std::string &input) { +inline std::string base64_encode(const std::string& input) { std::string ret; int i = 0; int j = 0; unsigned char char_array_3[3]; unsigned char char_array_4[4]; - for (const auto &c : input) { + for (const auto& c : input) { char_array_3[i++] = c; if (i == 3) { char_array_4[0] = (char_array_3[0] & 0xfc) >> 2; @@ -529,7 +529,7 @@ inline std::string base64_encode(const std::string &input) { } // Base64 解码函数 -inline std::string base64_decode(const std::string &encoded_string) { +inline std::string base64_decode(const std::string& encoded_string) { int in_len = encoded_string.size(); int i = 0; int j = 0; @@ -584,9 +584,9 @@ inline std::string base64_decode(const std::string &encoded_string) { #ifndef PADDLE_WITH_COREX template -inline T get_relative_best(nlohmann::json *json_data, - const std::string &target_key, - const T &default_value) { +inline T get_relative_best(nlohmann::json* json_data, + const std::string& target_key, + const T& default_value) { if (json_data->contains(target_key)) { return json_data->at(target_key); } else { @@ -598,7 +598,7 @@ inline T get_relative_best(nlohmann::json *json_data, #endif __device__ inline bool is_in_end(const int64_t id, - const int64_t *end_ids, + const int64_t* end_ids, int length) { bool flag = false; for (int i = 0; i < length; i++) { @@ -622,7 +622,7 @@ __device__ __inline__ T ClipFunc(const T v, const T min, const T max) { } template -static void PrintMatrix3(const T *mat_d, int num, std::string name) { +static void PrintMatrix3(const T* mat_d, int num, std::string name) { std::vector tmp(num); #ifdef PADDLE_WITH_HIP hipMemcpy(tmp.data(), mat_d, sizeof(T) * num, hipMemcpyDeviceToHost); @@ -647,7 +647,7 @@ static void PrintMatrix3(const T *mat_d, int num, std::string name) { #ifndef PADDLE_WITH_HIP #ifndef PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU -__forceinline__ __device__ uint32_t ld_flag_acquire(uint32_t *flag_addr, +__forceinline__ __device__ uint32_t ld_flag_acquire(uint32_t* flag_addr, int mode = 0) { uint32_t flag; if (mode == 0) { @@ -666,7 +666,7 @@ __forceinline__ __device__ uint32_t ld_flag_acquire(uint32_t *flag_addr, return flag; } -__forceinline__ __device__ void st_flag_release(uint32_t *flag_addr, +__forceinline__ __device__ void st_flag_release(uint32_t* flag_addr, uint32_t flag, int mode = 0) { if (mode == 0) { @@ -708,12 +708,12 @@ inline bool GetMlaUseTensorcore() { return mla_use_tensorcore; } -inline const char *getEnvVar(const char *varName) { +inline const char* getEnvVar(const char* varName) { return std::getenv(varName); } inline bool checkAttentionBackend() { - const char *backend = getEnvVar("FD_ATTENTION_BACKEND"); + const char* backend = getEnvVar("FD_ATTENTION_BACKEND"); if (backend && (std::strcmp(backend, "MLA_ATTN") == 0 || std::strcmp(backend, "DSA_ATTN") == 0)) { return true; @@ -725,17 +725,17 @@ inline bool checkAttentionBackend() { #define GPU_MEMORY_CHECKER_H class GPUMemoryChecker { public: - static GPUMemoryChecker *getInstance() { + static GPUMemoryChecker* getInstance() { static GPUMemoryChecker instance; return &instance; } - void addCheckPoint(const char *call_file, int call_line); + void addCheckPoint(const char* call_file, int call_line); unsigned int getGPUCount() const { return deviceCount_; } void getCUDAVisibleDevice(); - GPUMemoryChecker(const GPUMemoryChecker &) = delete; - void operator=(const GPUMemoryChecker &) = delete; + GPUMemoryChecker(const GPUMemoryChecker&) = delete; + void operator=(const GPUMemoryChecker&) = delete; private: GPUMemoryChecker(); @@ -771,8 +771,8 @@ __device__ __forceinline__ float blockReduceMax(float value) { return value; } -inline bool getBoolEnv(char const *name) { - char const *env = std::getenv(name); +inline bool getBoolEnv(char const* name) { + char const* env = std::getenv(name); return env && env[0] == '1' && env[1] == '\0'; } @@ -795,7 +795,7 @@ inline void launchWithPdlWhenEnabled(KernelFn kernelFn, dim3 block, size_t dynamicShmSize, cudaStream_t stream, - Args &&...args) { + Args&&... args) { #ifdef PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU (*kernelFn)<<>>( std::forward(args)...); @@ -815,4 +815,4 @@ inline void launchWithPdlWhenEnabled(KernelFn kernelFn, cudaLaunchKernelEx(&kernelConfig, kernelFn, std::forward(args)...); #endif } -#endif \ No newline at end of file +#endif From aad041d7445e3ad2c3256d09c28c67937ff8ba5c Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Tue, 19 May 2026 11:50:22 +0800 Subject: [PATCH 21/28] revert helper.h to develop --- custom_ops/gpu_ops/helper.h | 166 ++++++++++++++++++------------------ 1 file changed, 83 insertions(+), 83 deletions(-) diff --git a/custom_ops/gpu_ops/helper.h b/custom_ops/gpu_ops/helper.h index 47c9002a5da..7a0a75283c3 100644 --- a/custom_ops/gpu_ops/helper.h +++ b/custom_ops/gpu_ops/helper.h @@ -95,7 +95,7 @@ using json = nlohmann::json; #ifdef PADDLE_WITH_HIP template -inline hipError_t GetNumBlocks(int64_t n, int* num_blocks) { +inline hipError_t GetNumBlocks(int64_t n, int *num_blocks) { int dev; { hipError_t err = hipGetDevice(&dev); @@ -127,7 +127,7 @@ inline hipError_t GetNumBlocks(int64_t n, int* num_blocks) { } #else template -inline cudaError_t GetNumBlocks(int64_t n, int* num_blocks) { +inline cudaError_t GetNumBlocks(int64_t n, int *num_blocks) { int dev; { cudaError_t err = cudaGetDevice(&dev); @@ -275,41 +275,41 @@ template struct alignas(sizeof(T) * Size) AlignedVector { T val[Size]; - HOSTDEVICE inline const T& operator[](int i) const { return val[i]; } - HOSTDEVICE inline T& operator[](int i) { return val[i]; } + HOSTDEVICE inline const T &operator[](int i) const { return val[i]; } + HOSTDEVICE inline T &operator[](int i) { return val[i]; } }; template -HOSTDEVICE inline void Load(const T* addr, AlignedVector* vec) { - const AlignedVector* addr_vec = - reinterpret_cast*>(addr); +HOSTDEVICE inline void Load(const T *addr, AlignedVector *vec) { + const AlignedVector *addr_vec = + reinterpret_cast *>(addr); *vec = *addr_vec; } template -HOSTDEVICE inline void Store(const AlignedVector& vec, T* addr) { - AlignedVector* addr_vec = - reinterpret_cast*>(addr); +HOSTDEVICE inline void Store(const AlignedVector &vec, T *addr) { + AlignedVector *addr_vec = + reinterpret_cast *>(addr); *addr_vec = vec; } #ifdef PADDLE_WITH_HIP template -HOSTDEVICE inline void Store(const AlignedVector& vec, - int8_t* addr) { +HOSTDEVICE inline void Store(const AlignedVector &vec, + int8_t *addr) { printf("Error: Store hip_bfloat16 to int8_t is not supported!"); } #else template -HOSTDEVICE inline void Store(const AlignedVector<__nv_bfloat16, Size>& vec, - int8_t* addr) { +HOSTDEVICE inline void Store(const AlignedVector<__nv_bfloat16, Size> &vec, + int8_t *addr) { printf("Error: Store __nv_bfloat16 to int8_t is not supported!"); } #endif template -HOSTDEVICE inline void Store(const AlignedVector& vec, - int8_t* addr) { +HOSTDEVICE inline void Store(const AlignedVector &vec, + int8_t *addr) { printf("Error: Store half to int8_t is not supported!"); } @@ -322,7 +322,7 @@ __device__ T max_func(const T a, const T b) { template struct MaxOp { - __device__ __forceinline__ T operator()(const T& a, const T& b) const { + __device__ __forceinline__ T operator()(const T &a, const T &b) const { return max_func(a, b); } }; @@ -330,7 +330,7 @@ struct MaxOp { template <> struct MaxOp { // This is slightly faster - __device__ __forceinline__ float operator()(float const& x, float const& y) { + __device__ __forceinline__ float operator()(float const &x, float const &y) { return max(x, y); } }; @@ -350,7 +350,7 @@ inline int GetBlockSize(int vocab_size) { } #ifndef PADDLE_WITH_COREX -inline json readJsonFromFile(const std::string& filePath) { +inline json readJsonFromFile(const std::string &filePath) { std::ifstream file(filePath); if (!file.is_open()) { throw std::runtime_error("Unable to open file: " + filePath); @@ -376,10 +376,10 @@ inline json readJsonFromFile(const std::string& filePath) { // paddle::GPUPlace() #ifdef PADDLE_DEV -inline paddle::Tensor GetEmptyTensor(const common::DDim& dims, - const paddle::DataType& dtype, - const paddle::Place& place) { - phi::Allocator* allocator = nullptr; +inline paddle::Tensor GetEmptyTensor(const common::DDim &dims, + const paddle::DataType &dtype, + const paddle::Place &place) { + phi::Allocator *allocator = nullptr; // TODO (yuzhe): remove !defined(PADDLE_WITH_COREX) when // https://github.com/PaddlePaddle/Paddle/pull/78813 merged #if defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_COREX) @@ -400,11 +400,11 @@ inline paddle::Tensor GetEmptyTensor(const common::DDim& dims, return paddle::Tensor(std::make_shared(dense_tensor)); } -inline paddle::Tensor GetEmptyTensor(const common::DDim& dims, - const common::DDim& strides, - const paddle::DataType& dtype, - const paddle::Place& place) { - phi::Allocator* allocator = nullptr; +inline paddle::Tensor GetEmptyTensor(const common::DDim &dims, + const common::DDim &strides, + const paddle::DataType &dtype, + const paddle::Place &place) { + phi::Allocator *allocator = nullptr; // TODO (yuzhe): remove !defined(PADDLE_WITH_COREX) when // https://github.com/PaddlePaddle/Paddle/pull/78813 merged #if defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_COREX) @@ -427,67 +427,67 @@ inline paddle::Tensor GetEmptyTensor(const common::DDim& dims, } #endif -__global__ void free_and_dispatch_block(bool* stop_flags, - int* seq_lens_this_time, - int* seq_lens_decoder, - int* block_tables, - int* encoder_block_lens, - bool* is_block_step, - int* step_block_list, // [bsz] - int* step_len, - int* recover_block_list, - int* recover_len, - int* need_block_list, - int* need_block_len, - int* used_list_len, - int* free_list, - int* free_list_len, - int64_t* first_token_ids, +__global__ void free_and_dispatch_block(bool *stop_flags, + int *seq_lens_this_time, + int *seq_lens_decoder, + int *block_tables, + int *encoder_block_lens, + bool *is_block_step, + int *step_block_list, // [bsz] + int *step_len, + int *recover_block_list, + int *recover_len, + int *need_block_list, + int *need_block_len, + int *used_list_len, + int *free_list, + int *free_list_len, + int64_t *first_token_ids, const int bsz, const int block_size, const int block_num_per_seq, const int max_decoder_block_num); __global__ void speculate_free_and_dispatch_block( - bool* stop_flags, - int* seq_lens_this_time, - int* seq_lens_decoder, - int* block_tables, - int* encoder_block_lens, - bool* is_block_step, - int* step_block_list, // [bsz] - int* step_len, - int* recover_block_list, - int* recover_len, - int* need_block_list, - int* need_block_len, - int* used_list_len, - int* free_list, - int* free_list_len, - int64_t* first_token_ids, - int* accept_num, + bool *stop_flags, + int *seq_lens_this_time, + int *seq_lens_decoder, + int *block_tables, + int *encoder_block_lens, + bool *is_block_step, + int *step_block_list, // [bsz] + int *step_len, + int *recover_block_list, + int *recover_len, + int *need_block_list, + int *need_block_len, + int *used_list_len, + int *free_list, + int *free_list_len, + int64_t *first_token_ids, + int *accept_num, const int bsz, const int block_size, const int block_num_per_seq, const int max_decoder_block_num, const int max_draft_tokens); -__device__ bool speculate_free_and_dispatch_block(const int& qid, - int* need_block_list, - const int& need_block_len); +__device__ bool speculate_free_and_dispatch_block(const int &qid, + int *need_block_list, + const int &need_block_len); static std::string global_base64_chars = // NOLINT "Tokp9lA/BjimRVKx32edMPFftOzsbNQ8C15Xn+YUEGc4WD0uLIq7hyJ6vZaHSwrg"; // Base64 编码函数 -inline std::string base64_encode(const std::string& input) { +inline std::string base64_encode(const std::string &input) { std::string ret; int i = 0; int j = 0; unsigned char char_array_3[3]; unsigned char char_array_4[4]; - for (const auto& c : input) { + for (const auto &c : input) { char_array_3[i++] = c; if (i == 3) { char_array_4[0] = (char_array_3[0] & 0xfc) >> 2; @@ -529,7 +529,7 @@ inline std::string base64_encode(const std::string& input) { } // Base64 解码函数 -inline std::string base64_decode(const std::string& encoded_string) { +inline std::string base64_decode(const std::string &encoded_string) { int in_len = encoded_string.size(); int i = 0; int j = 0; @@ -584,9 +584,9 @@ inline std::string base64_decode(const std::string& encoded_string) { #ifndef PADDLE_WITH_COREX template -inline T get_relative_best(nlohmann::json* json_data, - const std::string& target_key, - const T& default_value) { +inline T get_relative_best(nlohmann::json *json_data, + const std::string &target_key, + const T &default_value) { if (json_data->contains(target_key)) { return json_data->at(target_key); } else { @@ -598,7 +598,7 @@ inline T get_relative_best(nlohmann::json* json_data, #endif __device__ inline bool is_in_end(const int64_t id, - const int64_t* end_ids, + const int64_t *end_ids, int length) { bool flag = false; for (int i = 0; i < length; i++) { @@ -622,7 +622,7 @@ __device__ __inline__ T ClipFunc(const T v, const T min, const T max) { } template -static void PrintMatrix3(const T* mat_d, int num, std::string name) { +static void PrintMatrix3(const T *mat_d, int num, std::string name) { std::vector tmp(num); #ifdef PADDLE_WITH_HIP hipMemcpy(tmp.data(), mat_d, sizeof(T) * num, hipMemcpyDeviceToHost); @@ -647,7 +647,7 @@ static void PrintMatrix3(const T* mat_d, int num, std::string name) { #ifndef PADDLE_WITH_HIP #ifndef PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU -__forceinline__ __device__ uint32_t ld_flag_acquire(uint32_t* flag_addr, +__forceinline__ __device__ uint32_t ld_flag_acquire(uint32_t *flag_addr, int mode = 0) { uint32_t flag; if (mode == 0) { @@ -666,7 +666,7 @@ __forceinline__ __device__ uint32_t ld_flag_acquire(uint32_t* flag_addr, return flag; } -__forceinline__ __device__ void st_flag_release(uint32_t* flag_addr, +__forceinline__ __device__ void st_flag_release(uint32_t *flag_addr, uint32_t flag, int mode = 0) { if (mode == 0) { @@ -708,12 +708,12 @@ inline bool GetMlaUseTensorcore() { return mla_use_tensorcore; } -inline const char* getEnvVar(const char* varName) { +inline const char *getEnvVar(const char *varName) { return std::getenv(varName); } inline bool checkAttentionBackend() { - const char* backend = getEnvVar("FD_ATTENTION_BACKEND"); + const char *backend = getEnvVar("FD_ATTENTION_BACKEND"); if (backend && (std::strcmp(backend, "MLA_ATTN") == 0 || std::strcmp(backend, "DSA_ATTN") == 0)) { return true; @@ -725,17 +725,17 @@ inline bool checkAttentionBackend() { #define GPU_MEMORY_CHECKER_H class GPUMemoryChecker { public: - static GPUMemoryChecker* getInstance() { + static GPUMemoryChecker *getInstance() { static GPUMemoryChecker instance; return &instance; } - void addCheckPoint(const char* call_file, int call_line); + void addCheckPoint(const char *call_file, int call_line); unsigned int getGPUCount() const { return deviceCount_; } void getCUDAVisibleDevice(); - GPUMemoryChecker(const GPUMemoryChecker&) = delete; - void operator=(const GPUMemoryChecker&) = delete; + GPUMemoryChecker(const GPUMemoryChecker &) = delete; + void operator=(const GPUMemoryChecker &) = delete; private: GPUMemoryChecker(); @@ -771,8 +771,8 @@ __device__ __forceinline__ float blockReduceMax(float value) { return value; } -inline bool getBoolEnv(char const* name) { - char const* env = std::getenv(name); +inline bool getBoolEnv(char const *name) { + char const *env = std::getenv(name); return env && env[0] == '1' && env[1] == '\0'; } @@ -795,7 +795,7 @@ inline void launchWithPdlWhenEnabled(KernelFn kernelFn, dim3 block, size_t dynamicShmSize, cudaStream_t stream, - Args&&... args) { + Args &&...args) { #ifdef PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU (*kernelFn)<<>>( std::forward(args)...); From a96f2a53e2774afe85324bdba2adeb6e8ba2ae53 Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Tue, 19 May 2026 17:09:03 +0800 Subject: [PATCH 22/28] make_scale_interleaved --- custom_ops/gpu_ops/cpp_extensions.cc | 4 +- .../moe/prefill_permute_to_masked_gemm.cu | 40 +++++++++++-------- .../layers/quantization/nvfp4.py | 10 +++-- 3 files changed, 32 insertions(+), 22 deletions(-) diff --git a/custom_ops/gpu_ops/cpp_extensions.cc b/custom_ops/gpu_ops/cpp_extensions.cc index 70322fa4e6a..eeb7a95552b 100644 --- a/custom_ops/gpu_ops/cpp_extensions.cc +++ b/custom_ops/gpu_ops/cpp_extensions.cc @@ -1217,7 +1217,7 @@ std::vector PrefillPermuteToMaskedGemm( const paddle::Tensor& topk_ids, const int num_local_experts, const int max_token_num, - const bool swizzle_scale); + const bool make_scale_interleaved); std::vector DepermutePrefillCombine( const paddle::Tensor& x, @@ -1939,7 +1939,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) { py::arg("topk_ids"), py::arg("num_local_experts"), py::arg("max_token_num"), - py::arg("swizzle_scale") = false, + py::arg("make_scale_interleaved") = false, "Prefill permute to masked GEMM for MoE"); m.def("depermute_prefill_combine", diff --git a/custom_ops/gpu_ops/moe/prefill_permute_to_masked_gemm.cu b/custom_ops/gpu_ops/moe/prefill_permute_to_masked_gemm.cu index 170286e0b0d..63f50eb62a6 100644 --- a/custom_ops/gpu_ops/moe/prefill_permute_to_masked_gemm.cu +++ b/custom_ops/gpu_ops/moe/prefill_permute_to_masked_gemm.cu @@ -142,7 +142,7 @@ std::vector PrefillPermuteToMaskedGemmDispatch( const paddle::Tensor& topk_ids, const int num_local_experts, const int max_token_num, - const bool swizzle_scale) { + const bool make_scale_interleaved) { typedef PDTraits traits_; typedef PDTraits scale_traits_; typedef typename traits_::DataType DataType_; @@ -162,13 +162,15 @@ std::vector PrefillPermuteToMaskedGemmDispatch( {num_local_experts, max_token_num, hidden}, x.dtype(), place); paddle::Tensor permute_scale; - if (swizzle_scale) { + if (make_scale_interleaved) { const int scale_bytes_per_token = hidden_scale * static_cast(sizeof(ScaleDataType_)); - PD_CHECK(max_token_num % 128 == 0, - "swizzle_scale requires max_token_num to be divisible by 128"); + PD_CHECK( + max_token_num % 128 == 0, + "make_scale_interleaved requires max_token_num to be divisible by 128"); PD_CHECK(scale_bytes_per_token % 4 == 0, - "swizzle_scale requires the unpacked FP8 scale dimension to be " + "make_scale_interleaved requires the unpacked FP8 scale dimension " + "to be " "divisible by 4"); permute_scale = GetEmptyTensor( {num_local_experts, max_token_num, hidden_scale}, ScaleD, place); @@ -228,7 +230,7 @@ std::vector PrefillPermuteToMaskedGemmDispatch( hidden_scale, \ max_token_num) - if (swizzle_scale) { + if (make_scale_interleaved) { LAUNCH_PREFILL_PERMUTE(true); } else { LAUNCH_PREFILL_PERMUTE(false); @@ -245,19 +247,25 @@ std::vector PrefillPermuteToMaskedGemm( const paddle::Tensor& topk_ids, const int num_local_experts, const int max_token_num, - const bool swizzle_scale) { - if (swizzle_scale) { - PD_CHECK(x.dtype() == paddle::DataType::UINT8 && - scale.dtype() == paddle::DataType::FLOAT32, - "swizzle_scale=true is only valid for UINT8 x + FLOAT32 scale " - "(FP4 comm quant path)"); + const bool make_scale_interleaved) { + if (make_scale_interleaved) { + PD_CHECK( + x.dtype() == paddle::DataType::UINT8 && + scale.dtype() == paddle::DataType::FLOAT32, + "make_scale_interleaved=true is only valid for UINT8 x + FLOAT32 scale " + "(FP4 comm quant path)"); } const int topk = topk_ids.shape()[1]; #define DISPATCH_TOPK(DTYPE, SCALE_DTYPE, TOPK_VAL) \ case TOPK_VAL: \ return PrefillPermuteToMaskedGemmDispatch( \ - x, scale, topk_ids, num_local_experts, max_token_num, swizzle_scale); + x, \ + scale, \ + topk_ids, \ + num_local_experts, \ + max_token_num, \ + make_scale_interleaved); switch (x.dtype()) { case paddle::DataType::FLOAT8_E4M3FN: { @@ -348,7 +356,7 @@ std::vector> PrefillPermuteToMaskedGemmInferShape( const std::vector& topk_ids_shape, const int num_local_experts, const int max_token_num, - const bool swizzle_scale) { + const bool make_scale_interleaved) { int64_t num_tokens = x_shape[0]; int64_t hidden = x_shape[1]; int64_t hidden_scale = scale_shape[1]; @@ -368,7 +376,7 @@ std::vector PrefillPermuteToMaskedGemmInferDtype( const paddle::DataType& topk_ids_dtype, const int num_local_experts, const int max_token_num, - const bool swizzle_scale) { + const bool make_scale_interleaved) { return { x_dtype, scale_dtype, paddle::DataType::INT32, paddle::DataType::INT32}; } @@ -381,7 +389,7 @@ PD_BUILD_STATIC_OP(prefill_permute_to_masked_gemm) "token_nums_per_expert"}) .Attrs({"num_local_experts: int", "max_token_num: int", - "swizzle_scale: bool"}) + "make_scale_interleaved: bool"}) .SetKernelFn(PD_KERNEL(PrefillPermuteToMaskedGemm)) .SetInferShapeFn(PD_INFER_SHAPE(PrefillPermuteToMaskedGemmInferShape)) .SetInferDtypeFn(PD_INFER_DTYPE(PrefillPermuteToMaskedGemmInferDtype)); diff --git a/fastdeploy/model_executor/layers/quantization/nvfp4.py b/fastdeploy/model_executor/layers/quantization/nvfp4.py index 12a2c8f19fd..627901ddf20 100644 --- a/fastdeploy/model_executor/layers/quantization/nvfp4.py +++ b/fastdeploy/model_executor/layers/quantization/nvfp4.py @@ -81,7 +81,7 @@ def call_prefill_permute_to_masked_gemm( topk_ids: paddle.Tensor, num_local_experts: int, max_token_num: int, - swizzle_scale: bool = False, + make_scale_interleaved: bool = False, ): """ Permute input tokens and scales from token-major to expert-major layout @@ -93,7 +93,7 @@ def call_prefill_permute_to_masked_gemm( topk_ids: Expert routing indices [num_tokens, topk] (int64 or int32). num_local_experts: Number of local experts on this device. max_token_num: Maximum tokens per expert buffer. - swizzle_scale: Whether to directly write scale in flashinfer swizzled layout. + make_scale_interleaved: Whether to directly write scale in flashinfer swizzled layout. Returns: tuple: (permute_x, permute_scale, permuted_indice_map, token_nums_per_expert) @@ -106,7 +106,9 @@ def call_prefill_permute_to_masked_gemm( if scale is None: scale = paddle.empty([0], dtype=paddle.float32) - results = prefill_permute_to_masked_gemm(x, scale, topk_ids, num_local_experts, max_token_num, swizzle_scale) + results = prefill_permute_to_masked_gemm( + x, scale, topk_ids, num_local_experts, max_token_num, make_scale_interleaved + ) return results[0], results[1], results[2], results[3] @@ -767,7 +769,7 @@ def apply_ep_prefill( topk_ids=recv_topk_idx, num_local_experts=layer.num_local_experts, max_token_num=layer.ep_size * max_tokens_per_rank, - swizzle_scale=recv_x_scale is not None, + make_scale_interleaved=recv_x_scale is not None, ) ) From e20676d3e5b8ce03f608033b6509ac855782af09 Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Tue, 19 May 2026 17:11:54 +0800 Subject: [PATCH 23/28] fix --- .../moe/prefill_permute_to_masked_gemm.cu | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/custom_ops/gpu_ops/moe/prefill_permute_to_masked_gemm.cu b/custom_ops/gpu_ops/moe/prefill_permute_to_masked_gemm.cu index 63f50eb62a6..c3e3dad14b8 100644 --- a/custom_ops/gpu_ops/moe/prefill_permute_to_masked_gemm.cu +++ b/custom_ops/gpu_ops/moe/prefill_permute_to_masked_gemm.cu @@ -22,7 +22,7 @@ template + bool MAKE_SCALE_INTERLEAVED> __global__ void PrefillPermuteToMaskedGemmKernel( T* __restrict__ permute_x, ScaleT* __restrict__ permute_scale, @@ -40,8 +40,8 @@ __global__ void PrefillPermuteToMaskedGemmKernel( const int tidx = threadIdx.x; const int x_num_vecs = hidden / VecSize; - // Pre-compute swizzle constants outside the slot loop (compile-time dead if - // !SWIZZLE_SCALE) + // Pre-compute interleaved scale constants outside the slot loop (compile-time + // dead if !MAKE_SCALE_INTERLEAVED) const int scale_bytes_per_token = hidden_scale * static_cast(sizeof(ScaleT)); const int m_tiles = max_tokens_per_expert / 128; @@ -84,10 +84,10 @@ __global__ void PrefillPermuteToMaskedGemmKernel( const ScaleT* src_scale = scale + static_cast(token_idx) * hidden_scale; - if constexpr (SWIZZLE_SCALE) { - // Directly write packed FP8 scale bytes into the swizzled layout used - // by flashinfer cutedsl: [E, M/128, K/4, 32, 4, 4]. The tensor is - // exposed to Paddle as packed float32 [E, M, K/4]. + if constexpr (MAKE_SCALE_INTERLEAVED) { + // Directly write packed FP8 scale bytes into the interleaved layout + // used by flashinfer cutedsl: [E, M/128, K/4, 32, 4, 4]. The tensor + // is exposed to Paddle as packed float32 [E, M, K/4]. const uint8_t* src_scale_bytes = reinterpret_cast(src_scale); uint8_t* dst_scale_bytes = reinterpret_cast(permute_scale); @@ -209,12 +209,12 @@ std::vector PrefillPermuteToMaskedGemmDispatch( cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev); int num_blocks = sm_count * 2; -#define LAUNCH_PREFILL_PERMUTE(SWIZZLE) \ +#define LAUNCH_PREFILL_PERMUTE(INTERLEAVED) \ PrefillPermuteToMaskedGemmKernel \ + INTERLEAVED> \ <<>>( \ reinterpret_cast(permute_x.data()), \ reinterpret_cast( \ From 528487908ae228b63861f88e09fc98dd253efaa9 Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Wed, 20 May 2026 11:50:40 +0800 Subject: [PATCH 24/28] fix --- fastdeploy/model_executor/forward_meta.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/fastdeploy/model_executor/forward_meta.py b/fastdeploy/model_executor/forward_meta.py index f766b9fcf7d..516344a17f4 100644 --- a/fastdeploy/model_executor/forward_meta.py +++ b/fastdeploy/model_executor/forward_meta.py @@ -162,8 +162,6 @@ class ForwardMeta: # for prefill exist_prefill: bool = False - audio_token_num: int = 0 - # for mla & dsa position_ids: Optional[paddle.Tensor] = None # for kvcache slot From c22d3ea800e6b3ba8bc20b494c2d3e447154e295 Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Wed, 20 May 2026 15:44:14 +0800 Subject: [PATCH 25/28] fix --- fastdeploy/worker/gpu_model_runner.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index a2e42f6cb80..992fe556603 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -3056,13 +3056,6 @@ def clear_parameters(self, pid): ): self.proposer.model.clear_graph_opt_backend() - # Clear block-wise CUDA graphs - if envs.FD_USE_BLOCK_WISE_CUDA_GRAPH: - from fastdeploy.model_executor.graph_optimization.cuda_graph_op import ( - clear_all_block_wise_graphs, - ) - - clear_all_block_wise_graphs() # Clear parameters and Send single self.dynamic_weight_manager.clear_parameters( pid, self.fd_config.parallel_config.shutdown_comm_group_if_worker_idle From 7eb643b5a98cd811959d6e187d81b59d3ba1bb9c Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Wed, 20 May 2026 15:46:48 +0800 Subject: [PATCH 26/28] fix --- fastdeploy/worker/gpu_model_runner.py | 1 - 1 file changed, 1 deletion(-) diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 992fe556603..d4a917bdb1e 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -2908,7 +2908,6 @@ def profile_run(self) -> None: logger.info( f"Dummy run with {num_tokens} tokens, mm_max_tokens_per_item: {self.model_config.mm_max_tokens_per_item}" ) - self._dummy_run( num_tokens=num_tokens, batch_size=self.scheduler_config.max_num_seqs, From 965400490220b98faaec6028e2e171cc30f6c6ac Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Wed, 20 May 2026 15:47:31 +0800 Subject: [PATCH 27/28] fix --- fastdeploy/worker/gpu_model_runner.py | 1 - 1 file changed, 1 deletion(-) diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index d4a917bdb1e..3e06c13927f 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -3054,7 +3054,6 @@ def clear_parameters(self, pid): and self.graph_opt_config.draft_model_use_cudagraph ): self.proposer.model.clear_graph_opt_backend() - # Clear parameters and Send single self.dynamic_weight_manager.clear_parameters( pid, self.fd_config.parallel_config.shutdown_comm_group_if_worker_idle From 47bee0aa1e73cd9cbd9bd5d82cfd9ca5821dc7f6 Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Wed, 20 May 2026 19:08:45 +0800 Subject: [PATCH 28/28] update skills --- .../scripts/extract_metrics.py | 245 ++++++++++++------ .../scripts/generate_report.py | 75 ++++-- .../scripts/launch_service.sh | 79 +++++- 3 files changed, 295 insertions(+), 104 deletions(-) diff --git a/.claude/skills/benchmark-compare/scripts/extract_metrics.py b/.claude/skills/benchmark-compare/scripts/extract_metrics.py index 6423c139017..b3eabcc2b79 100644 --- a/.claude/skills/benchmark-compare/scripts/extract_metrics.py +++ b/.claude/skills/benchmark-compare/scripts/extract_metrics.py @@ -1,13 +1,18 @@ #!/usr/bin/env python3 """extract_metrics.py — 从 benchmark 结果文件提取指标,输出结构化 JSON +支持框架: fd (FastDeploy) / sg (SGLang) / vllm (vLLM) +任意框架结果均可缺省,缺省的不参与对比。 + 用法: python3 extract_metrics.py \ --fd-result \ --sg-result \ + --vllm-result \ --model-path \ --fd-config '{"gpu":"H800","tp":1,"concurrency":32}' \ --sg-config '{"gpu":"H800","tp":1,"concurrency":32}' \ + --vllm-config '{"gpu":"H800","tp":1,"concurrency":32}' \ --output """ @@ -18,12 +23,16 @@ import subprocess import sys +# 支持的框架列表 +FRAMEWORKS = ("fd", "sg", "vllm") + def parse_benchmark_result(filepath): """解析 benchmark_serving.py 的输出文件,提取所有指标""" metrics = {} - if not os.path.isfile(filepath): - print(f"[WARN] 结果文件不存在: {filepath}", file=sys.stderr) + if not filepath or not os.path.isfile(filepath): + if filepath: + print(f"[WARN] 结果文件不存在: {filepath}", file=sys.stderr) return metrics with open(filepath, "r") as f: @@ -110,70 +119,104 @@ def get_model_info(model_path): return info -def compute_comparison(fd_metrics, sg_metrics): - """计算对比指标(差异百分比、胜出方)""" +# 吞吐类指标:越高越好 +HIGHER_IS_BETTER = { + "total_token_throughput", + "output_token_throughput", + "request_throughput", + "mean_decode", + "median_decode", + "p80_decode", + "p95_decode", + "p99_decode", +} + +# 延迟类指标:越低越好 +LOWER_IS_BETTER = { + "mean_ttft", + "median_ttft", + "p80_ttft", + "p95_ttft", + "p99_ttft", + "mean_tpot", + "median_tpot", + "p80_tpot", + "p95_tpot", + "p99_tpot", + "mean_itl", + "median_itl", + "p80_itl", + "p95_itl", + "p99_itl", + "mean_e2el", + "median_e2el", + "p80_e2el", + "p95_e2el", + "p99_e2el", + "benchmark_duration", +} + + +def compute_comparison(all_metrics, baseline="sg"): + """计算多框架对比指标。 + + all_metrics: {"fd": {...}, "sg": {...}, "vllm": {...}}(任意 key 可为空 dict) + baseline: 用于计算 diff_pct 的基准框架(默认 SGLang) + + 返回: + { + metric_key: { + "fd": ..., "sg": ..., "vllm": ..., + "diff_pct": {"fd": ..., "vllm": ...}, # 相对 baseline + "winner": "fd" | "sg" | "vllm" | "tie" + } + } + """ comparison = {} - # 吞吐类指标:越高越好 - higher_is_better = { - "total_token_throughput", - "output_token_throughput", - "request_throughput", - "mean_decode", - "median_decode", - "p80_decode", - "p95_decode", - "p99_decode", - } + # 只比较实际有数据的框架 + active = [fw for fw in FRAMEWORKS if all_metrics.get(fw)] + if not active: + return comparison - # 延迟类指标:越低越好 - lower_is_better = { - "mean_ttft", - "median_ttft", - "p80_ttft", - "p95_ttft", - "p99_ttft", - "mean_tpot", - "median_tpot", - "p80_tpot", - "p95_tpot", - "p99_tpot", - "mean_itl", - "median_itl", - "p80_itl", - "p95_itl", - "p99_itl", - "mean_e2el", - "median_e2el", - "p80_e2el", - "p95_e2el", - "p99_e2el", - "benchmark_duration", - } - - all_keys = set(fd_metrics.keys()) | set(sg_metrics.keys()) + # 收集所有指标 key + all_keys = set() + for fw in active: + all_keys |= set(all_metrics[fw].keys()) for key in sorted(all_keys): - fd_val = fd_metrics.get(key) - sg_val = sg_metrics.get(key) - - if fd_val is None or sg_val is None: + entry = {} + per_fw_val = {} + for fw in active: + val = all_metrics[fw].get(key) + if val is None: + continue + entry[fw] = val + per_fw_val[fw] = val + + if len(per_fw_val) < 2: + # 单框架数据,无法对比但仍记录 + comparison[key] = entry continue - entry = {"fd": fd_val, "sg": sg_val} - - # 计算差异百分比 (FD 相对于 SG) - if sg_val != 0: - diff_pct = round((fd_val - sg_val) / sg_val * 100, 2) - else: - diff_pct = 0 - entry["diff_pct"] = diff_pct + # 计算相对 baseline 的差异百分比 + diff_pct = {} + base_val = per_fw_val.get(baseline) + for fw, val in per_fw_val.items(): + if fw == baseline or base_val is None: + continue + if base_val != 0: + diff_pct[fw] = round((val - base_val) / base_val * 100, 2) + else: + diff_pct[fw] = 0 + if diff_pct: + entry["diff_pct"] = diff_pct # 判断胜出方 - if key in higher_is_better: - entry["winner"] = "fd" if fd_val > sg_val else "sg" - elif key in lower_is_better: - entry["winner"] = "fd" if fd_val < sg_val else "sg" + if key in HIGHER_IS_BETTER: + entry["winner"] = max(per_fw_val, key=per_fw_val.get) + elif key in LOWER_IS_BETTER: + entry["winner"] = min(per_fw_val, key=per_fw_val.get) else: entry["winner"] = "tie" @@ -184,40 +227,65 @@ def compute_comparison(fd_metrics, sg_metrics): def main(): parser = argparse.ArgumentParser(description="从 benchmark 结果提取指标并生成对比 JSON") - parser.add_argument("--fd-result", required=True, help="FastDeploy 结果文件路径") - parser.add_argument("--sg-result", required=True, help="SGLang 结果文件路径") + parser.add_argument("--fd-result", default=None, help="FastDeploy 结果文件路径") + parser.add_argument("--sg-result", default=None, help="SGLang 结果文件路径") + parser.add_argument("--vllm-result", default=None, help="vLLM 结果文件路径") parser.add_argument("--model-path", required=True, help="模型权重目录路径") parser.add_argument("--fd-config", default="{}", help="FD 部署配置 JSON 字符串") parser.add_argument("--sg-config", default="{}", help="SG 部署配置 JSON 字符串") + parser.add_argument("--vllm-config", default="{}", help="vLLM 部署配置 JSON 字符串") + parser.add_argument( + "--baseline", default="sg", choices=FRAMEWORKS, help="对比基准框架(计算 diff_pct 用),默认 sg" + ) parser.add_argument("--output", default="metrics.json", help="输出 JSON 路径") args = parser.parse_args() - print(f"[INFO] 解析 FD 结果: {args.fd_result}") - fd_metrics = parse_benchmark_result(args.fd_result) - print(f"[INFO] 解析 SG 结果: {args.sg_result}") - sg_metrics = parse_benchmark_result(args.sg_result) + # 至少需要一份结果 + if not any([args.fd_result, args.sg_result, args.vllm_result]): + parser.error("至少需要提供 --fd-result / --sg-result / --vllm-result 中的一个") + + result_paths = { + "fd": args.fd_result, + "sg": args.sg_result, + "vllm": args.vllm_result, + } + config_strs = { + "fd": args.fd_config, + "sg": args.sg_config, + "vllm": args.vllm_config, + } + framework_display = {"fd": "FastDeploy", "sg": "SGLang", "vllm": "vLLM"} + + all_metrics = {} + for fw in FRAMEWORKS: + path = result_paths[fw] + if path: + print(f"[INFO] 解析 {framework_display[fw]} 结果: {path}") + all_metrics[fw] = parse_benchmark_result(path) + else: + all_metrics[fw] = {} print(f"[INFO] 读取模型信息: {args.model_path}") model_info = get_model_info(args.model_path) - print("[INFO] 计算对比指标...") - comparison = compute_comparison(fd_metrics, sg_metrics) + print(f"[INFO] 计算对比指标 (baseline={args.baseline})...") + comparison = compute_comparison(all_metrics, baseline=args.baseline) # 解析部署配置 - fd_config = json.loads(args.fd_config) if args.fd_config else {} - sg_config = json.loads(args.sg_config) if args.sg_config else {} + configs = {} + for fw in FRAMEWORKS: + try: + configs[fw] = json.loads(config_strs[fw]) if config_strs[fw] else {} + except json.JSONDecodeError as e: + print(f"[WARN] 解析 --{fw}-config 失败: {e}", file=sys.stderr) + configs[fw] = {} output = { "model": model_info, - "config": { - "fd": fd_config, - "sg": sg_config, - }, - "raw_metrics": { - "fd": fd_metrics, - "sg": sg_metrics, - }, + "config": configs, + "raw_metrics": all_metrics, "comparison": comparison, + "baseline": args.baseline, } with open(args.output, "w") as f: @@ -236,14 +304,29 @@ def main(): "mean_decode", "benchmark_duration", ] + active = [fw for fw in FRAMEWORKS if all_metrics.get(fw)] + if not active: + print("[WARN] 没有任何有效的结果数据") + return + print("\n========== 核心指标摘要 ==========") - print(f"{'Metric':<30} {'FD':>12} {'SG':>12} {'Diff%':>8} {'Winner':>8}") - print("-" * 72) + header = f"{'Metric':<30}" + for fw in active: + header += f" {framework_display[fw]:>12}" + header += f" {'Winner':>10}" + print(header) + print("-" * len(header)) for key in key_metrics: - if key in comparison: - c = comparison[key] - print(f"{key:<30} {c['fd']:>12.2f} {c['sg']:>12.2f} {c['diff_pct']:>+7.1f}% {c['winner']:>8}") - print("=" * 72) + if key not in comparison: + continue + c = comparison[key] + line = f"{key:<30}" + for fw in active: + val = c.get(fw) + line += f" {val:>12.2f}" if isinstance(val, (int, float)) else f" {'-':>12}" + line += f" {c.get('winner', '-'):>10}" + print(line) + print("=" * len(header)) if __name__ == "__main__": diff --git a/.claude/skills/benchmark-compare/scripts/generate_report.py b/.claude/skills/benchmark-compare/scripts/generate_report.py index d93444f443b..b1afd542720 100644 --- a/.claude/skills/benchmark-compare/scripts/generate_report.py +++ b/.claude/skills/benchmark-compare/scripts/generate_report.py @@ -86,8 +86,8 @@ def parse_benchmark_log(filepath): def scan_log_dir(log_dir): """扫描日志目录,自动识别场景并提取指标 - 文件命名约定: *_bs_[_].txt - 例如: GLM-4.7-Flash_long_bs32_fd.txt, GLM-4.7-Flash_long_bs512_fp8_sg.txt + 文件命名约定: *_bs_[_].txt + 例如: GLM-4.7-Flash_long_bs32_fd.txt, GLM-4.7-Flash_long_bs512_fp8_vllm.txt """ data = {} if not os.path.isdir(log_dir): @@ -101,11 +101,11 @@ def scan_log_dir(log_dir): filepath = os.path.join(root, fname) # 尝试从文件名解析场景信息 - # 格式: *_bs_[_].txt - m = re.search(r"_bs(\d+)_(?:(fp8|bf16|wint4|wint8)_)?(fd|sg)\.txt$", fname, re.IGNORECASE) + # 格式: *_bs_[_].txt + m = re.search(r"_bs(\d+)_(?:(fp8|bf16|wint4|wint8)_)?(fd|sg|vllm)\.txt$", fname, re.IGNORECASE) if not m: # 也尝试无 quant 的模式 (默认 bf16) - m = re.search(r"_bs(\d+)_(fd|sg)\.txt$", fname, re.IGNORECASE) + m = re.search(r"_bs(\d+)_(fd|sg|vllm)\.txt$", fname, re.IGNORECASE) if m: bs = m.group(1) quant = "bf16" @@ -826,11 +826,18 @@ def main(): parser.add_argument("--max-model-len", type=int, default=65536, help="最大模型长度") parser.add_argument("--fd-attention", default="MLA_ATTN (FlashAttn v3)", help="FD Attention Backend") parser.add_argument("--sg-attention", default="flashmla", help="SG Attention Backend") + parser.add_argument("--vllm-attention", default="flash-attn", help="vLLM Attention Backend") parser.add_argument("--sg-version", default="", help="SGLang 版本") + parser.add_argument("--vllm-version", default="", help="vLLM 版本") parser.add_argument("--fd-commit-date", default="", help="FD commit 日期") parser.add_argument("--fd-commit-short", default="", help="FD commit 短 hash") parser.add_argument("--fd-commit-full", default="", help="FD commit 完整 hash") + # 框架选择(三选二对比) + parser.add_argument( + "--frameworks", default="fd,sg", help="对比哪两个框架,逗号分隔(如 vllm,sg 或 fd,vllm),默认 fd,sg" + ) + # 显示配置 parser.add_argument("--default-quant", default="bf16", help="默认量化选择") parser.add_argument("--default-bs", default="512", help="默认并发选择") @@ -853,21 +860,40 @@ def main(): print("[ERROR] 未找到有效的 benchmark 数据", file=sys.stderr) sys.exit(1) - # 过滤掉不完整的场景(缺少 fd 或 sg) + # 解析 --frameworks,决定对比哪两个框架;把它们映射到现有 HTML 模板的 fd/sg 槽位 + fw_list = [x.strip().lower() for x in args.frameworks.split(",") if x.strip()] + if len(fw_list) != 2 or any(x not in ("fd", "sg", "vllm") for x in fw_list): + print(f"[ERROR] --frameworks 必须为 fd/sg/vllm 中的两个,逗号分隔,得到: {args.frameworks}", file=sys.stderr) + sys.exit(1) + left_fw, right_fw = fw_list[0], fw_list[1] + + framework_display = {"fd": "FastDeploy", "sg": "SGLang", "vllm": "vLLM"} + # framework_attn_key = {"fd": "fd_attention", "sg": "sg_attention", "vllm": "vllm_attention"} + framework_attn_val = { + "fd": args.fd_attention, + "sg": args.sg_attention, + "vllm": args.vllm_attention, + } + framework_version = {"fd": "", "sg": args.sg_version, "vllm": args.vllm_version} + + # 把所选两个框架的数据映射到 fd/sg 槽位 (left→fd, right→sg) valid_data = {} for key, val in benchmark_data.items(): - if "fd" in val and "sg" in val and val["fd"] and val["sg"]: - valid_data[key] = val + left_val = val.get(left_fw) + right_val = val.get(right_fw) + if left_val and right_val: + valid_data[key] = {"fd": left_val, "sg": right_val} else: - print(f"[WARN] 场景 {key} 数据不完整,跳过", file=sys.stderr) + print(f"[WARN] 场景 {key} 缺少 {left_fw} 或 {right_fw} 数据,跳过", file=sys.stderr) if not valid_data: - print("[ERROR] 没有完整的对比场景数据", file=sys.stderr) + print(f"[ERROR] 没有完整的 {left_fw} vs {right_fw} 对比场景数据", file=sys.stderr) sys.exit(1) + print(f"[INFO] 对比框架: {framework_display[left_fw]} vs {framework_display[right_fw]}") print(f"[INFO] 有效场景: {', '.join(sorted(valid_data.keys()))}") - # 构建配置 + # 构建配置(fd 槽=left_fw, sg 槽=right_fw) config = { "model_name": args.model_name, "model_type": args.model_type, @@ -879,12 +905,12 @@ def main(): "dp_size": args.dp, "ep_size": args.ep, "max_model_len": args.max_model_len, - "fd_attention": args.fd_attention, - "sg_attention": args.sg_attention, - "sg_version": args.sg_version, - "fd_commit_date": args.fd_commit_date, - "fd_commit_short": args.fd_commit_short, - "fd_commit_full": args.fd_commit_full, + "fd_attention": framework_attn_val[left_fw], + "sg_attention": framework_attn_val[right_fw], + "sg_version": framework_version[right_fw], + "fd_commit_date": args.fd_commit_date if left_fw == "fd" else "", + "fd_commit_short": args.fd_commit_short if left_fw == "fd" else "", + "fd_commit_full": args.fd_commit_full if left_fw == "fd" else "", "default_quant": args.default_quant, "default_bs": args.default_bs, "test_date": args.test_date, @@ -895,6 +921,21 @@ def main(): # 生成 HTML html = generate_html(valid_data, config) + # 把模板里的 "FastDeploy"/"SGLang" 文本标签替换为所选框架名 + # 注意:CSS 类名 .fd / .sg / fd-c / sg-c 等保持不变(只是颜色样式) + left_name = framework_display[left_fw] + right_name = framework_display[right_fw] + if left_name != "FastDeploy" or right_name != "SGLang": + # 用 placeholder 中转避免 FastDeploy→X 后再被 SGLang 替换误伤 + html = html.replace("FastDeploy", "__LEFT_FW__") + html = html.replace("SGLang", "__RIGHT_FW__") + html = html.replace("__LEFT_FW__", left_name) + html = html.replace("__RIGHT_FW__", right_name) + # FD 优势 / FD 文本也替换 + html = html.replace("FD 优势", f"{left_name} 优势") + html = html.replace(">FD<", f">{left_name}<") + html = html.replace(">SG<", f">{right_name}<") + with open(args.output, "w", encoding="utf-8") as f: f.write(html) diff --git a/.claude/skills/benchmark-compare/scripts/launch_service.sh b/.claude/skills/benchmark-compare/scripts/launch_service.sh index dae50f67d55..ba4eee30dea 100644 --- a/.claude/skills/benchmark-compare/scripts/launch_service.sh +++ b/.claude/skills/benchmark-compare/scripts/launch_service.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash # launch_service.sh — 通用推理框架服务启动脚本 -# 支持 FastDeploy / SGLang,支持单卡/多卡 TP/DP/EP/PD 分离模式 +# 支持 FastDeploy / SGLang / vLLM,支持单卡/多卡 TP/DP/EP/PD 分离模式 set -euo pipefail # ============================================================ @@ -28,7 +28,7 @@ usage() { 用法: bash launch_service.sh [OPTIONS] 必需参数: - --framework 推理框架 (fd=FastDeploy, sg=SGLang) + --framework 推理框架 (fd=FastDeploy, sg=SGLang, vllm=vLLM) --model 模型权重路径 --port 服务端口 --gpus CUDA_VISIBLE_DEVICES (如 "0" 或 "0,1,2,3,4,5,6,7") @@ -40,6 +40,7 @@ usage() { --ep expert-parallel-size, MoE 模型专用 (默认: 0, 不启用) FD: 映射为 --enable-expert-parallel (EP=TP×DP 隐式) SG: 映射为 --ep-size N + vLLM: 映射为 --enable-expert-parallel --concurrency max-num-seqs / max-running-requests (默认: 32) --max-model-len 最大序列长度 (默认: 65536) --quantization 量化方式: none|block_wise_fp8|fp8|wint4|wint8 (默认: none) @@ -61,6 +62,10 @@ usage() { # TP=4 + DP=2 + EP=8 启动 SGLang (MoE, 8卡) bash launch_service.sh --framework sg --model /path/to/model --port 8280 \ --gpus 0,1,2,3,4,5,6,7 --tp 4 --dp 2 --ep 8 --venv /path/to/sglang_env/.venv + + # 单卡启动 vLLM + bash launch_service.sh --framework vllm --model /path/to/model --port 8380 \ + --gpus 2 --venv /path/to/vllm_env/.venv EOF exit "${1:-0}" } @@ -94,8 +99,8 @@ if [[ -z "$FRAMEWORK" || -z "$MODEL" || -z "$PORT" || -z "$GPUS" || -z "$VENV" ] usage 1 fi -if [[ "$FRAMEWORK" != "fd" && "$FRAMEWORK" != "sg" ]]; then - echo "错误: --framework 必须为 fd 或 sg" +if [[ "$FRAMEWORK" != "fd" && "$FRAMEWORK" != "sg" && "$FRAMEWORK" != "vllm" ]]; then + echo "错误: --framework 必须为 fd / sg / vllm" exit 1 fi @@ -267,12 +272,74 @@ launch_sglang() { echo "[INFO] SGLang PID: $! (已写入 /tmp/sg_pid_${PORT})" } +# ============================================================ +# 启动 vLLM +# ============================================================ +launch_vllm() { + echo "[INFO] 启动 vLLM 服务..." + echo " 模型: $MODEL" + echo " 端口: $PORT" + echo " GPU: $GPUS (TP=$TP, DP=$DP, EP=$EP)" + echo " 并发: $CONCURRENCY" + echo " 量化: $QUANTIZATION" + echo " 日志: $LOG_FILE" + + source "$VENV/bin/activate" + + export CUDA_VISIBLE_DEVICES="$GPUS" + + # DP 模式下设置 MASTER_PORT 避免冲突 + if [[ "$DP" -gt 1 ]]; then + export VLLM_MASTER_PORT=${VLLM_MASTER_PORT:-46000} + echo "[INFO] DP=$DP, 设置 VLLM_MASTER_PORT=$VLLM_MASTER_PORT 避免端口冲突" + fi + + # 构建命令 + local CMD="python -m vllm.entrypoints.openai.api_server" + CMD+=" --model $MODEL" + CMD+=" --host 0.0.0.0" + CMD+=" --port $PORT" + CMD+=" --tensor-parallel-size $TP" + CMD+=" --max-model-len $MAX_MODEL_LEN" + CMD+=" --max-num-seqs $CONCURRENCY" + CMD+=" --gpu-memory-utilization $GPU_MEM_UTIL" + CMD+=" --trust-remote-code" + + # DP (data parallelism) + if [[ "$DP" -gt 1 ]]; then + CMD+=" --data-parallel-size $DP" + fi + + # EP (expert parallelism) + if [[ "$EP" -gt 0 ]]; then + CMD+=" --enable-expert-parallel" + fi + + # 量化(vLLM 用 fp8 / awq / gptq 等;映射 FD 的 block_wise_fp8 → fp8) + if [[ "$QUANTIZATION" != "none" ]]; then + local VQ="$QUANTIZATION" + [[ "$VQ" == "block_wise_fp8" ]] && VQ="fp8" + CMD+=" --quantization $VQ" + fi + + # 额外参数 + if [[ -n "$EXTRA_ARGS" ]]; then + CMD+=" $EXTRA_ARGS" + fi + + echo "[INFO] 执行: $CMD" + nohup bash -c "$CMD" > "$LOG_FILE" 2>&1 & + echo $! > "/tmp/vllm_pid_${PORT}" + echo "[INFO] vLLM PID: $! (已写入 /tmp/vllm_pid_${PORT})" +} + # ============================================================ # 主入口 # ============================================================ case "$FRAMEWORK" in - fd) launch_fastdeploy ;; - sg) launch_sglang ;; + fd) launch_fastdeploy ;; + sg) launch_sglang ;; + vllm) launch_vllm ;; esac echo "[INFO] 服务已在后台启动,请使用 health_check.sh 等待就绪"