From d8fdf9d62a546a7a3b89f1881a1b3cf2e9de4bc8 Mon Sep 17 00:00:00 2001
From: Bingoo <1575938147@qq.com>
Date: Mon, 30 Mar 2026 13:35:15 +0800
Subject: [PATCH 1/6] support deepgemm for sm103

---
 .../model_executor/layers/quantization/block_wise_fp8.py   | 7 +++++++
 fastdeploy/model_executor/layers/quantization/fp8_utils.py | 4 ++--
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py b/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py
index 007cc0fddd2..da32f11de49 100644
--- a/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py
+++ b/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py
@@ -133,6 +133,13 @@ def deep_gemm_fp8_gemm_nt(
             linear_out,
             bias=bias,
         )
+    elif get_sm_version() > 100 and current_platform.is_cuda():
+        fp8_gemm_nt(
+            (x, x_scale_tensor),
+            (layer_weight, layer_weight_scale_inv),
+            linear_out,
+            disable_ue8m0_cast=False,
+        )
     else:
         # disable_ue8m0_cast is default False for SM100
         fp8_gemm_nt(
diff --git a/fastdeploy/model_executor/layers/quantization/fp8_utils.py b/fastdeploy/model_executor/layers/quantization/fp8_utils.py
index 65d30d4004d..a5cd230f601 100644
--- a/fastdeploy/model_executor/layers/quantization/fp8_utils.py
+++ b/fastdeploy/model_executor/layers/quantization/fp8_utils.py
@@ -65,7 +65,7 @@ def load_deep_gemm():
     """
 
     if current_platform.is_cuda():
-        if get_sm_version() == 100:
+        if get_sm_version() >= 100:
             # SM100 should use PFCC DeepGemm
             paddle.compat.enable_torch_proxy(scope={"deep_gemm"})
             try:
@@ -245,7 +245,7 @@ def fused_stack_transpose_quant(expert_weight_list, use_ue8m0=False):
     # Blackwell (SM100) GPUs require pow2_scale quantization.
     # Guard with is_cuda() so non-CUDA environments do not call into
     # paddle.device.cuda.* and cause a crash.
-    use_pow2_scale = current_platform.is_cuda() and get_sm_version() == 100
+    use_pow2_scale = current_platform.is_cuda() and get_sm_version() >= 100
 
     w, scale = paddlefleet_ops.fuse_stack_transpose_fp8_quant(
         expert_weight_list,

From 703c8287dec8db8c79757159c84e10aa98074d8b Mon Sep 17 00:00:00 2001
From: Bingoo <1575938147@qq.com>
Date: Mon, 30 Mar 2026 21:08:15 +0800
Subject: [PATCH 2/6] add assert

---
 .../layers/quantization/block_wise_fp8.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py b/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py
index da32f11de49..10580dc6b1a 100644
--- a/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py
+++ b/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py
@@ -125,6 +125,9 @@ def deep_gemm_fp8_gemm_nt(
     layer_output_size: int,
     bias: paddle.Tensor = None,
 ):
+    sm_version = get_sm_version()
+    if sm_version >= 100:
+        assert x_scale_tensor.dtype == paddle.uint8, "For SM100, x_scale_tensor must be uint8 dtype."
     if get_sm_version() == 100 and current_platform.is_cuda():
         # disable_ue8m0_cast is default False for SM100
         fp8_gemm_nt(
@@ -133,13 +136,6 @@ def deep_gemm_fp8_gemm_nt(
             linear_out,
             bias=bias,
         )
-    elif get_sm_version() > 100 and current_platform.is_cuda():
-        fp8_gemm_nt(
-            (x, x_scale_tensor),
-            (layer_weight, layer_weight_scale_inv),
-            linear_out,
-            disable_ue8m0_cast=False,
-        )
     else:
         # disable_ue8m0_cast is default False for SM100
         fp8_gemm_nt(

From 5a2b8112f0c1d670f6293a8c0fc1631ff667fd7a Mon Sep 17 00:00:00 2001
From: Bingoo <1575938147@qq.com>
Date: Mon, 30 Mar 2026 21:14:46 +0800
Subject: [PATCH 3/6] modify code style

---
 .../model_executor/layers/quantization/block_wise_fp8.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py b/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py
index 10580dc6b1a..a4d802fc05b 100644
--- a/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py
+++ b/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py
@@ -127,8 +127,8 @@ def deep_gemm_fp8_gemm_nt(
 ):
     sm_version = get_sm_version()
     if sm_version >= 100:
-        assert x_scale_tensor.dtype == paddle.uint8, "For SM100, x_scale_tensor must be uint8 dtype."
-    if get_sm_version() == 100 and current_platform.is_cuda():
+        assert x_scale_tensor.dtype == paddle.uint8, "For sm100+, x_scale_tensor must be uint8 dtype."
+    if sm_version == 100 and current_platform.is_cuda():
         # disable_ue8m0_cast is default False for SM100
         fp8_gemm_nt(
             (x, x_scale_tensor),

From a188b9a3be39a581f0aaca9168900b76caf1a761 Mon Sep 17 00:00:00 2001
From: Bingoo <1575938147@qq.com>
Date: Tue, 31 Mar 2026 16:05:53 +0800
Subject: [PATCH 4/6] add assert

---
 .../model_executor/layers/moe/fused_moe_deepgemm_backend.py | 4 ++++
 .../model_executor/layers/quantization/block_wise_fp8.py    | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py
index cf38ec57b0d..6b4c0244f26 100644
--- a/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py
+++ b/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py
@@ -160,6 +160,7 @@ def m_grouped_fp8_gemm_nt_contiguous_custom_python_op(
     permute_scale = permute_scale.transpose([1, 0]).contiguous()
     permute_scale = permute_scale.transpose([1, 0])
     # disable_ue8m0_cast is False for SM100
+    assert permute_scale.dtype == paddle.uint8, "For sm100+, scale must be uint8 dtype."
     m_grouped_fp8_gemm_nt_contiguous(
         (permute_input, permute_scale),
         (layer_added_weight_attrs_0, layer_added_scale_attrs_0),
@@ -198,6 +199,7 @@ def m_grouped_fp8_gemm_nt_contiguous_custom_python_op(
         dtype=paddle.bfloat16,
     )
     # disable_ue8m0_cast is False for SM100
+    assert ffn_in_x_scale_tensor.dtype == paddle.uint8, "For sm100+, scale must be uint8 dtype."
     m_grouped_fp8_gemm_nt_contiguous(
         (ffn_in_x, ffn_in_x_scale_tensor),
         (layer_added_weight_attrs_1, layer_added_scale_attrs_1),
@@ -628,6 +630,7 @@ def apply_ep_prefill(
             (token_all_num, getattr(layer, self.added_weight_attrs[0]).shape[1]),
             dtype=paddle.bfloat16,
         )
+        assert permute_scale.dtype == paddle.uint8, "For sm100+, scale must be uint8 dtype."
         m_grouped_fp8_gemm_nt_contiguous(
             (permute_input, permute_scale),
             (getattr(layer, self.added_weight_attrs[0]), getattr(layer, self.added_scale_attrs[0])),
@@ -665,6 +668,7 @@ def apply_ep_prefill(
             (token_all_num, getattr(layer, self.added_weight_attrs[1]).shape[1]),
             dtype=paddle.bfloat16,
         )
+        assert ffn_in_x_scale_tensor.dtype == paddle.uint8, "For sm100+, scale must be uint8 dtype."
         m_grouped_fp8_gemm_nt_contiguous(
             (ffn_in_x, ffn_in_x_scale_tensor),
             (getattr(layer, self.added_weight_attrs[1]), getattr(layer, self.added_scale_attrs[1])),
diff --git a/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py b/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py
index a4d802fc05b..5a453f86fd7 100644
--- a/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py
+++ b/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py
@@ -128,7 +128,7 @@ def deep_gemm_fp8_gemm_nt(
     sm_version = get_sm_version()
     if sm_version >= 100:
         assert x_scale_tensor.dtype == paddle.uint8, "For sm100+, x_scale_tensor must be uint8 dtype."
-    if sm_version == 100 and current_platform.is_cuda():
+    if sm_version >= 100 and current_platform.is_cuda():
         # disable_ue8m0_cast is default False for SM100
         fp8_gemm_nt(
             (x, x_scale_tensor),

From 5d32f41fbb8a4e995abcaacb7ec4688b436bb94a Mon Sep 17 00:00:00 2001
From: Bingoo <1575938147@qq.com>
Date: Tue, 31 Mar 2026 19:55:38 +0800
Subject: [PATCH 5/6] modify sm version condition

---
 fastdeploy/model_executor/layers/quantization/block_wise_fp8.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py b/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py
index 5a453f86fd7..ae3d8e016e3 100644
--- a/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py
+++ b/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py
@@ -67,7 +67,7 @@ def __init__(self, weight_block_size: list = [-1, -1], is_checkpoint_bf16: bool
         self.quant_round_type = 1
         self.use_deep_gemm = bool(envs.FD_USE_DEEP_GEMM)
         self.is_checkpoint_bf16 = is_checkpoint_bf16
-        self.deepgemm_scale_ue8m0 = True if get_sm_version() == 100 else False
+        self.deepgemm_scale_ue8m0 = True if get_sm_version() >= 100 else False
 
     def name(self) -> str:
         return "block_wise_fp8"

From 786e08a451df13c1e3a206bf8579a25d5f0cb921 Mon Sep 17 00:00:00 2001
From: Bingoo <1575938147@qq.com>
Date: Wed, 1 Apr 2026 15:13:28 +0800
Subject: [PATCH 6/6] remove assert

---
 .../model_executor/layers/moe/fused_moe_deepgemm_backend.py | 4 ----
 .../model_executor/layers/quantization/block_wise_fp8.py    | 2 --
 2 files changed, 6 deletions(-)

diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py
index 6b4c0244f26..cf38ec57b0d 100644
--- a/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py
+++ b/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py
@@ -160,7 +160,6 @@ def m_grouped_fp8_gemm_nt_contiguous_custom_python_op(
     permute_scale = permute_scale.transpose([1, 0]).contiguous()
     permute_scale = permute_scale.transpose([1, 0])
     # disable_ue8m0_cast is False for SM100
-    assert permute_scale.dtype == paddle.uint8, "For sm100+, scale must be uint8 dtype."
     m_grouped_fp8_gemm_nt_contiguous(
         (permute_input, permute_scale),
         (layer_added_weight_attrs_0, layer_added_scale_attrs_0),
@@ -199,7 +198,6 @@ def m_grouped_fp8_gemm_nt_contiguous_custom_python_op(
         dtype=paddle.bfloat16,
     )
     # disable_ue8m0_cast is False for SM100
-    assert ffn_in_x_scale_tensor.dtype == paddle.uint8, "For sm100+, scale must be uint8 dtype."
     m_grouped_fp8_gemm_nt_contiguous(
         (ffn_in_x, ffn_in_x_scale_tensor),
         (layer_added_weight_attrs_1, layer_added_scale_attrs_1),
@@ -630,7 +628,6 @@ def apply_ep_prefill(
             (token_all_num, getattr(layer, self.added_weight_attrs[0]).shape[1]),
             dtype=paddle.bfloat16,
         )
-        assert permute_scale.dtype == paddle.uint8, "For sm100+, scale must be uint8 dtype."
         m_grouped_fp8_gemm_nt_contiguous(
             (permute_input, permute_scale),
             (getattr(layer, self.added_weight_attrs[0]), getattr(layer, self.added_scale_attrs[0])),
@@ -668,7 +665,6 @@ def apply_ep_prefill(
             (token_all_num, getattr(layer, self.added_weight_attrs[1]).shape[1]),
             dtype=paddle.bfloat16,
         )
-        assert ffn_in_x_scale_tensor.dtype == paddle.uint8, "For sm100+, scale must be uint8 dtype."
         m_grouped_fp8_gemm_nt_contiguous(
             (ffn_in_x, ffn_in_x_scale_tensor),
             (getattr(layer, self.added_weight_attrs[1]), getattr(layer, self.added_scale_attrs[1])),
diff --git a/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py b/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py
index ae3d8e016e3..a86170e0727 100644
--- a/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py
+++ b/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py
@@ -126,8 +126,6 @@ def deep_gemm_fp8_gemm_nt(
     bias: paddle.Tensor = None,
 ):
     sm_version = get_sm_version()
-    if sm_version >= 100:
-        assert x_scale_tensor.dtype == paddle.uint8, "For sm100+, x_scale_tensor must be uint8 dtype."
     if sm_version >= 100 and current_platform.is_cuda():
         # disable_ue8m0_cast is default False for SM100
         fp8_gemm_nt(
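
Reviewer note (editor's addition, not part of the patch series): after patches 2, 4, and 6, deep_gemm_fp8_gemm_nt is left with a single `sm_version >= 100` CUDA branch, so SM100 and SM103 share one code path and disable_ue8m0_cast keeps its default of False. Below is a minimal, self-contained sketch of that final control flow; get_sm_version, current_platform, and fp8_gemm_nt are hypothetical stubs standing in for the FastDeploy/DeepGEMM originals, not the real implementations.

    # Hypothetical stubs -- only the branching below mirrors the patched
    # deep_gemm_fp8_gemm_nt; the real symbols live in FastDeploy and DeepGEMM.
    def get_sm_version() -> int:
        return 103  # pretend we are on SM103, the target of patch 1

    class _CurrentPlatform:
        @staticmethod
        def is_cuda() -> bool:
            return True

    current_platform = _CurrentPlatform()

    def fp8_gemm_nt(x_pair, w_pair, out, bias=None):
        # Stub kernel entry; disable_ue8m0_cast stays at its default (False),
        # so UE8M0 uint8 scales would be consumed directly on SM100+.
        print("fp8_gemm_nt dispatched, bias is", "set" if bias is not None else "None")

    def deep_gemm_fp8_gemm_nt_sketch(x, x_scale, w, w_scale, out, bias=None):
        sm_version = get_sm_version()
        if sm_version >= 100 and current_platform.is_cuda():
            # SM100 and SM103 (Blackwell) share this path after patch 6.
            fp8_gemm_nt((x, x_scale), (w, w_scale), out, bias=bias)
        else:
            # Pre-Blackwell devices keep the original path.
            fp8_gemm_nt((x, x_scale), (w, w_scale), out, bias=bias)

    deep_gemm_fp8_gemm_nt_sketch(None, None, None, None, None)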
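The companion change is the scale format: patches 1 and 5 widen `get_sm_version() == 100` to `>= 100` in load_deep_gemm, in fused_stack_transpose_quant's use_pow2_scale, and in BlockWiseFP8Config.deepgemm_scale_ue8m0, so SM100+ devices produce UE8M0 (power-of-two, uint8-packed) block scales; the asserts added in patches 2-4 and dropped again in patch 6 were checking exactly that invariant. A hypothetical, runnable illustration of the dtype those conditions imply (expected_scale_dtype is an invented helper, not FastDeploy API):

    def expected_scale_dtype(sm_version: int, is_cuda: bool) -> str:
        # Mirrors `use_pow2_scale = current_platform.is_cuda() and get_sm_version() >= 100`
        # and `deepgemm_scale_ue8m0 = True if get_sm_version() >= 100 else False`.
        if is_cuda and sm_version >= 100:
            return "uint8"    # UE8M0: power-of-two scales, one byte each
        return "float32"      # pre-Blackwell: plain float block scales

    assert expected_scale_dtype(90, True) == "float32"   # Hopper keeps float32 scales
    assert expected_scale_dtype(100, True) == "uint8"    # Blackwell SM100
    assert expected_scale_dtype(103, True) == "uint8"    # SM103, enabled by patch 1
    print("scale dtype follows the widened >= 100 conditions")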