Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ max-line-length = 119
# E402: module level import not at top of file
per-file-ignores =
__init__.py:F401,F403,E402
fastdeploy/model_executor/layers/sample/ops/top_k_top_p_triton.py:E241,E121,E131,E266
4 changes: 3 additions & 1 deletion fastdeploy/envs.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def _validate_split_kv_size(value: int) -> int:
# Set attention backend. "NATIVE_ATTN", "APPEND_ATTN"
# and "MLA_ATTN" can be set currently.
"FD_ATTENTION_BACKEND": lambda: os.getenv("FD_ATTENTION_BACKEND", "APPEND_ATTN"),
# Set sampling class. "base", "base_non_truncated", "air" and "rejection" can be set currently.
# Set sampling class. "base", "base_non_truncated", "air", "rejection" and "triton" can be set currently.
"FD_SAMPLING_CLASS": lambda: os.getenv("FD_SAMPLING_CLASS", "base"),
# Set moe backend. "cutlass", "marlin", "triton", "flashinfer-cutlass", "flashinfer-cutedsl" and "flashinfer-trtllm" can be set currently.
"FD_MOE_BACKEND": lambda: os.getenv("FD_MOE_BACKEND", "cutlass"),
Expand Down Expand Up @@ -287,6 +287,8 @@ def _validate_split_kv_size(value: int) -> int:
"FD_SiluAndMul_USE_PHI_SWIGLU": lambda: bool(int(os.getenv("FD_SiluAndMul_USE_PHI_SWIGLU", "0"))),
# Whether to enable FP8 quantization with pow2scale.
"FD_FP8_QUANT_WITH_POW2SCALE": lambda: bool(int(os.getenv("FD_FP8_QUANT_WITH_POW2SCALE", "0"))),
# Whether to enable top_p=1.0 optimization.
"FD_ENABLE_TOP_P_ONE_OPT": lambda: bool(int(os.getenv("FD_ENABLE_TOP_P_ONE_OPT", "0"))),
}


Expand Down
1 change: 1 addition & 0 deletions fastdeploy/model_executor/layers/sample/meta_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ class SamplingMetadata:
step_idx: paddle.Tensor

top_p: paddle.Tensor
top_p_list: Optional[list] = None

This comment was marked as outdated.

# only GPU used
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 建议 top_p_list 字段使用裸 list 类型注解,与相邻字段(top_p: paddle.Tensor)风格不一致,且缺乏元素类型信息,IDE 和类型检查工具无法推导元素类型。

建议修改为:

top_p_list: Optional[List[float]] = None

(同时在文件顶部 import List 若尚未导入)

bad_words_token_len: Optional[paddle.Tensor] = None
top_k: Optional[paddle.Tensor] = None
Expand Down
7 changes: 6 additions & 1 deletion fastdeploy/model_executor/layers/sample/ops/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,11 @@
speculate_get_accept_tokens_and_logits,
speculate_insert_first_token,
)
from .top_k_top_p_sampling import min_p_sampling, top_k_top_p_sampling
from .top_k_top_p_sampling import (
dispatch_top_k_renorm_probs,
min_p_sampling,
top_k_top_p_sampling,
)

__all__ = [
"apply_penalty_multi_scores",
Expand All @@ -33,4 +37,5 @@
"min_p_sampling",
"speculate_get_accept_tokens_and_logits",
"speculate_insert_first_token",
"dispatch_top_k_renorm_probs",
]
36 changes: 17 additions & 19 deletions fastdeploy/model_executor/layers/sample/ops/top_k_top_p_sampling.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,20 @@ def _reset_cuda_generator_for_determinism():
paddle.framework.core.default_cuda_generator(0).manual_seed(_DETERMINISTIC_RNG_SEED)


def dispatch_top_k_renorm_probs(probs, top_k):
    """Run the platform-specific ``top_k_renorm_probs`` op on ``probs``.

    The op is imported lazily from the Iluvatar ops package when
    ``current_platform.is_iluvatar()`` is true, and from the GPU ops package
    otherwise.

    Args:
        probs: Probabilities handed to the op as its first argument.
        top_k: Top-k setting handed to the op as its second argument.

    Returns:
        The op's result, or ``probs`` unchanged when an ``ImportError`` is
        raised while importing or calling the op (a warning is logged in
        that case).
    """
    try:
        if not current_platform.is_iluvatar():
            from fastdeploy.model_executor.ops.gpu import (
                top_k_renorm_probs as renorm_op,
            )
        else:
            from fastdeploy.model_executor.ops.iluvatar import (
                top_k_renorm_probs as renorm_op,
            )
        # The call stays inside the ``try`` so an ImportError surfacing from
        # the op itself is handled the same way as a failed import.
        return renorm_op(probs, top_k)
    except ImportError:
        logger.warning("top_k sampling is not supported on current platform, skipping top_k filtering.")
        return probs


def top_k_top_p_sampling(
x: paddle.Tensor,
top_p: paddle.Tensor,
Expand Down Expand Up @@ -70,7 +84,6 @@ def top_k_top_p_sampling(

"""
top_p_class = envs.FD_SAMPLING_CLASS.lower()
topp_seed_device = None

# In deterministic mode, reset CUDA generator offset before sampling.
# paddle.tensor.top_p_sampling uses the global GPU generator offset even
Expand All @@ -85,29 +98,17 @@ def top_k_top_p_sampling(
_ = None
else:
if top_k_list and any(x > 0 for x in top_k_list):
try:
if current_platform.is_iluvatar():
from fastdeploy.model_executor.ops.iluvatar import (
top_k_renorm_probs,
)
else:
from fastdeploy.model_executor.ops.gpu import top_k_renorm_probs
x = top_k_renorm_probs(x, top_k)
except ImportError:
logger.warning("top_k sampling is not supported on current platform, skipping top_k filtering.")
x = dispatch_top_k_renorm_probs(x, top_k)

if top_p_class == "air":
_, ids = air_top_p_sampling(x, top_p, threshold, topp_seed, seed=seed, k=k, mode=mode)

elif top_p_class == "base_non_truncated":
if topp_seed is not None:
topp_seed_device = paddle.empty(shape=topp_seed.shape, dtype=topp_seed.dtype)
topp_seed_device.copy_(topp_seed, False)
_, ids = paddle.tensor.top_p_sampling(

This comment was marked as outdated.

x,
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

❓ 疑问 原代码明确将 topp_seed 从 CPU 拷贝到 GPU(paddle.empty(...).copy_(topp_seed, False))再传入 paddle.tensor.top_p_sampling,此处直接将原始 topp_seed 传入。

若 topp_seed 在调用侧已保证位于 GPU 上,则此简化是正确的;但若它仍可能位于 CPU 上(如从 input_batch 的 CPU 侧构造),则会导致运行时 device mismatch 错误。

请确认 topp_seed 的来源保证在 GPU 上,或添加注释说明。

top_p,
threshold=threshold,
topp_seed=topp_seed_device,
topp_seed=topp_seed,
seed=seed,
k=k,
mode="non-truncated",
Expand All @@ -122,14 +123,11 @@ def top_k_top_p_sampling(

_, ids = native_top_p_sampling(x, top_p)
else:
if topp_seed is not None:
topp_seed_device = paddle.empty(shape=topp_seed.shape, dtype=topp_seed.dtype)
topp_seed_device.copy_(topp_seed, False)
_, ids = paddle.tensor.top_p_sampling(
x,
top_p,
threshold=threshold,
topp_seed=topp_seed_device,
topp_seed=topp_seed,
seed=seed,
k=k,
mode="truncated",
Expand Down
Loading
Loading