Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 36 additions & 17 deletions fastdeploy/model_executor/entropy_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,24 +34,32 @@ def get_entropy(logits):

def calculate_logits_entropy(logits, share_inputs, temperature):
real_bsz = share_inputs["seq_lens_this_time"].shape[0]
seq_lens_encoder = share_inputs["seq_lens_encoder"][:real_bsz]
seq_lens_this_time = share_inputs["seq_lens_this_time"]
# GPU runner uses 1D [N], HPU/GCU uses 2D [N,1]; flatten to 1D
if seq_lens_encoder.ndim == 2:
seq_lens_encoder = seq_lens_encoder.squeeze(1)
if seq_lens_this_time.ndim == 2:
seq_lens_this_time = seq_lens_this_time.squeeze(1)
real_seq_lens = paddle.where(
share_inputs["seq_lens_encoder"][:real_bsz].squeeze(1) != 0,
seq_lens_encoder != 0,
paddle.ones([1], dtype="int32"),
share_inputs["seq_lens_this_time"].squeeze(1),
seq_lens_this_time,
)

batch_indices = paddle.arange(real_bsz, dtype="int32")
batch_id_per_token = paddle.repeat_interleave(batch_indices, real_seq_lens)
for i in range(logits.shape[0]):
if temperature[batch_id_per_token[i]] > 0 and temperature[batch_id_per_token[i]] != 1.0:
logits[i] = logits[i].scale_(1 / temperature[batch_id_per_token[i]])
for i in range(real_bsz):
if int(real_seq_lens[i]) == 0:
continue
t = temperature[i]
if t > 0 and t != 1.0:
logits[i] = logits[i].scale_(1 / t)

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 建议 get_entropy(logits[:real_bsz]) 隐含了 logits 形状为 [batch_size, vocab_size] 的假设。

对于 GPU runner decode 步(每序列1个 token),logits 确实是 [batch_size, vocab_size],此处切片正确。

但对于 HPU/GCU prefill 场景,若 logits 为 [total_tokens, vocab_size] 排布,logits[:real_bsz] 只取前 real_bsz 行(即前 real_bsz 个 token 的 logits),而非每个序列最后一个 token 的 logits,会导致 entropy 计算结果错误。

建议在函数入口或注释中明确 logits 的形状约定,或添加断言:

# logits expected shape: [batch_size, vocab_size] (GPU runner)
# For HPU/GCU prefill, caller should pass only the last-token logits per sequence
assert logits.shape[0] >= real_bsz, f"logits dim0 {logits.shape[0]} < real_bsz {real_bsz}"

entropy_tensor = get_entropy(logits)
entropy = entropy_tensor.tolist()
entropy_tensor = get_entropy(logits[:real_bsz])

for i in range(real_bsz):
for _ in range(real_seq_lens[i]):
share_inputs["entropy_list"][i].append(entropy.pop(0))
if int(real_seq_lens[i]) == 0:
continue
share_inputs["entropy_list"][i].append(float(entropy_tensor[i]))
if (
share_inputs["stop_flags"][i]
and share_inputs["seq_lens_decoder"][i] != 0
Expand All @@ -67,10 +75,17 @@ def speculate_calculate_logits_entropy(logits, share_inputs, temperature):
# get accepted logits
real_bsz = share_inputs["seq_lens_this_time"].shape[0]
total_accepted_num = paddle.sum(share_inputs["accept_num"])
seq_lens_encoder = share_inputs["seq_lens_encoder"][:real_bsz]
seq_lens_this_time = share_inputs["seq_lens_this_time"]
# GPU runner uses 1D [N], HPU/GCU uses 2D [N,1]; flatten to 1D
if seq_lens_encoder.ndim == 2:
seq_lens_encoder = seq_lens_encoder.squeeze(1)
if seq_lens_this_time.ndim == 2:
seq_lens_this_time = seq_lens_this_time.squeeze(1)
real_seq_lens = paddle.where(
share_inputs["seq_lens_encoder"][:real_bsz].squeeze(1) != 0,
seq_lens_encoder != 0,
paddle.ones([1], dtype="int32"),
share_inputs["seq_lens_this_time"].squeeze(1),
seq_lens_this_time,
)
seq_start_idx = paddle.concat([paddle.zeros([1], dtype="int32"), paddle.cumsum(real_seq_lens, dtype="int32")])
repeated_starts = paddle.repeat_interleave(seq_start_idx[:-1], share_inputs["accept_num"][:real_bsz])
Expand All @@ -83,16 +98,20 @@ def speculate_calculate_logits_entropy(logits, share_inputs, temperature):
for i in range(total_accepted_num):
accepted_logits[i] = logits[accepted_idx[i]]

batch_indices = paddle.arange(share_inputs["accept_num"].shape[0], dtype="int32")
batch_id_per_token = paddle.repeat_interleave(batch_indices, share_inputs["accept_num"])
batch_indices = paddle.arange(real_bsz, dtype="int32")
batch_id_per_token = paddle.repeat_interleave(batch_indices, share_inputs["accept_num"][:real_bsz])
for i in range(accepted_logits.shape[0]):
if temperature[batch_id_per_token[i]] > 0 and temperature[batch_id_per_token[i]] != 1.0:
accepted_logits[i] = accepted_logits[i].scale_(1 / temperature[batch_id_per_token[i]])
bid = int(batch_id_per_token[i])
t = temperature[bid]
if t > 0 and t != 1.0:
accepted_logits[i] = accepted_logits[i].scale_(1 / t)

entropy_tensor = get_entropy(accepted_logits)
entropy = entropy_tensor.tolist()

for i in range(real_bsz):
if int(real_seq_lens[i]) == 0:
continue
for _ in range(share_inputs["accept_num"][i]):
share_inputs["entropy_list"][i].append(entropy.pop(0))
if (
Expand Down
2 changes: 1 addition & 1 deletion fastdeploy/worker/gpu_model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -1241,7 +1241,7 @@ def _dummy_prefill_inputs(self, input_length_list: List[int], max_dec_len_list:
self.share_inputs["block_tables"][idx : idx + 1, :block_num] = np.arange(
idx * block_num, (idx + 1) * block_num, 1
)
self.share_inputs["seq_lens_this_time"] = self.share_inputs["seq_lens_this_time_buffer"]
self.share_inputs["seq_lens_this_time"] = self.share_inputs["seq_lens_this_time_buffer"][:batch_size]
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 建议 A6 多硬件同步检查:此处修复了 seq_lens_this_time_buffer[:batch_size] 切片,建议同步确认 hpu_model_runner.py、gcu_model_runner.py、dcu_model_runner.py 等其他 _dummy_prefill_inputs 实现中是否存在同类未切片的 buffer 赋值,避免其他硬件在 entropy 使能时遇到相同的 shape 不匹配问题。


def _prepare_inputs(self, cached_token_num=-1, cached_real_bsz=-1, is_dummy_or_profile_run=False) -> None:
"""Prepare the model inputs"""
Expand Down
Loading