From 8c97537fd08642410251b224855f76d4beca43a2 Mon Sep 17 00:00:00 2001 From: songyuxing Date: Fri, 22 May 2026 01:08:42 +0800 Subject: [PATCH] support compute entropy in fd-runner --- fastdeploy/model_executor/entropy_utils.py | 53 +++++++++++++++------- fastdeploy/worker/gpu_model_runner.py | 2 +- 2 files changed, 37 insertions(+), 18 deletions(-) diff --git a/fastdeploy/model_executor/entropy_utils.py b/fastdeploy/model_executor/entropy_utils.py index 21d1b3421e9..5bfe47215a9 100644 --- a/fastdeploy/model_executor/entropy_utils.py +++ b/fastdeploy/model_executor/entropy_utils.py @@ -34,24 +34,32 @@ def get_entropy(logits): def calculate_logits_entropy(logits, share_inputs, temperature): real_bsz = share_inputs["seq_lens_this_time"].shape[0] + seq_lens_encoder = share_inputs["seq_lens_encoder"][:real_bsz] + seq_lens_this_time = share_inputs["seq_lens_this_time"] + # GPU runner uses 1D [N], HPU/GCU uses 2D [N,1]; flatten to 1D + if seq_lens_encoder.ndim == 2: + seq_lens_encoder = seq_lens_encoder.squeeze(1) + if seq_lens_this_time.ndim == 2: + seq_lens_this_time = seq_lens_this_time.squeeze(1) real_seq_lens = paddle.where( - share_inputs["seq_lens_encoder"][:real_bsz].squeeze(1) != 0, + seq_lens_encoder != 0, paddle.ones([1], dtype="int32"), - share_inputs["seq_lens_this_time"].squeeze(1), + seq_lens_this_time, ) - batch_indices = paddle.arange(real_bsz, dtype="int32") - batch_id_per_token = paddle.repeat_interleave(batch_indices, real_seq_lens) - for i in range(logits.shape[0]): - if temperature[batch_id_per_token[i]] > 0 and temperature[batch_id_per_token[i]] != 1.0: - logits[i] = logits[i].scale_(1 / temperature[batch_id_per_token[i]]) + for i in range(real_bsz): + if int(real_seq_lens[i]) == 0: + continue + t = temperature[i] + if t > 0 and t != 1.0: + logits[i] = logits[i].scale_(1 / t) - entropy_tensor = get_entropy(logits) - entropy = entropy_tensor.tolist() + entropy_tensor = get_entropy(logits[:real_bsz]) for i in range(real_bsz): - for _ in range(real_seq_lens[i]): - share_inputs["entropy_list"][i].append(entropy.pop(0)) + if int(real_seq_lens[i]) == 0: + continue + share_inputs["entropy_list"][i].append(float(entropy_tensor[i])) if ( share_inputs["stop_flags"][i] and share_inputs["seq_lens_decoder"][i] != 0 @@ -67,10 +75,17 @@ def speculate_calculate_logits_entropy(logits, share_inputs, temperature): # get accepted logits real_bsz = share_inputs["seq_lens_this_time"].shape[0] total_accepted_num = paddle.sum(share_inputs["accept_num"]) + seq_lens_encoder = share_inputs["seq_lens_encoder"][:real_bsz] + seq_lens_this_time = share_inputs["seq_lens_this_time"] + # GPU runner uses 1D [N], HPU/GCU uses 2D [N,1]; flatten to 1D + if seq_lens_encoder.ndim == 2: + seq_lens_encoder = seq_lens_encoder.squeeze(1) + if seq_lens_this_time.ndim == 2: + seq_lens_this_time = seq_lens_this_time.squeeze(1) real_seq_lens = paddle.where( - share_inputs["seq_lens_encoder"][:real_bsz].squeeze(1) != 0, + seq_lens_encoder != 0, paddle.ones([1], dtype="int32"), - share_inputs["seq_lens_this_time"].squeeze(1), + seq_lens_this_time, ) seq_start_idx = paddle.concat([paddle.zeros([1], dtype="int32"), paddle.cumsum(real_seq_lens, dtype="int32")]) repeated_starts = paddle.repeat_interleave(seq_start_idx[:-1], share_inputs["accept_num"][:real_bsz]) @@ -83,16 +98,20 @@ def speculate_calculate_logits_entropy(logits, share_inputs, temperature): for i in range(total_accepted_num): accepted_logits[i] = logits[accepted_idx[i]] - batch_indices = paddle.arange(share_inputs["accept_num"].shape[0], dtype="int32") - batch_id_per_token = paddle.repeat_interleave(batch_indices, share_inputs["accept_num"]) + batch_indices = paddle.arange(real_bsz, dtype="int32") + batch_id_per_token = paddle.repeat_interleave(batch_indices, share_inputs["accept_num"][:real_bsz]) for i in range(accepted_logits.shape[0]): - if temperature[batch_id_per_token[i]] > 0 and temperature[batch_id_per_token[i]] != 1.0: - accepted_logits[i] = accepted_logits[i].scale_(1 / temperature[batch_id_per_token[i]]) + bid = int(batch_id_per_token[i]) + t = temperature[bid] + if t > 0 and t != 1.0: + accepted_logits[i] = accepted_logits[i].scale_(1 / t) entropy_tensor = get_entropy(accepted_logits) entropy = entropy_tensor.tolist() for i in range(real_bsz): + if int(real_seq_lens[i]) == 0: + continue for _ in range(share_inputs["accept_num"][i]): share_inputs["entropy_list"][i].append(entropy.pop(0)) if ( diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 172e61808be..135f4c5b661 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -1241,7 +1241,7 @@ def _dummy_prefill_inputs(self, input_length_list: List[int], max_dec_len_list: self.share_inputs["block_tables"][idx : idx + 1, :block_num] = np.arange( idx * block_num, (idx + 1) * block_num, 1 ) - self.share_inputs["seq_lens_this_time"] = self.share_inputs["seq_lens_this_time_buffer"] + self.share_inputs["seq_lens_this_time"] = self.share_inputs["seq_lens_this_time_buffer"][:batch_size] def _prepare_inputs(self, cached_token_num=-1, cached_real_bsz=-1, is_dummy_or_profile_run=False) -> None: """Prepare the model inputs"""