From 8c97537fd08642410251b224855f76d4beca43a2 Mon Sep 17 00:00:00 2001
From: songyuxing <songyuxing@baidu.com>
Date: Fri, 22 May 2026 01:08:42 +0800
Subject: [PATCH] Support computing entropy in fd-runner

---
 fastdeploy/model_executor/entropy_utils.py | 53 +++++++++++++++-------
 fastdeploy/worker/gpu_model_runner.py      |  2 +-
 2 files changed, 37 insertions(+), 18 deletions(-)

diff --git a/fastdeploy/model_executor/entropy_utils.py b/fastdeploy/model_executor/entropy_utils.py
index 21d1b3421e9..5bfe47215a9 100644
--- a/fastdeploy/model_executor/entropy_utils.py
+++ b/fastdeploy/model_executor/entropy_utils.py
@@ -34,24 +34,32 @@ def get_entropy(logits):
 
 def calculate_logits_entropy(logits, share_inputs, temperature):
     real_bsz = share_inputs["seq_lens_this_time"].shape[0]
+    seq_lens_encoder = share_inputs["seq_lens_encoder"][:real_bsz]
+    seq_lens_this_time = share_inputs["seq_lens_this_time"]
+    # GPU runner uses 1D [N], HPU/GCU uses 2D [N,1]; flatten to 1D
+    if seq_lens_encoder.ndim == 2:
+        seq_lens_encoder = seq_lens_encoder.squeeze(1)
+    if seq_lens_this_time.ndim == 2:
+        seq_lens_this_time = seq_lens_this_time.squeeze(1)
     real_seq_lens = paddle.where(
-        share_inputs["seq_lens_encoder"][:real_bsz].squeeze(1) != 0,
+        seq_lens_encoder != 0,
         paddle.ones([1], dtype="int32"),
-        share_inputs["seq_lens_this_time"].squeeze(1),
+        seq_lens_this_time,
     )
 
-    batch_indices = paddle.arange(real_bsz, dtype="int32")
-    batch_id_per_token = paddle.repeat_interleave(batch_indices, real_seq_lens)
-    for i in range(logits.shape[0]):
-        if temperature[batch_id_per_token[i]] > 0 and temperature[batch_id_per_token[i]] != 1.0:
-            logits[i] = logits[i].scale_(1 / temperature[batch_id_per_token[i]])
+    for i in range(real_bsz):
+        if int(real_seq_lens[i]) == 0:
+            continue
+        t = temperature[i]
+        if t > 0 and t != 1.0:
+            logits[i] = logits[i].scale_(1 / t)
 
-    entropy_tensor = get_entropy(logits)
-    entropy = entropy_tensor.tolist()
+    entropy_tensor = get_entropy(logits[:real_bsz])
 
     for i in range(real_bsz):
-        for _ in range(real_seq_lens[i]):
-            share_inputs["entropy_list"][i].append(entropy.pop(0))
+        if int(real_seq_lens[i]) == 0:
+            continue
+        share_inputs["entropy_list"][i].append(float(entropy_tensor[i]))
         if (
             share_inputs["stop_flags"][i]
             and share_inputs["seq_lens_decoder"][i] != 0
@@ -67,10 +75,17 @@ def speculate_calculate_logits_entropy(logits, share_inputs, temperature):
     # get accepted logits
     real_bsz = share_inputs["seq_lens_this_time"].shape[0]
     total_accepted_num = paddle.sum(share_inputs["accept_num"])
+    seq_lens_encoder = share_inputs["seq_lens_encoder"][:real_bsz]
+    seq_lens_this_time = share_inputs["seq_lens_this_time"]
+    # GPU runner uses 1D [N], HPU/GCU uses 2D [N,1]; flatten to 1D
+    if seq_lens_encoder.ndim == 2:
+        seq_lens_encoder = seq_lens_encoder.squeeze(1)
+    if seq_lens_this_time.ndim == 2:
+        seq_lens_this_time = seq_lens_this_time.squeeze(1)
     real_seq_lens = paddle.where(
-        share_inputs["seq_lens_encoder"][:real_bsz].squeeze(1) != 0,
+        seq_lens_encoder != 0,
         paddle.ones([1], dtype="int32"),
-        share_inputs["seq_lens_this_time"].squeeze(1),
+        seq_lens_this_time,
     )
     seq_start_idx = paddle.concat([paddle.zeros([1], dtype="int32"), paddle.cumsum(real_seq_lens, dtype="int32")])
     repeated_starts = paddle.repeat_interleave(seq_start_idx[:-1], share_inputs["accept_num"][:real_bsz])
@@ -83,16 +98,20 @@ def speculate_calculate_logits_entropy(logits, share_inputs, temperature):
     for i in range(total_accepted_num):
         accepted_logits[i] = logits[accepted_idx[i]]
 
-    batch_indices = paddle.arange(share_inputs["accept_num"].shape[0], dtype="int32")
-    batch_id_per_token = paddle.repeat_interleave(batch_indices, share_inputs["accept_num"])
+    batch_indices = paddle.arange(real_bsz, dtype="int32")
+    batch_id_per_token = paddle.repeat_interleave(batch_indices, share_inputs["accept_num"][:real_bsz])
     for i in range(accepted_logits.shape[0]):
-        if temperature[batch_id_per_token[i]] > 0 and temperature[batch_id_per_token[i]] != 1.0:
-            accepted_logits[i] = accepted_logits[i].scale_(1 / temperature[batch_id_per_token[i]])
+        bid = int(batch_id_per_token[i])
+        t = temperature[bid]
+        if t > 0 and t != 1.0:
+            accepted_logits[i] = accepted_logits[i].scale_(1 / t)
 
     entropy_tensor = get_entropy(accepted_logits)
     entropy = entropy_tensor.tolist()
 
     for i in range(real_bsz):
+        if int(real_seq_lens[i]) == 0:
+            continue
         for _ in range(share_inputs["accept_num"][i]):
             share_inputs["entropy_list"][i].append(entropy.pop(0))
         if (
diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py
index 172e61808be..135f4c5b661 100644
--- a/fastdeploy/worker/gpu_model_runner.py
+++ b/fastdeploy/worker/gpu_model_runner.py
@@ -1241,7 +1241,7 @@ def _dummy_prefill_inputs(self, input_length_list: List[int], max_dec_len_list:
             self.share_inputs["block_tables"][idx : idx + 1, :block_num] = np.arange(
                 idx * block_num, (idx + 1) * block_num, 1
             )
-        self.share_inputs["seq_lens_this_time"] = self.share_inputs["seq_lens_this_time_buffer"]
+        self.share_inputs["seq_lens_this_time"] = self.share_inputs["seq_lens_this_time_buffer"][:batch_size]
 
     def _prepare_inputs(self, cached_token_num=-1, cached_real_bsz=-1, is_dummy_or_profile_run=False) -> None:
         """Prepare the model inputs"""