From e38008c03f781584983ff6ef6c7b3346b2462bc5 Mon Sep 17 00:00:00 2001 From: Taksh Date: Mon, 6 Apr 2026 19:02:11 +0530 Subject: [PATCH 1/2] Fix UnboundLocalError in generate_gate when generator yields nothing Initialize x before the loop to prevent UnboundLocalError if generate_stream_gate yields no items. Fixes #3786 Co-Authored-By: Claude Opus 4.6 (1M context) --- fastchat/serve/model_worker.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fastchat/serve/model_worker.py b/fastchat/serve/model_worker.py index 683a78556..9cacface3 100644 --- a/fastchat/serve/model_worker.py +++ b/fastchat/serve/model_worker.py @@ -144,6 +144,7 @@ def generate_stream_gate(self, params): yield json.dumps(ret).encode() + b"\0" def generate_gate(self, params): + x = b"{}\0" for x in self.generate_stream_gate(params): pass return json.loads(x[:-1].decode()) From 4004defdd4437ac45eb20e8395a99d93b3d77e80 Mon Sep 17 00:00:00 2001 From: Taksh Date: Mon, 6 Apr 2026 19:02:39 +0530 Subject: [PATCH 2/2] Fix batch embedding averaging for batch_size > 1 Compute per-sequence token counts instead of a single scalar across the entire batch. This fixes incorrect embeddings when batch_size > 1. Fixes #3785 Co-Authored-By: Claude Opus 4.6 (1M context) --- fastchat/serve/model_worker.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fastchat/serve/model_worker.py b/fastchat/serve/model_worker.py index 9cacface3..c21b8c36c 100644 --- a/fastchat/serve/model_worker.py +++ b/fastchat/serve/model_worker.py @@ -172,7 +172,7 @@ def __process_embed_chunk(self, input_ids, attention_mask, **model_type_dict): mask = attention_mask.unsqueeze(-1).expand(data.size()).float() masked_embeddings = data * mask sum_embeddings = torch.sum(masked_embeddings, dim=1) - token_num = torch.sum(attention_mask).item() + token_num = attention_mask.sum(dim=1, keepdim=True) return sum_embeddings, token_num @@ -225,7 +225,7 @@ def get_embeddings(self, params): ): embedding = embedding / token_num normalized_embeddings = F.normalize(embedding, p=2, dim=1) - ret["token_num"] = token_num + ret["token_num"] = token_num.sum().item() else: all_embeddings = [] all_token_num = 0 @@ -274,7 +274,7 @@ def get_embeddings(self, params): embedding = torch.sum(all_embeddings_tensor, dim=0) / all_token_num normalized_embeddings = F.normalize(embedding, p=2, dim=1) - ret["token_num"] = all_token_num + ret["token_num"] = all_token_num.sum().item() if base64_encode == "base64": out_embeddings = self.__encode_base64(normalized_embeddings)