From 6b2f806181ab9fcf97f9c73cedfa148a8a3ee4fe Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Wed, 20 May 2026 20:07:51 +0800 Subject: [PATCH 01/10] first commit --- fastdeploy/entrypoints/openai/protocol.py | 22 ++ .../tool_parsers/abstract_tool_parser.py | 62 +++++ fastdeploy/input/base_processor.py | 72 +++++- .../tool_parsers/test_abstract_tool_parser.py | 99 ++++++++ tests/input/test_text_processor.py | 214 ++++++++++++++++++ 5 files changed, 465 insertions(+), 4 deletions(-) create mode 100644 tests/entrypoints/openai/tool_parsers/test_abstract_tool_parser.py diff --git a/fastdeploy/entrypoints/openai/protocol.py b/fastdeploy/entrypoints/openai/protocol.py index 82cdd26d92d..c25ade1a38a 100644 --- a/fastdeploy/entrypoints/openai/protocol.py +++ b/fastdeploy/entrypoints/openai/protocol.py @@ -242,6 +242,22 @@ class ChatCompletionToolsParam(BaseModel): function: FunctionDefinition +class ChatCompletionNamedFunction(BaseModel): + """Named function for ``tool_choice`` when forcing a specific tool.""" + + name: str + + +class ChatCompletionNamedToolChoiceParam(BaseModel): + """OpenAI-compatible named tool choice — forces the model to call a + specific tool by name. Used as one of the values of + :attr:`ChatCompletionRequest.tool_choice`. + """ + + function: ChatCompletionNamedFunction + type: Literal["function"] = "function" + + class ChatMessage(BaseModel): """ Chat message. @@ -668,6 +684,12 @@ class ChatCompletionRequest(BaseModel): # https://platform.openai.com/docs/api-reference/chat/create messages: Union[List[Any], List[int]] tools: Optional[List[ChatCompletionToolsParam]] = None + tool_choice: Optional[ + Union[ + Literal["none", "auto", "required"], + ChatCompletionNamedToolChoiceParam, + ] + ] = "none" model: Optional[str] = "default" frequency_penalty: Optional[float] = Field(None, le=2, ge=-2) logprobs: Optional[bool] = False diff --git a/fastdeploy/entrypoints/openai/tool_parsers/abstract_tool_parser.py b/fastdeploy/entrypoints/openai/tool_parsers/abstract_tool_parser.py index 906483f445a..641f56dc82b 100644 --- a/fastdeploy/entrypoints/openai/tool_parsers/abstract_tool_parser.py +++ b/fastdeploy/entrypoints/openai/tool_parsers/abstract_tool_parser.py @@ -34,6 +34,14 @@ class ToolParser: derived classes. """ + # Subclasses should override these with the literal tool-call sentinel + # tokens they recognize (e.g. ``""`` / ``""``). + # Used by :meth:`detect_tool_prefix` to support ``tool_choice=required`` + # style prompt-prefix injection. Empty defaults make the detection a no-op + # for parsers that have not opted in. + tool_call_start_token: str = "" + tool_call_end_token: str = "" + def __init__(self, tokenizer): self.prev_tool_call_arr: list[dict] = [] # the index of the tool call that is currently being parsed @@ -43,6 +51,21 @@ def __init__(self, tokenizer): self.model_tokenizer = tokenizer + # Per-request tool-prefix state, populated by the serving layer when + # ``tool_choice=required`` (or similar) causes a tool-call prefix to be + # appended to the rendered prompt by the chat template. The parser + # itself does not compute these — the serving layer calls + # :meth:`detect_tool_prefix` and stashes the result here. + self._tool_prefix: str = "" + # Idempotency flag: the serving layer may invoke its preparation hook + # once per streaming chunk, but the prefix only needs to be computed + # once per request. Set to ``True`` after the first computation. + self._tool_prefix_computed: bool = False + # Whether the prefix has already been spliced into ``delta_text`` for + # the streaming path. Only the first streaming call needs the splice; + # subsequent calls keep ``delta_text`` untouched. + self._tool_prefix_injected_to_delta: bool = False + @cached_property def vocab(self) -> dict[str, int]: # NOTE: Only PreTrainedTokenizerFast is guaranteed to have .vocab @@ -55,6 +78,45 @@ def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionReques """ return request + def detect_tool_prefix(self, prompt: str) -> str: + """Detect a tool-call prefix that the chat template injected at the tail + of the rendered prompt to force tool output (``tool_choice=required``). + + The check is generic: find the **last** occurrence of + :attr:`tool_call_start_token` in ``prompt`` and, if it is **not** closed + by a subsequent :attr:`tool_call_end_token`, treat the substring from + that position to the end of the prompt as the injected prefix. The + injected prefix must reach the very end of the prompt (modulo trailing + whitespace) — anything else is treated as historical / unrelated and + we conservatively return an empty string. + + Returns ``""`` for parsers that have not declared their sentinel tokens + or for prompts where no such prefix is detected. + + Subclasses with non-paired tag formats (e.g. a single sentinel without + a closing counterpart) may override this method. + """ + start = self.tool_call_start_token + if not start or not prompt: + return "" + + last_start = prompt.rfind(start) + if last_start == -1: + return "" + + end = self.tool_call_end_token + if end and prompt.find(end, last_start + len(start)) != -1: + # The last start token is closed — this is a historical, completed + # tool-call (e.g. from a previous assistant turn), not an injected + # forced prefix. + return "" + + # By construction, ``prompt[last_start:]`` reaches the end of the + # prompt. We treat the whole tail as the injected prefix. Subclasses + # whose chat templates place additional content after the prefix can + # override this method to apply stricter validation. + return prompt[last_start:] + def extract_tool_calls(self, model_output: str, request: ChatCompletionRequest) -> ExtractedToolCallInformation: """ Static method that should be implemented for extracting tool calls from diff --git a/fastdeploy/input/base_processor.py b/fastdeploy/input/base_processor.py index c65e0c42cf4..0f655f60bcd 100644 --- a/fastdeploy/input/base_processor.py +++ b/fastdeploy/input/base_processor.py @@ -57,6 +57,21 @@ _SAMPLING_EPS = 1e-5 +def _is_forced_tool_choice(tool_choice) -> bool: + """Return True iff ``tool_choice`` requires the chat template to inject + a tool-call prefix into the prompt — i.e. ``"required"`` or a named-tool + choice (``{"type": "function", "function": {...}}``). + + By the time this runs, the request has already been dumped to dict form, + so ``tool_choice`` is either a string or a dict. + """ + if isinstance(tool_choice, str): + return tool_choice == "required" + if isinstance(tool_choice, dict): + return tool_choice.get("type") == "function" + return False + + class BaseTextProcessor(ABC): """Abstract base class shared by all text / VL processors. @@ -266,6 +281,32 @@ def process_response_dict(self, response_dict, **kwargs): else: return self.process_response_dict_normal(response_dict, **kwargs) + def _prepare_tool_prefix(self, tool_parser, request): + """Compute and cache on ``tool_parser`` the tool-call prefix that the + chat template may have injected at the tail of the rendered prompt + (e.g. for ``tool_choice=required``). + + The detection is delegated to the parser itself + (:meth:`ToolParser.detect_tool_prefix`) so each parser controls + which sentinel tokens it recognizes. We compute once per parser + instance — for non-streaming a fresh instance is created per request, + for streaming the instance is cached per ``request_id``. + """ + if tool_parser._tool_prefix_computed: + return + tool_parser._tool_prefix_computed = True + tool_parser._tool_prefix = "" + if not request: + return + prompt_str = request.get("prompt_tokens") + if not prompt_str or not isinstance(prompt_str, str): + return + try: + tool_parser._tool_prefix = tool_parser.detect_tool_prefix(prompt_str) or "" + except Exception: + data_processor_logger.exception("detect_tool_prefix failed; falling back to empty prefix") + tool_parser._tool_prefix = "" + def process_response_dict_normal(self, response_dict, **kwargs): """Accumulate tokens and build the full completion text (non-streaming).""" token_ids = response_dict["outputs"]["token_ids"] @@ -300,7 +341,12 @@ def process_response_dict_normal(self, response_dict, **kwargs): if self.tool_parser_obj: tool_parser = self.tool_parser_obj(self.tokenizer) - tool_call_info = tool_parser.extract_tool_calls(full_text, request) + parser_input = full_text + if _is_forced_tool_choice(request.get("tool_choice")): + self._prepare_tool_prefix(tool_parser, request) + if tool_parser._tool_prefix: + parser_input = tool_parser._tool_prefix + full_text + tool_call_info = tool_parser.extract_tool_calls(parser_input, request) if tool_call_info.tools_called: response_dict["outputs"]["tool_calls"] = tool_call_info.tool_calls @@ -354,10 +400,28 @@ def process_response_dict_streaming(self, response_dict, **kwargs): if req_id not in self.tool_parser_dict: self.tool_parser_dict[req_id] = self.tool_parser_obj(self.tokenizer) tool_parser = self.tool_parser_dict[req_id] + stream_previous = previous_texts + stream_current = previous_texts + delta_text + stream_delta = delta_text + if _is_forced_tool_choice(request.get("tool_choice")): + self._prepare_tool_prefix(tool_parser, request) + prefix = tool_parser._tool_prefix + # When the chat template injected a forced tool-call prefix into + # the prompt, the model output starts mid-tool-call. We splice + # the prefix back into the streaming arguments so the parser + # sees a complete sequence and its existing state machine works + # unchanged. ``delta_text`` only needs the splice on the first + # call so the parser's start-token detection fires once. + if prefix: + stream_previous = prefix + stream_previous + stream_current = prefix + stream_current + if not tool_parser._tool_prefix_injected_to_delta: + stream_delta = prefix + stream_delta + tool_parser._tool_prefix_injected_to_delta = True tool_call_delta_message = tool_parser.extract_tool_calls_streaming( - previous_texts, - previous_texts + delta_text, - delta_text, + stream_previous, + stream_current, + stream_delta, previous_token_ids, previous_token_ids + token_ids, token_ids, diff --git a/tests/entrypoints/openai/tool_parsers/test_abstract_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_abstract_tool_parser.py new file mode 100644 index 00000000000..d8fd3acec0f --- /dev/null +++ b/tests/entrypoints/openai/tool_parsers/test_abstract_tool_parser.py @@ -0,0 +1,99 @@ +""" +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +import unittest + +from fastdeploy.entrypoints.openai.tool_parsers.abstract_tool_parser import ToolParser + + +class _DummyTokenizer: + def get_vocab(self): + return {} + + +class _PairedTagParser(ToolParser): + """A concrete parser declaring paired sentinel tokens for testing.""" + + tool_call_start_token = "" + tool_call_end_token = "" + + +class _NoSentinelParser(ToolParser): + """A parser that did not opt in to prefix detection.""" + + +class TestDetectToolPrefix(unittest.TestCase): + def setUp(self): + self.tokenizer = _DummyTokenizer() + self.parser = _PairedTagParser(self.tokenizer) + + def test_initial_state(self): + self.assertEqual(self.parser._tool_prefix, "") + self.assertFalse(self.parser._tool_prefix_computed) + self.assertFalse(self.parser._tool_prefix_injected_to_delta) + + def test_empty_prompt_returns_empty(self): + self.assertEqual(self.parser.detect_tool_prefix(""), "") + + def test_no_start_token_returns_empty(self): + self.assertEqual( + self.parser.detect_tool_prefix("user: hello\nassistant: hi"), + "", + ) + + def test_parser_without_sentinel_returns_empty(self): + parser = _NoSentinelParser(self.tokenizer) + self.assertEqual( + parser.detect_tool_prefix("anything here"), + "", + ) + + def test_trailing_start_token_only(self): + prompt = "user: q\n" + self.assertEqual(self.parser.detect_tool_prefix(prompt), "") + + def test_trailing_start_with_invoke_prefix(self): + prompt = "history\n{...}\nuser: next" + self.assertEqual(self.parser.detect_tool_prefix(prompt), "") + + def test_history_closed_plus_new_injected_prefix(self): + prompt = "{a:1}\n{a:1}\n" "{b:2}\n" "assistant: done" + self.assertEqual(self.parser.detect_tool_prefix(prompt), "") + + def test_trailing_whitespace_after_start(self): + prompt = "history\n " + self.assertEqual( + self.parser.detect_tool_prefix(prompt), + " ", + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/input/test_text_processor.py b/tests/input/test_text_processor.py index ebb4c9ff127..cf143aaef0b 100644 --- a/tests/input/test_text_processor.py +++ b/tests/input/test_text_processor.py @@ -753,5 +753,219 @@ def custom_convert(tokens): self.assertEqual(processor.update_bad_words(["combo", "oversize"], []), []) +class IsForcedToolChoiceTest(unittest.TestCase): + """Tests for the module-level ``_is_forced_tool_choice`` helper.""" + + def setUp(self): + from fastdeploy.input import base_processor + + self._is_forced = base_processor._is_forced_tool_choice + + def test_required_string(self): + self.assertTrue(self._is_forced("required")) + + def test_other_strings(self): + self.assertFalse(self._is_forced("auto")) + self.assertFalse(self._is_forced("none")) + self.assertFalse(self._is_forced("")) + + def test_named_function_dict(self): + self.assertTrue(self._is_forced({"type": "function", "function": {"name": "f"}})) + + def test_dict_without_function_type(self): + self.assertFalse(self._is_forced({"type": "other"})) + self.assertFalse(self._is_forced({})) + + def test_none_and_other_types(self): + self.assertFalse(self._is_forced(None)) + self.assertFalse(self._is_forced(123)) + self.assertFalse(self._is_forced(["required"])) + + +class _RecordingToolParser: + """Minimal tool parser that records inputs and exposes the prefix-state + fields the serving layer reads/writes.""" + + def __init__(self, tokenizer, tool_prefix="", detect_raises=False): + self.tokenizer = tokenizer + self._configured_prefix = tool_prefix + self._detect_raises = detect_raises + self._tool_prefix = "" + self._tool_prefix_computed = False + self._tool_prefix_injected_to_delta = False + self.detect_calls = [] + self.extract_calls = [] + self.streaming_calls = [] + + def detect_tool_prefix(self, prompt): + self.detect_calls.append(prompt) + if self._detect_raises: + raise RuntimeError("boom") + return self._configured_prefix if prompt and prompt.endswith(self._configured_prefix) else "" + + def extract_tool_calls(self, model_output, request): + self.extract_calls.append(model_output) + return SimpleNamespace(tools_called=True, tool_calls=["tc"]) + + def extract_tool_calls_streaming( + self, + previous_text, + current_text, + delta_text, + previous_token_ids, + current_token_ids, + delta_token_ids, + request, + ): + self.streaming_calls.append( + { + "previous_text": previous_text, + "current_text": current_text, + "delta_text": delta_text, + } + ) + tool_calls = [ + DeltaToolCall( + index=0, + type="function", + id="x", + function=DeltaFunctionCall(name="t").model_dump(exclude_none=True), + ) + ] + return DeltaMessage(tool_calls=tool_calls, content="c") + + +class ToolPrefixCompensationTest(unittest.TestCase): + """Tests for the ``tool_choice=required`` prefix compensation logic in + ``BaseTextProcessor``.""" + + def setUp(self): + module, cleanup = _import_text_processor() + self.text_processor_module = module + self.addCleanup(cleanup) + self.processor = module.TextProcessor("stub-model") + + def _make_parser_factory(self, parser): + return lambda tokenizer: parser + + def test_prepare_tool_prefix_idempotent(self): + parser = _RecordingToolParser(self.processor.tokenizer) + request = {"prompt_tokens": "history\n"} + + self.processor._prepare_tool_prefix(parser, request) + self.assertTrue(parser._tool_prefix_computed) + self.assertEqual(parser._tool_prefix, "") + self.assertEqual(len(parser.detect_calls), 1) + + # Second call must not invoke detect again. + self.processor._prepare_tool_prefix(parser, request) + self.assertEqual(len(parser.detect_calls), 1) + + def test_prepare_tool_prefix_no_prompt(self): + parser = _RecordingToolParser(self.processor.tokenizer) + self.processor._prepare_tool_prefix(parser, {}) + self.assertTrue(parser._tool_prefix_computed) + self.assertEqual(parser._tool_prefix, "") + self.assertEqual(parser.detect_calls, []) + + def test_prepare_tool_prefix_handles_exception(self): + parser = _RecordingToolParser(self.processor.tokenizer, detect_raises=True) + request = {"prompt_tokens": "history\n"} + self.processor._prepare_tool_prefix(parser, request) + self.assertTrue(parser._tool_prefix_computed) + self.assertEqual(parser._tool_prefix, "") + + def test_normal_path_splices_prefix_when_required(self): + processor = self.processor + parser = _RecordingToolParser(processor.tokenizer, tool_prefix="") + processor.tool_parser_obj = self._make_parser_factory(parser) + + response = { + "request_id": "req-normal", + "finished": True, + "outputs": {"token_ids": [7, processor.tokenizer.eos_token_id]}, + } + request = { + "tool_choice": "required", + "prompt_tokens": "user msg\n", + } + + processor.process_response_dict_normal(response, request=request) + self.assertEqual(len(parser.extract_calls), 1) + # Model output is "7" after decoding token 7; prefix must be prepended. + self.assertTrue(parser.extract_calls[0].startswith("")) + self.assertEqual(response["outputs"]["tool_calls"], ["tc"]) + + def test_normal_path_no_splice_when_not_required(self): + processor = self.processor + parser = _RecordingToolParser(processor.tokenizer, tool_prefix="") + processor.tool_parser_obj = self._make_parser_factory(parser) + + response = { + "request_id": "req-auto", + "finished": True, + "outputs": {"token_ids": [7, processor.tokenizer.eos_token_id]}, + } + request = {"tool_choice": "auto", "prompt_tokens": "user msg\n"} + + processor.process_response_dict_normal(response, request=request) + # detect_tool_prefix must NOT be called for non-forced choices. + self.assertEqual(parser.detect_calls, []) + self.assertFalse(parser.extract_calls[0].startswith("")) + + def test_streaming_path_splices_prefix_only_on_first_delta(self): + processor = self.processor + parser = _RecordingToolParser(processor.tokenizer, tool_prefix="") + processor.tool_parser_obj = self._make_parser_factory(parser) + request = { + "tool_choice": "required", + "prompt_tokens": "user msg\n", + } + + # First chunk + first = { + "finished": False, + "request_id": "stream-req", + "outputs": {"token_ids": [7]}, + } + processor.process_response_dict_streaming(first, request=request) + first_call = parser.streaming_calls[0] + # delta_text decodes to "7"; previous="" current="7" + self.assertEqual(first_call["previous_text"], "") + self.assertEqual(first_call["current_text"], "7") + self.assertEqual(first_call["delta_text"], "7") + self.assertTrue(parser._tool_prefix_injected_to_delta) + + # Second chunk: delta must NOT be re-spliced, but previous/current are. + second = { + "finished": True, + "request_id": "stream-req", + "outputs": {"token_ids": [8, processor.tokenizer.eos_token_id]}, + } + processor.process_response_dict_streaming(second, request=request) + second_call = parser.streaming_calls[1] + self.assertEqual(second_call["previous_text"], "7") + self.assertEqual(second_call["current_text"], "78") + self.assertEqual(second_call["delta_text"], "8") # no extra prefix splice + # detect should only run once across the whole stream. + self.assertEqual(len(parser.detect_calls), 1) + + def test_streaming_path_no_splice_when_no_prefix_detected(self): + processor = self.processor + # Empty configured prefix => detect returns "" even with required. + parser = _RecordingToolParser(processor.tokenizer, tool_prefix="") + processor.tool_parser_obj = self._make_parser_factory(parser) + request = {"tool_choice": "required", "prompt_tokens": "no sentinel"} + + first = { + "finished": False, + "request_id": "stream-noprefix", + "outputs": {"token_ids": [7]}, + } + processor.process_response_dict_streaming(first, request=request) + self.assertEqual(parser.streaming_calls[0]["delta_text"], "7") + self.assertFalse(parser._tool_prefix_injected_to_delta) + + if __name__ == "__main__": unittest.main() From 47a7e23fddcdc0ad84c5e68d465797c7d208a2ec Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Thu, 21 May 2026 16:49:32 +0800 Subject: [PATCH 02/10] fix bug --- .../entrypoints/openai/response_processors.py | 3 +- fastdeploy/entrypoints/openai/serving_chat.py | 2 + fastdeploy/input/base_processor.py | 32 ++++---- tests/input/test_text_processor.py | 82 +++++++++++++------ 4 files changed, 73 insertions(+), 46 deletions(-) diff --git a/fastdeploy/entrypoints/openai/response_processors.py b/fastdeploy/entrypoints/openai/response_processors.py index ffaaf0f4aa5..b0c9e6adcd1 100644 --- a/fastdeploy/entrypoints/openai/response_processors.py +++ b/fastdeploy/entrypoints/openai/response_processors.py @@ -72,7 +72,7 @@ def accumulate_token_ids(self, request_output): else: self._multipart_buffer.append({"decode_type": decode_type, "request_output": request_output}) - async def process_response_chat(self, request_outputs, stream, include_stop_str_in_output, request): + async def process_response_chat(self, request_outputs, stream, include_stop_str_in_output, request, prompt_tokens): """ Process a list of responses into a generator that yields each processed response as it's generated. Args: @@ -101,6 +101,7 @@ async def process_response_chat(self, request_outputs, stream, include_stop_str_ audio_tokens=all_audio_tokens, tts=tts, request=request, + prompt_tokens=prompt_tokens, ) else: response = self.data_processor.process_response_dict( diff --git a/fastdeploy/entrypoints/openai/serving_chat.py b/fastdeploy/entrypoints/openai/serving_chat.py index d6429521f05..25b77220d27 100644 --- a/fastdeploy/entrypoints/openai/serving_chat.py +++ b/fastdeploy/entrypoints/openai/serving_chat.py @@ -317,6 +317,7 @@ async def chat_completion_stream_generator( stream=True, include_stop_str_in_output=include_stop_str_in_output, request=request, + prompt_tokens=prompt_tokens, ) async for res in generator: @@ -650,6 +651,7 @@ async def chat_completion_full_generator( stream=False, include_stop_str_in_output=include_stop_str_in_output, request=request, + prompt_tokens=prompt_tokens, ) async for data in generator: idx = get_choice_index(data["request_id"]) diff --git a/fastdeploy/input/base_processor.py b/fastdeploy/input/base_processor.py index 0f655f60bcd..019635e160e 100644 --- a/fastdeploy/input/base_processor.py +++ b/fastdeploy/input/base_processor.py @@ -60,16 +60,13 @@ def _is_forced_tool_choice(tool_choice) -> bool: """Return True iff ``tool_choice`` requires the chat template to inject a tool-call prefix into the prompt — i.e. ``"required"`` or a named-tool - choice (``{"type": "function", "function": {...}}``). - - By the time this runs, the request has already been dumped to dict form, - so ``tool_choice`` is either a string or a dict. + choice (a ``ChatCompletionNamedToolChoiceParam`` pydantic model). """ if isinstance(tool_choice, str): return tool_choice == "required" - if isinstance(tool_choice, dict): - return tool_choice.get("type") == "function" - return False + # Duck-type the pydantic ``ChatCompletionNamedToolChoiceParam`` via its + # ``type`` attribute to avoid importing the protocol module here. + return getattr(tool_choice, "type", None) == "function" class BaseTextProcessor(ABC): @@ -281,12 +278,14 @@ def process_response_dict(self, response_dict, **kwargs): else: return self.process_response_dict_normal(response_dict, **kwargs) - def _prepare_tool_prefix(self, tool_parser, request): + def _prepare_tool_prefix(self, tool_parser, prompt_tokens): """Compute and cache on ``tool_parser`` the tool-call prefix that the chat template may have injected at the tail of the rendered prompt (e.g. for ``tool_choice=required``). - The detection is delegated to the parser itself + ``prompt_tokens`` is the rendered-prompt string passed in by the + serving layer (see ``response_processors.process_response_chat``). + The detection itself is delegated to the parser (:meth:`ToolParser.detect_tool_prefix`) so each parser controls which sentinel tokens it recognizes. We compute once per parser instance — for non-streaming a fresh instance is created per request, @@ -296,13 +295,10 @@ def _prepare_tool_prefix(self, tool_parser, request): return tool_parser._tool_prefix_computed = True tool_parser._tool_prefix = "" - if not request: - return - prompt_str = request.get("prompt_tokens") - if not prompt_str or not isinstance(prompt_str, str): + if not prompt_tokens or not isinstance(prompt_tokens, str): return try: - tool_parser._tool_prefix = tool_parser.detect_tool_prefix(prompt_str) or "" + tool_parser._tool_prefix = tool_parser.detect_tool_prefix(prompt_tokens) or "" except Exception: data_processor_logger.exception("detect_tool_prefix failed; falling back to empty prefix") tool_parser._tool_prefix = "" @@ -342,8 +338,8 @@ def process_response_dict_normal(self, response_dict, **kwargs): if self.tool_parser_obj: tool_parser = self.tool_parser_obj(self.tokenizer) parser_input = full_text - if _is_forced_tool_choice(request.get("tool_choice")): - self._prepare_tool_prefix(tool_parser, request) + if _is_forced_tool_choice(request.tool_choice): + self._prepare_tool_prefix(tool_parser, kwargs.get("prompt_tokens")) if tool_parser._tool_prefix: parser_input = tool_parser._tool_prefix + full_text tool_call_info = tool_parser.extract_tool_calls(parser_input, request) @@ -403,8 +399,8 @@ def process_response_dict_streaming(self, response_dict, **kwargs): stream_previous = previous_texts stream_current = previous_texts + delta_text stream_delta = delta_text - if _is_forced_tool_choice(request.get("tool_choice")): - self._prepare_tool_prefix(tool_parser, request) + if _is_forced_tool_choice(request.tool_choice): + self._prepare_tool_prefix(tool_parser, kwargs.get("prompt_tokens")) prefix = tool_parser._tool_prefix # When the chat template injected a forced tool-call prefix into # the prompt, the model output starts mid-tool-call. We splice diff --git a/tests/input/test_text_processor.py b/tests/input/test_text_processor.py index cf143aaef0b..e3fd91782f5 100644 --- a/tests/input/test_text_processor.py +++ b/tests/input/test_text_processor.py @@ -769,17 +769,17 @@ def test_other_strings(self): self.assertFalse(self._is_forced("none")) self.assertFalse(self._is_forced("")) - def test_named_function_dict(self): - self.assertTrue(self._is_forced({"type": "function", "function": {"name": "f"}})) + def test_pydantic_named_tool_choice(self): + named = SimpleNamespace(type="function", function=SimpleNamespace(name="f")) + self.assertTrue(self._is_forced(named)) - def test_dict_without_function_type(self): - self.assertFalse(self._is_forced({"type": "other"})) - self.assertFalse(self._is_forced({})) + def test_pydantic_other_type(self): + self.assertFalse(self._is_forced(SimpleNamespace(type="other"))) + self.assertFalse(self._is_forced(SimpleNamespace())) def test_none_and_other_types(self): self.assertFalse(self._is_forced(None)) self.assertFalse(self._is_forced(123)) - self.assertFalse(self._is_forced(["required"])) class _RecordingToolParser: @@ -850,28 +850,32 @@ def _make_parser_factory(self, parser): def test_prepare_tool_prefix_idempotent(self): parser = _RecordingToolParser(self.processor.tokenizer) - request = {"prompt_tokens": "history\n"} + prompt = "history\n" - self.processor._prepare_tool_prefix(parser, request) + self.processor._prepare_tool_prefix(parser, prompt) self.assertTrue(parser._tool_prefix_computed) self.assertEqual(parser._tool_prefix, "") self.assertEqual(len(parser.detect_calls), 1) # Second call must not invoke detect again. - self.processor._prepare_tool_prefix(parser, request) + self.processor._prepare_tool_prefix(parser, prompt) self.assertEqual(len(parser.detect_calls), 1) def test_prepare_tool_prefix_no_prompt(self): parser = _RecordingToolParser(self.processor.tokenizer) - self.processor._prepare_tool_prefix(parser, {}) + self.processor._prepare_tool_prefix(parser, None) self.assertTrue(parser._tool_prefix_computed) self.assertEqual(parser._tool_prefix, "") self.assertEqual(parser.detect_calls, []) + parser2 = _RecordingToolParser(self.processor.tokenizer) + self.processor._prepare_tool_prefix(parser2, "") + self.assertEqual(parser2._tool_prefix, "") + self.assertEqual(parser2.detect_calls, []) + def test_prepare_tool_prefix_handles_exception(self): parser = _RecordingToolParser(self.processor.tokenizer, detect_raises=True) - request = {"prompt_tokens": "history\n"} - self.processor._prepare_tool_prefix(parser, request) + self.processor._prepare_tool_prefix(parser, "history\n") self.assertTrue(parser._tool_prefix_computed) self.assertEqual(parser._tool_prefix, "") @@ -885,12 +889,13 @@ def test_normal_path_splices_prefix_when_required(self): "finished": True, "outputs": {"token_ids": [7, processor.tokenizer.eos_token_id]}, } - request = { - "tool_choice": "required", - "prompt_tokens": "user msg\n", - } + request = SimpleNamespace(tool_choice="required") - processor.process_response_dict_normal(response, request=request) + processor.process_response_dict_normal( + response, + request=request, + prompt_tokens="user msg\n", + ) self.assertEqual(len(parser.extract_calls), 1) # Model output is "7" after decoding token 7; prefix must be prepended. self.assertTrue(parser.extract_calls[0].startswith("")) @@ -906,21 +911,44 @@ def test_normal_path_no_splice_when_not_required(self): "finished": True, "outputs": {"token_ids": [7, processor.tokenizer.eos_token_id]}, } - request = {"tool_choice": "auto", "prompt_tokens": "user msg\n"} + request = SimpleNamespace(tool_choice="auto") - processor.process_response_dict_normal(response, request=request) + processor.process_response_dict_normal( + response, + request=request, + prompt_tokens="user msg\n", + ) # detect_tool_prefix must NOT be called for non-forced choices. self.assertEqual(parser.detect_calls, []) self.assertFalse(parser.extract_calls[0].startswith("")) - def test_streaming_path_splices_prefix_only_on_first_delta(self): + def test_normal_path_named_tool_choice_pydantic(self): + """A pydantic ``ChatCompletionNamedToolChoiceParam`` (duck-typed via + ``type='function'``) must also trigger prefix splicing.""" processor = self.processor parser = _RecordingToolParser(processor.tokenizer, tool_prefix="") processor.tool_parser_obj = self._make_parser_factory(parser) - request = { - "tool_choice": "required", - "prompt_tokens": "user msg\n", + + response = { + "request_id": "req-named", + "finished": True, + "outputs": {"token_ids": [7, processor.tokenizer.eos_token_id]}, } + request = SimpleNamespace(tool_choice=SimpleNamespace(type="function", function=SimpleNamespace(name="f"))) + + processor.process_response_dict_normal( + response, + request=request, + prompt_tokens="user msg\n", + ) + self.assertTrue(parser.extract_calls[0].startswith("")) + + def test_streaming_path_splices_prefix_only_on_first_delta(self): + processor = self.processor + parser = _RecordingToolParser(processor.tokenizer, tool_prefix="") + processor.tool_parser_obj = self._make_parser_factory(parser) + request = SimpleNamespace(tool_choice="required") + prompt_tokens = "user msg\n" # First chunk first = { @@ -928,7 +956,7 @@ def test_streaming_path_splices_prefix_only_on_first_delta(self): "request_id": "stream-req", "outputs": {"token_ids": [7]}, } - processor.process_response_dict_streaming(first, request=request) + processor.process_response_dict_streaming(first, request=request, prompt_tokens=prompt_tokens) first_call = parser.streaming_calls[0] # delta_text decodes to "7"; previous="" current="7" self.assertEqual(first_call["previous_text"], "") @@ -942,7 +970,7 @@ def test_streaming_path_splices_prefix_only_on_first_delta(self): "request_id": "stream-req", "outputs": {"token_ids": [8, processor.tokenizer.eos_token_id]}, } - processor.process_response_dict_streaming(second, request=request) + processor.process_response_dict_streaming(second, request=request, prompt_tokens=prompt_tokens) second_call = parser.streaming_calls[1] self.assertEqual(second_call["previous_text"], "7") self.assertEqual(second_call["current_text"], "78") @@ -955,14 +983,14 @@ def test_streaming_path_no_splice_when_no_prefix_detected(self): # Empty configured prefix => detect returns "" even with required. parser = _RecordingToolParser(processor.tokenizer, tool_prefix="") processor.tool_parser_obj = self._make_parser_factory(parser) - request = {"tool_choice": "required", "prompt_tokens": "no sentinel"} + request = SimpleNamespace(tool_choice="required") first = { "finished": False, "request_id": "stream-noprefix", "outputs": {"token_ids": [7]}, } - processor.process_response_dict_streaming(first, request=request) + processor.process_response_dict_streaming(first, request=request, prompt_tokens="no sentinel") self.assertEqual(parser.streaming_calls[0]["delta_text"], "7") self.assertFalse(parser._tool_prefix_injected_to_delta) From 7041f47790f58388bbd82528e2786b81fab22e5e Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Thu, 21 May 2026 17:53:49 +0800 Subject: [PATCH 03/10] fix unit test --- fastdeploy/input/base_processor.py | 33 +++++++--- tests/input/test_text_processor.py | 97 +++++++++++++++++++++++++----- 2 files changed, 108 insertions(+), 22 deletions(-) diff --git a/fastdeploy/input/base_processor.py b/fastdeploy/input/base_processor.py index 019635e160e..de906cdf552 100644 --- a/fastdeploy/input/base_processor.py +++ b/fastdeploy/input/base_processor.py @@ -57,16 +57,33 @@ _SAMPLING_EPS = 1e-5 -def _is_forced_tool_choice(tool_choice) -> bool: - """Return True iff ``tool_choice`` requires the chat template to inject - a tool-call prefix into the prompt — i.e. ``"required"`` or a named-tool - choice (a ``ChatCompletionNamedToolChoiceParam`` pydantic model). +def _is_forced_tool_choice(request) -> bool: + """Return True iff the request asks the chat template to inject a + tool-call prefix into the prompt. Two ways are recognized: + + 1. ``request.tool_choice == "required"`` or a named-tool choice (a + ``ChatCompletionNamedToolChoiceParam`` pydantic model with + ``type == "function"``). + 2. ``request.chat_template_kwargs.options.tool_choice.mode == "force"`` + — used by chat templates that drive forced tool calls through their + own ``options`` dict instead of the OpenAI-style ``tool_choice`` + field. """ + tool_choice = getattr(request, "tool_choice", None) if isinstance(tool_choice, str): - return tool_choice == "required" + if tool_choice == "required": + return True # Duck-type the pydantic ``ChatCompletionNamedToolChoiceParam`` via its # ``type`` attribute to avoid importing the protocol module here. - return getattr(tool_choice, "type", None) == "function" + elif getattr(tool_choice, "type", None) == "function": + return True + + chat_template_kwargs = getattr(request, "chat_template_kwargs", None) or {} + options = chat_template_kwargs.get("options") if isinstance(chat_template_kwargs, dict) else None + inner = options.get("tool_choice") if isinstance(options, dict) else None + if isinstance(inner, dict) and inner.get("mode") == "force": + return True + return False class BaseTextProcessor(ABC): @@ -338,7 +355,7 @@ def process_response_dict_normal(self, response_dict, **kwargs): if self.tool_parser_obj: tool_parser = self.tool_parser_obj(self.tokenizer) parser_input = full_text - if _is_forced_tool_choice(request.tool_choice): + if _is_forced_tool_choice(request): self._prepare_tool_prefix(tool_parser, kwargs.get("prompt_tokens")) if tool_parser._tool_prefix: parser_input = tool_parser._tool_prefix + full_text @@ -399,7 +416,7 @@ def process_response_dict_streaming(self, response_dict, **kwargs): stream_previous = previous_texts stream_current = previous_texts + delta_text stream_delta = delta_text - if _is_forced_tool_choice(request.tool_choice): + if _is_forced_tool_choice(request): self._prepare_tool_prefix(tool_parser, kwargs.get("prompt_tokens")) prefix = tool_parser._tool_prefix # When the chat template injected a forced tool-call prefix into diff --git a/tests/input/test_text_processor.py b/tests/input/test_text_processor.py index e3fd91782f5..faf88ec03f4 100644 --- a/tests/input/test_text_processor.py +++ b/tests/input/test_text_processor.py @@ -130,6 +130,8 @@ def _create_dummy_modules(): info=lambda *args, **kwargs: None, warning=lambda *args, **kwargs: None, debug=lambda *args, **kwargs: None, + exception=lambda *args, **kwargs: None, + error=lambda *args, **kwargs: None, ) CHOICE_SEPARATOR = "::n::" @@ -570,7 +572,9 @@ def test_process_response_with_reasoning_and_tools(self): "outputs": {"token_ids": [1, processor.tokenizer.eos_token_id]}, } - processed = processor.process_response_dict(response, stream=False) + processed = processor.process_response_dict( + response, stream=False, request=SimpleNamespace(tool_choice="none") + ) self.assertEqual(processed["outputs"]["reasoning_content"], "think") self.assertEqual(processed["outputs"]["tool_calls"], ["tool"]) @@ -597,7 +601,9 @@ def test_process_response_streaming_with_reasoning_and_tools(self): "outputs": {"token_ids": [7, processor.tokenizer.eos_token_id]}, } - result = processor.process_response_dict_streaming(response, enable_thinking=True) + result = processor.process_response_dict_streaming( + response, enable_thinking=True, request=SimpleNamespace(tool_choice="none") + ) self.assertEqual(result["outputs"]["completion_tokens"], "7") self.assertEqual(result["outputs"]["text"], "tool-text") self.assertEqual(result["outputs"]["reasoning_content"], "because") @@ -615,7 +621,9 @@ def test_process_response_dict_normal_with_reasoning(self): "outputs": {"token_ids": [7, processor.tokenizer.eos_token_id]}, } - result = processor.process_response_dict_normal(response, enable_thinking=True) + result = processor.process_response_dict_normal( + response, enable_thinking=True, request=SimpleNamespace(tool_choice="none") + ) self.assertEqual(result["outputs"]["completion_tokens"], "7") self.assertEqual(result["outputs"]["reasoning_content"], "because") self.assertEqual(result["outputs"]["reasoning_token_num"], 1) @@ -754,32 +762,68 @@ def custom_convert(tokens): class IsForcedToolChoiceTest(unittest.TestCase): - """Tests for the module-level ``_is_forced_tool_choice`` helper.""" + """Tests for the module-level ``_is_forced_tool_choice`` helper. + + The helper takes a request-like object (something with ``tool_choice`` + and ``chat_template_kwargs`` attributes) and returns whether the chat + template will inject a tool-call prefix. + """ def setUp(self): from fastdeploy.input import base_processor self._is_forced = base_processor._is_forced_tool_choice + def _req(self, *, tool_choice=None, chat_template_kwargs=None): + return SimpleNamespace( + tool_choice=tool_choice, + chat_template_kwargs=chat_template_kwargs, + ) + def test_required_string(self): - self.assertTrue(self._is_forced("required")) + self.assertTrue(self._is_forced(self._req(tool_choice="required"))) def test_other_strings(self): - self.assertFalse(self._is_forced("auto")) - self.assertFalse(self._is_forced("none")) - self.assertFalse(self._is_forced("")) + self.assertFalse(self._is_forced(self._req(tool_choice="auto"))) + self.assertFalse(self._is_forced(self._req(tool_choice="none"))) + self.assertFalse(self._is_forced(self._req(tool_choice=""))) def test_pydantic_named_tool_choice(self): named = SimpleNamespace(type="function", function=SimpleNamespace(name="f")) - self.assertTrue(self._is_forced(named)) + self.assertTrue(self._is_forced(self._req(tool_choice=named))) def test_pydantic_other_type(self): - self.assertFalse(self._is_forced(SimpleNamespace(type="other"))) - self.assertFalse(self._is_forced(SimpleNamespace())) + self.assertFalse(self._is_forced(self._req(tool_choice=SimpleNamespace(type="other")))) + self.assertFalse(self._is_forced(self._req(tool_choice=SimpleNamespace()))) - def test_none_and_other_types(self): - self.assertFalse(self._is_forced(None)) - self.assertFalse(self._is_forced(123)) + def test_no_tool_choice_no_options(self): + self.assertFalse(self._is_forced(self._req())) + + def test_chat_template_options_force_mode(self): + kwargs = { + "options": { + "tool_choice": {"mode": "force", "name": "get_current_weather"}, + } + } + self.assertTrue(self._is_forced(self._req(chat_template_kwargs=kwargs))) + + def test_chat_template_options_non_force_mode(self): + kwargs = {"options": {"tool_choice": {"mode": "auto"}}} + self.assertFalse(self._is_forced(self._req(chat_template_kwargs=kwargs))) + + def test_chat_template_options_missing_tool_choice(self): + self.assertFalse(self._is_forced(self._req(chat_template_kwargs={"options": {}}))) + self.assertFalse(self._is_forced(self._req(chat_template_kwargs={}))) + + def test_chat_template_options_malformed(self): + # Non-dict options/inner must be tolerated (no crash, returns False). + self.assertFalse(self._is_forced(self._req(chat_template_kwargs={"options": "x"}))) + self.assertFalse(self._is_forced(self._req(chat_template_kwargs={"options": {"tool_choice": "x"}}))) + + def test_tool_choice_takes_priority_over_options(self): + kwargs = {"options": {"tool_choice": {"mode": "auto"}}} + # Even with non-force mode in options, an explicit "required" wins. + self.assertTrue(self._is_forced(self._req(tool_choice="required", chat_template_kwargs=kwargs))) class _RecordingToolParser: @@ -943,6 +987,31 @@ def test_normal_path_named_tool_choice_pydantic(self): ) self.assertTrue(parser.extract_calls[0].startswith("")) + def test_normal_path_chat_template_force_mode(self): + """Forcing through ``chat_template_kwargs.options.tool_choice.mode`` + must also trigger prefix splicing even when ``tool_choice`` is unset + (the default ``"none"``).""" + processor = self.processor + parser = _RecordingToolParser(processor.tokenizer, tool_prefix="") + processor.tool_parser_obj = self._make_parser_factory(parser) + + response = { + "request_id": "req-cti", + "finished": True, + "outputs": {"token_ids": [7, processor.tokenizer.eos_token_id]}, + } + request = SimpleNamespace( + tool_choice="none", + chat_template_kwargs={"options": {"tool_choice": {"mode": "force", "name": "get_current_weather"}}}, + ) + + processor.process_response_dict_normal( + response, + request=request, + prompt_tokens="user msg\n", + ) + self.assertTrue(parser.extract_calls[0].startswith("")) + def test_streaming_path_splices_prefix_only_on_first_delta(self): processor = self.processor parser = _RecordingToolParser(processor.tokenizer, tool_prefix="") From 3017744f0dab25f35a44257805423c8a2566a02e Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Thu, 21 May 2026 19:22:53 +0800 Subject: [PATCH 04/10] fix --- fastdeploy/input/base_processor.py | 17 ++++++++--------- tests/input/test_text_processor.py | 30 +++++++++++++++++++----------- 2 files changed, 27 insertions(+), 20 deletions(-) diff --git a/fastdeploy/input/base_processor.py b/fastdeploy/input/base_processor.py index de906cdf552..1e0640e878f 100644 --- a/fastdeploy/input/base_processor.py +++ b/fastdeploy/input/base_processor.py @@ -61,21 +61,20 @@ def _is_forced_tool_choice(request) -> bool: """Return True iff the request asks the chat template to inject a tool-call prefix into the prompt. Two ways are recognized: - 1. ``request.tool_choice == "required"`` or a named-tool choice (a + 1. ``request.tool_choice`` is a named-tool choice (a ``ChatCompletionNamedToolChoiceParam`` pydantic model with - ``type == "function"``). + ``type == "function"``). The plain ``"required"`` string does NOT + trigger prefix injection in the chat template. 2. ``request.chat_template_kwargs.options.tool_choice.mode == "force"`` — used by chat templates that drive forced tool calls through their own ``options`` dict instead of the OpenAI-style ``tool_choice`` field. """ - tool_choice = getattr(request, "tool_choice", None) - if isinstance(tool_choice, str): - if tool_choice == "required": - return True - # Duck-type the pydantic ``ChatCompletionNamedToolChoiceParam`` via its - # ``type`` attribute to avoid importing the protocol module here. - elif getattr(tool_choice, "type", None) == "function": + tool_choice = request.tool_choice + # Named-tool choices are pydantic ``ChatCompletionNamedToolChoiceParam`` + # objects (``type == "function"``); plain string values such as + # ``"required"`` / ``"auto"`` / ``"none"`` are skipped here. + if not isinstance(tool_choice, str) and getattr(tool_choice, "type", None) == "function": return True chat_template_kwargs = getattr(request, "chat_template_kwargs", None) or {} diff --git a/tests/input/test_text_processor.py b/tests/input/test_text_processor.py index faf88ec03f4..b75f457a7cb 100644 --- a/tests/input/test_text_processor.py +++ b/tests/input/test_text_processor.py @@ -780,10 +780,10 @@ def _req(self, *, tool_choice=None, chat_template_kwargs=None): chat_template_kwargs=chat_template_kwargs, ) - def test_required_string(self): - self.assertTrue(self._is_forced(self._req(tool_choice="required"))) - - def test_other_strings(self): + def test_string_tool_choice_never_forces(self): + # Plain string tool_choice values do NOT cause the chat template to + # inject a tool-call prefix, even when the value is ``"required"``. + self.assertFalse(self._is_forced(self._req(tool_choice="required"))) self.assertFalse(self._is_forced(self._req(tool_choice="auto"))) self.assertFalse(self._is_forced(self._req(tool_choice="none"))) self.assertFalse(self._is_forced(self._req(tool_choice=""))) @@ -821,9 +821,10 @@ def test_chat_template_options_malformed(self): self.assertFalse(self._is_forced(self._req(chat_template_kwargs={"options": {"tool_choice": "x"}}))) def test_tool_choice_takes_priority_over_options(self): - kwargs = {"options": {"tool_choice": {"mode": "auto"}}} - # Even with non-force mode in options, an explicit "required" wins. - self.assertTrue(self._is_forced(self._req(tool_choice="required", chat_template_kwargs=kwargs))) + kwargs = {"options": {"tool_choice": {"mode": "force"}}} + # Named-tool pydantic choice combined with options.force still forces. + named = SimpleNamespace(type="function", function=SimpleNamespace(name="f")) + self.assertTrue(self._is_forced(self._req(tool_choice=named, chat_template_kwargs=kwargs))) class _RecordingToolParser: @@ -933,7 +934,10 @@ def test_normal_path_splices_prefix_when_required(self): "finished": True, "outputs": {"token_ids": [7, processor.tokenizer.eos_token_id]}, } - request = SimpleNamespace(tool_choice="required") + # Named-tool pydantic choice triggers prefix injection. + request = SimpleNamespace( + tool_choice=SimpleNamespace(type="function", function=SimpleNamespace(name="f")), + ) processor.process_response_dict_normal( response, @@ -1016,7 +1020,9 @@ def test_streaming_path_splices_prefix_only_on_first_delta(self): processor = self.processor parser = _RecordingToolParser(processor.tokenizer, tool_prefix="") processor.tool_parser_obj = self._make_parser_factory(parser) - request = SimpleNamespace(tool_choice="required") + request = SimpleNamespace( + tool_choice=SimpleNamespace(type="function", function=SimpleNamespace(name="f")), + ) prompt_tokens = "user msg\n" # First chunk @@ -1049,10 +1055,12 @@ def test_streaming_path_splices_prefix_only_on_first_delta(self): def test_streaming_path_no_splice_when_no_prefix_detected(self): processor = self.processor - # Empty configured prefix => detect returns "" even with required. + # Empty configured prefix => detect returns "" even when forced. parser = _RecordingToolParser(processor.tokenizer, tool_prefix="") processor.tool_parser_obj = self._make_parser_factory(parser) - request = SimpleNamespace(tool_choice="required") + request = SimpleNamespace( + tool_choice=SimpleNamespace(type="function", function=SimpleNamespace(name="f")), + ) first = { "finished": False, From 2ac3eb99cdf9161fd185a2707272a49761f08d15 Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Thu, 21 May 2026 19:39:35 +0800 Subject: [PATCH 05/10] fix review --- .../tool_parsers/abstract_tool_parser.py | 17 +++----- fastdeploy/input/base_processor.py | 43 +++++++++++++++---- tests/input/test_text_processor.py | 15 +++++++ 3 files changed, 55 insertions(+), 20 deletions(-) diff --git a/fastdeploy/entrypoints/openai/tool_parsers/abstract_tool_parser.py b/fastdeploy/entrypoints/openai/tool_parsers/abstract_tool_parser.py index 641f56dc82b..83d3ab4a924 100644 --- a/fastdeploy/entrypoints/openai/tool_parsers/abstract_tool_parser.py +++ b/fastdeploy/entrypoints/openai/tool_parsers/abstract_tool_parser.py @@ -51,19 +51,14 @@ def __init__(self, tokenizer): self.model_tokenizer = tokenizer - # Per-request tool-prefix state, populated by the serving layer when - # ``tool_choice=required`` (or similar) causes a tool-call prefix to be - # appended to the rendered prompt by the chat template. The parser - # itself does not compute these — the serving layer calls - # :meth:`detect_tool_prefix` and stashes the result here. + # Per-request tool-prefix state populated by the serving layer when + # the chat template injects a forced tool-call prefix into the prompt. self._tool_prefix: str = "" - # Idempotency flag: the serving layer may invoke its preparation hook - # once per streaming chunk, but the prefix only needs to be computed - # once per request. Set to ``True`` after the first computation. + self._tool_prefix_token_ids: list[int] = [] + # Set after the prefix is computed once for this request. self._tool_prefix_computed: bool = False - # Whether the prefix has already been spliced into ``delta_text`` for - # the streaming path. Only the first streaming call needs the splice; - # subsequent calls keep ``delta_text`` untouched. + # Set after the prefix has been spliced into the streaming delta + # (only the first chunk needs it). self._tool_prefix_injected_to_delta: bool = False @cached_property diff --git a/fastdeploy/input/base_processor.py b/fastdeploy/input/base_processor.py index 1e0640e878f..896a27707ce 100644 --- a/fastdeploy/input/base_processor.py +++ b/fastdeploy/input/base_processor.py @@ -311,13 +311,26 @@ def _prepare_tool_prefix(self, tool_parser, prompt_tokens): return tool_parser._tool_prefix_computed = True tool_parser._tool_prefix = "" + tool_parser._tool_prefix_token_ids = [] if not prompt_tokens or not isinstance(prompt_tokens, str): return try: - tool_parser._tool_prefix = tool_parser.detect_tool_prefix(prompt_tokens) or "" + prefix = tool_parser.detect_tool_prefix(prompt_tokens) or "" except Exception: data_processor_logger.exception("detect_tool_prefix failed; falling back to empty prefix") - tool_parser._tool_prefix = "" + return + tool_parser._tool_prefix = prefix + if not prefix: + return + # Encode the prefix into token ids so the streaming path can also + # splice ``previous/current/delta_token_ids`` — some parsers gate on + # ``tool_call_start_token_id in current_token_ids`` rather than on + # text (e.g. ``Ernie45VLThinkingToolParser``). + try: + tool_parser._tool_prefix_token_ids = list(self.tokenizer.encode(prefix, add_special_tokens=False)) + except Exception: + data_processor_logger.exception("encode tool prefix to token ids failed; token-id splice disabled") + tool_parser._tool_prefix_token_ids = [] def process_response_dict_normal(self, response_dict, **kwargs): """Accumulate tokens and build the full completion text (non-streaming).""" @@ -415,28 +428,40 @@ def process_response_dict_streaming(self, response_dict, **kwargs): stream_previous = previous_texts stream_current = previous_texts + delta_text stream_delta = delta_text + stream_previous_token_ids = previous_token_ids + stream_current_token_ids = previous_token_ids + token_ids + stream_delta_token_ids = token_ids if _is_forced_tool_choice(request): self._prepare_tool_prefix(tool_parser, kwargs.get("prompt_tokens")) prefix = tool_parser._tool_prefix + prefix_ids = tool_parser._tool_prefix_token_ids # When the chat template injected a forced tool-call prefix into # the prompt, the model output starts mid-tool-call. We splice - # the prefix back into the streaming arguments so the parser - # sees a complete sequence and its existing state machine works - # unchanged. ``delta_text`` only needs the splice on the first - # call so the parser's start-token detection fires once. + # the prefix back into both the text and token-id streaming + # arguments so parsers that gate on either form (e.g. + # ``Ernie45VLThinkingToolParser`` checks + # ``tool_call_start_token_id in current_token_ids``) see a + # complete sequence and their existing state machines work + # unchanged. The ``delta_*`` forms only need the splice on the + # first call so the parser's start detection fires once. if prefix: stream_previous = prefix + stream_previous stream_current = prefix + stream_current + if prefix_ids: + stream_previous_token_ids = list(prefix_ids) + list(stream_previous_token_ids) + stream_current_token_ids = list(prefix_ids) + list(stream_current_token_ids) if not tool_parser._tool_prefix_injected_to_delta: stream_delta = prefix + stream_delta + if prefix_ids: + stream_delta_token_ids = list(prefix_ids) + list(stream_delta_token_ids) tool_parser._tool_prefix_injected_to_delta = True tool_call_delta_message = tool_parser.extract_tool_calls_streaming( stream_previous, stream_current, stream_delta, - previous_token_ids, - previous_token_ids + token_ids, - token_ids, + stream_previous_token_ids, + stream_current_token_ids, + stream_delta_token_ids, request, ) if tool_call_delta_message: diff --git a/tests/input/test_text_processor.py b/tests/input/test_text_processor.py index b75f457a7cb..d1329c6d01b 100644 --- a/tests/input/test_text_processor.py +++ b/tests/input/test_text_processor.py @@ -867,6 +867,9 @@ def extract_tool_calls_streaming( "previous_text": previous_text, "current_text": current_text, "delta_text": delta_text, + "previous_token_ids": list(previous_token_ids), + "current_token_ids": list(current_token_ids), + "delta_token_ids": list(delta_token_ids), } ) tool_calls = [ @@ -1037,7 +1040,13 @@ def test_streaming_path_splices_prefix_only_on_first_delta(self): self.assertEqual(first_call["previous_text"], "") self.assertEqual(first_call["current_text"], "7") self.assertEqual(first_call["delta_text"], "7") + # token_ids must be spliced too — DummyTokenizer.encode("") -> [11]. + prefix_ids = [11] + self.assertEqual(first_call["previous_token_ids"], prefix_ids) + self.assertEqual(first_call["current_token_ids"], prefix_ids + [7]) + self.assertEqual(first_call["delta_token_ids"], prefix_ids + [7]) self.assertTrue(parser._tool_prefix_injected_to_delta) + self.assertEqual(parser._tool_prefix_token_ids, prefix_ids) # Second chunk: delta must NOT be re-spliced, but previous/current are. second = { @@ -1050,6 +1059,12 @@ def test_streaming_path_splices_prefix_only_on_first_delta(self): self.assertEqual(second_call["previous_text"], "7") self.assertEqual(second_call["current_text"], "78") self.assertEqual(second_call["delta_text"], "8") # no extra prefix splice + self.assertEqual(second_call["previous_token_ids"], prefix_ids + [7]) + self.assertEqual( + second_call["current_token_ids"], + prefix_ids + [7, 8, processor.tokenizer.eos_token_id], + ) + self.assertEqual(second_call["delta_token_ids"], [8, processor.tokenizer.eos_token_id]) # detect should only run once across the whole stream. self.assertEqual(len(parser.detect_calls), 1) From 01eb97c6bc8cd3be7cb4ef0e65e4e75a2ebb5af2 Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Thu, 21 May 2026 20:00:14 +0800 Subject: [PATCH 06/10] fix review --- fastdeploy/entrypoints/openai/response_processors.py | 11 ++++++++++- fastdeploy/input/base_processor.py | 5 ++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/fastdeploy/entrypoints/openai/response_processors.py b/fastdeploy/entrypoints/openai/response_processors.py index b0c9e6adcd1..2cfef290201 100644 --- a/fastdeploy/entrypoints/openai/response_processors.py +++ b/fastdeploy/entrypoints/openai/response_processors.py @@ -72,7 +72,9 @@ def accumulate_token_ids(self, request_output): else: self._multipart_buffer.append({"decode_type": decode_type, "request_output": request_output}) - async def process_response_chat(self, request_outputs, stream, include_stop_str_in_output, request, prompt_tokens): + async def process_response_chat( + self, request_outputs, stream, include_stop_str_in_output, request, prompt_tokens=None + ): """ Process a list of responses into a generator that yields each processed response as it's generated. Args: @@ -111,6 +113,7 @@ async def process_response_chat(self, request_outputs, stream, include_stop_str_ audio_tokens=all_audio_tokens, tts=tts, request=request, + prompt_tokens=prompt_tokens, ) yield response elif decode_type == 2: # audio @@ -129,6 +132,7 @@ async def process_response_chat(self, request_outputs, stream, include_stop_str_ stream=stream, include_stop_str_in_output=include_stop_str_in_output, request=request, + prompt_tokens=prompt_tokens, ) else: response = self.data_processor.process_response_dict( @@ -136,6 +140,7 @@ async def process_response_chat(self, request_outputs, stream, include_stop_str_ stream=stream, include_stop_str_in_output=include_stop_str_in_output, request=request, + prompt_tokens=prompt_tokens, ) yield response elif stream: @@ -169,6 +174,7 @@ async def process_response_chat(self, request_outputs, stream, include_stop_str_ stream=stream, include_stop_str_in_output=include_stop_str_in_output, request=request, + prompt_tokens=prompt_tokens, ) else: self.data_processor.process_response_dict( @@ -176,6 +182,7 @@ async def process_response_chat(self, request_outputs, stream, include_stop_str_ stream=stream, include_stop_str_in_output=include_stop_str_in_output, request=request, + prompt_tokens=prompt_tokens, ) text = {"type": "text", "text": request_output["outputs"]["text"]} request_output["outputs"]["multipart"] = [text] @@ -198,6 +205,7 @@ async def process_response_chat(self, request_outputs, stream, include_stop_str_ stream=False, include_stop_str_in_output=include_stop_str_in_output, request=request, + prompt_tokens=prompt_tokens, ) else: self.data_processor.process_response_dict( @@ -205,6 +213,7 @@ async def process_response_chat(self, request_outputs, stream, include_stop_str_ stream=stream, include_stop_str_in_output=include_stop_str_in_output, request=request, + prompt_tokens=prompt_tokens, ) text = {"type": "text", "text": part["request_output"]["outputs"]["text"]} multipart.append(text) diff --git a/fastdeploy/input/base_processor.py b/fastdeploy/input/base_processor.py index 896a27707ce..7ffc34db97d 100644 --- a/fastdeploy/input/base_processor.py +++ b/fastdeploy/input/base_processor.py @@ -70,7 +70,10 @@ def _is_forced_tool_choice(request) -> bool: own ``options`` dict instead of the OpenAI-style ``tool_choice`` field. """ - tool_choice = request.tool_choice + if request is None: + return False + + tool_choice = getattr(request, "tool_choice", None) # Named-tool choices are pydantic ``ChatCompletionNamedToolChoiceParam`` # objects (``type == "function"``); plain string values such as # ``"required"`` / ``"auto"`` / ``"none"`` are skipped here. From 7c5af98ba8c1e11b8e5020f49d78cfc5994e492c Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Fri, 22 May 2026 11:13:32 +0800 Subject: [PATCH 07/10] fix unit test --- .../tool_parsers/abstract_tool_parser.py | 10 ++- fastdeploy/input/base_processor.py | 85 +++++++++---------- .../entrypoints/openai/test_finish_reason.py | 8 +- .../openai/test_max_streaming_tokens.py | 8 +- tests/input/test_text_processor.py | 14 +-- 5 files changed, 65 insertions(+), 60 deletions(-) diff --git a/fastdeploy/entrypoints/openai/tool_parsers/abstract_tool_parser.py b/fastdeploy/entrypoints/openai/tool_parsers/abstract_tool_parser.py index 83d3ab4a924..c0e1367f086 100644 --- a/fastdeploy/entrypoints/openai/tool_parsers/abstract_tool_parser.py +++ b/fastdeploy/entrypoints/openai/tool_parsers/abstract_tool_parser.py @@ -36,9 +36,10 @@ class ToolParser: # Subclasses should override these with the literal tool-call sentinel # tokens they recognize (e.g. ``""`` / ``""``). - # Used by :meth:`detect_tool_prefix` to support ``tool_choice=required`` - # style prompt-prefix injection. Empty defaults make the detection a no-op - # for parsers that have not opted in. + # Used by :meth:`detect_tool_prefix` to support forced tool-call prompt + # prefix injection (named-tool ``tool_choice`` or + # ``chat_template_kwargs.options.tool_choice.mode == "force"``). Empty + # defaults make the detection a no-op for parsers that have not opted in. tool_call_start_token: str = "" tool_call_end_token: str = "" @@ -75,7 +76,8 @@ def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionReques def detect_tool_prefix(self, prompt: str) -> str: """Detect a tool-call prefix that the chat template injected at the tail - of the rendered prompt to force tool output (``tool_choice=required``). + of the rendered prompt to force tool output (named-tool ``tool_choice`` + or ``chat_template_kwargs.options.tool_choice.mode == "force"``). The check is generic: find the **last** occurrence of :attr:`tool_call_start_token` in ``prompt`` and, if it is **not** closed diff --git a/fastdeploy/input/base_processor.py b/fastdeploy/input/base_processor.py index 7ffc34db97d..903fec68596 100644 --- a/fastdeploy/input/base_processor.py +++ b/fastdeploy/input/base_processor.py @@ -58,17 +58,12 @@ def _is_forced_tool_choice(request) -> bool: - """Return True iff the request asks the chat template to inject a - tool-call prefix into the prompt. Two ways are recognized: - - 1. ``request.tool_choice`` is a named-tool choice (a - ``ChatCompletionNamedToolChoiceParam`` pydantic model with - ``type == "function"``). The plain ``"required"`` string does NOT - trigger prefix injection in the chat template. - 2. ``request.chat_template_kwargs.options.tool_choice.mode == "force"`` - — used by chat templates that drive forced tool calls through their - own ``options`` dict instead of the OpenAI-style ``tool_choice`` - field. + """Return True iff the chat template should inject a forced tool-call + prefix into the prompt. Two recognized triggers: + + 1. ``request.tool_choice`` is a named-tool choice (pydantic model with + ``type == "function"``). Plain ``"required"`` does NOT trigger. + 2. ``request.chat_template_kwargs.options.tool_choice.mode == "force"``. """ if request is None: return False @@ -169,6 +164,28 @@ def text2ids(self, text, max_model_len=None, **kwargs): ) return tokens["input_ids"][0] + def _text_to_token_ids(self, text: str) -> list: + """Encode ``text`` to a ``list[int]``, shared by :meth:`messages2ids` + and :meth:`_prepare_tool_prefix`. + + ``ernie4_5`` tokenizer hangs on long inputs via ``.encode()``, so it + goes through ``tokenize`` + ``convert_tokens_to_ids``. Other tokenizers + use ``.encode()`` and the result is normalized to a plain list. + """ + if self.tokenizer_type == "ernie4_5": + # NOTE: ernie4_5 tokenizer will hang when meet long input when use .encode() + return self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text)) + token_ids = self.tokenizer.encode(text, add_special_tokens=False) + if hasattr(token_ids, "input_ids") or (isinstance(token_ids, dict) and "input_ids" in token_ids): + token_ids = token_ids["input_ids"] + if hasattr(token_ids, "ndim") and token_ids.ndim > 1: + token_ids = token_ids[0] + if hasattr(token_ids, "tolist"): + token_ids = token_ids.tolist() + if not isinstance(token_ids, list): + token_ids = list(token_ids) + return token_ids + def messages2ids(self, request, **kwargs): """Convert a chat-template request into a token-ID list. @@ -190,19 +207,7 @@ def messages2ids(self, request, **kwargs): ) request["prompt_tokens"] = spliced_message req_id = request.get("request_id", None) if isinstance(request, dict) else None - if self.tokenizer_type == "ernie4_5": - # NOTE: ernie4_5 tokenizer will hang when meet long input when use .encode() - token_ids = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(spliced_message)) - else: - token_ids = self.tokenizer.encode(spliced_message, add_special_tokens=False) - if hasattr(token_ids, "input_ids") or (isinstance(token_ids, dict) and "input_ids" in token_ids): - token_ids = token_ids["input_ids"] - if hasattr(token_ids, "ndim") and token_ids.ndim > 1: - token_ids = token_ids[0] - if hasattr(token_ids, "tolist"): - token_ids = token_ids.tolist() - if not isinstance(token_ids, list): - token_ids = list(token_ids) + token_ids = self._text_to_token_ids(spliced_message) log_request( level=1, message="req_id:{req_id}, token_ids: {token_ids}", @@ -264,12 +269,13 @@ def ids2tokens(self, token_id, task_id): self.decode_status[task_id] = [0, 0, [], ""] status = self.decode_status[task_id] previous_texts = status[3] + previous_token_ids = list(status[2]) status[2].extend(token_id) decode_str, prefix_offset, read_offset = self.tokenizer.decode_token(status[2], status[0], status[1]) status[0] = prefix_offset status[1] = read_offset status[3] += decode_str - return decode_str, status[2], previous_texts + return decode_str, previous_token_ids, previous_texts # ------------------------------------------------------------------ # Response processing @@ -298,17 +304,10 @@ def process_response_dict(self, response_dict, **kwargs): return self.process_response_dict_normal(response_dict, **kwargs) def _prepare_tool_prefix(self, tool_parser, prompt_tokens): - """Compute and cache on ``tool_parser`` the tool-call prefix that the - chat template may have injected at the tail of the rendered prompt - (e.g. for ``tool_choice=required``). - - ``prompt_tokens`` is the rendered-prompt string passed in by the - serving layer (see ``response_processors.process_response_chat``). - The detection itself is delegated to the parser - (:meth:`ToolParser.detect_tool_prefix`) so each parser controls - which sentinel tokens it recognizes. We compute once per parser - instance — for non-streaming a fresh instance is created per request, - for streaming the instance is cached per ``request_id``. + """Detect and cache on ``tool_parser`` the tool-call prefix that the + chat template injected at the tail of ``prompt_tokens`` (the rendered + prompt string from the serving layer). Computed once per parser + instance via the parser's :meth:`ToolParser.detect_tool_prefix`. """ if tool_parser._tool_prefix_computed: return @@ -330,7 +329,7 @@ def _prepare_tool_prefix(self, tool_parser, prompt_tokens): # ``tool_call_start_token_id in current_token_ids`` rather than on # text (e.g. ``Ernie45VLThinkingToolParser``). try: - tool_parser._tool_prefix_token_ids = list(self.tokenizer.encode(prefix, add_special_tokens=False)) + tool_parser._tool_prefix_token_ids = self._text_to_token_ids(prefix) except Exception: data_processor_logger.exception("encode tool prefix to token ids failed; token-id splice disabled") tool_parser._tool_prefix_token_ids = [] @@ -438,15 +437,11 @@ def process_response_dict_streaming(self, response_dict, **kwargs): self._prepare_tool_prefix(tool_parser, kwargs.get("prompt_tokens")) prefix = tool_parser._tool_prefix prefix_ids = tool_parser._tool_prefix_token_ids - # When the chat template injected a forced tool-call prefix into - # the prompt, the model output starts mid-tool-call. We splice - # the prefix back into both the text and token-id streaming - # arguments so parsers that gate on either form (e.g. + # Splice the injected prefix back into both text and token-id + # streaming args so parsers that gate on either form (e.g. # ``Ernie45VLThinkingToolParser`` checks - # ``tool_call_start_token_id in current_token_ids``) see a - # complete sequence and their existing state machines work - # unchanged. The ``delta_*`` forms only need the splice on the - # first call so the parser's start detection fires once. + # ``tool_call_start_token_id in current_token_ids``) work + # unchanged. ``delta_*`` only spliced on the first call. if prefix: stream_previous = prefix + stream_previous stream_current = prefix + stream_current diff --git a/tests/entrypoints/openai/test_finish_reason.py b/tests/entrypoints/openai/test_finish_reason.py index 067b80ca0e5..74ce54e21cd 100644 --- a/tests/entrypoints/openai/test_finish_reason.py +++ b/tests/entrypoints/openai/test_finish_reason.py @@ -262,7 +262,9 @@ async def test_chat_full_max_tokens(self, mock_data_logger, mock_processor_class mock_processor_instance = Mock() mock_processor_instance.enable_multimodal_content.return_value = True - async def mock_process_response_chat_async(response, stream, include_stop_str_in_output, request=None): + async def mock_process_response_chat_async( + response, stream, include_stop_str_in_output, request=None, prompt_tokens=None + ): yield response mock_processor_instance.process_response_chat = mock_process_response_chat_async @@ -445,7 +447,9 @@ async def test_chat_stream_max_tokens(self, mock_api_logger, mock_processor_clas mock_processor_instance = Mock() mock_processor_instance.enable_multimodal_content.return_value = False - async def mock_process_response_chat_async(response, stream, include_stop_str_in_output, request=None): + async def mock_process_response_chat_async( + response, stream, include_stop_str_in_output, request=None, prompt_tokens=None + ): if isinstance(response, list): for res in response: yield res diff --git a/tests/entrypoints/openai/test_max_streaming_tokens.py b/tests/entrypoints/openai/test_max_streaming_tokens.py index 63db437cc5d..c2efcdd03a0 100644 --- a/tests/entrypoints/openai/test_max_streaming_tokens.py +++ b/tests/entrypoints/openai/test_max_streaming_tokens.py @@ -222,7 +222,9 @@ async def test_integration_with_chat_stream_generator(self, mock_processor_class mock_processor_instance = Mock() - async def mock_process_response_chat_single(response, stream, include_stop_str_in_output, request=None): + async def mock_process_response_chat_single( + response, stream, include_stop_str_in_output, request=None, prompt_tokens=None + ): yield response mock_processor_instance.process_response_chat = mock_process_response_chat_single @@ -639,7 +641,9 @@ async def test_chat_stream_usage_fields(self, mock_response_processor, api_serve mock_processor_instance = Mock() - async def mock_process_response_chat(response, stream, include_stop_str_in_output, request=None): + async def mock_process_response_chat( + response, stream, include_stop_str_in_output, request=None, prompt_tokens=None + ): delta_msg_mock = Mock() delta_msg_mock.content = response["outputs"]["text"] if response["outputs"]["text"] == "a": diff --git a/tests/input/test_text_processor.py b/tests/input/test_text_processor.py index d1329c6d01b..20661d8b0ab 100644 --- a/tests/input/test_text_processor.py +++ b/tests/input/test_text_processor.py @@ -884,8 +884,9 @@ def extract_tool_calls_streaming( class ToolPrefixCompensationTest(unittest.TestCase): - """Tests for the ``tool_choice=required`` prefix compensation logic in - ``BaseTextProcessor``.""" + """Tests for the forced-tool-call prefix compensation logic in + ``BaseTextProcessor`` (named-tool ``tool_choice`` and + ``chat_template_kwargs.options.tool_choice.mode == "force"``).""" def setUp(self): module, cleanup = _import_text_processor() @@ -1059,12 +1060,11 @@ def test_streaming_path_splices_prefix_only_on_first_delta(self): self.assertEqual(second_call["previous_text"], "7") self.assertEqual(second_call["current_text"], "78") self.assertEqual(second_call["delta_text"], "8") # no extra prefix splice + # ``is_end=True`` causes the eos token to be stripped before ids2tokens, + # so token_ids fed to the parser is just [8]. self.assertEqual(second_call["previous_token_ids"], prefix_ids + [7]) - self.assertEqual( - second_call["current_token_ids"], - prefix_ids + [7, 8, processor.tokenizer.eos_token_id], - ) - self.assertEqual(second_call["delta_token_ids"], [8, processor.tokenizer.eos_token_id]) + self.assertEqual(second_call["current_token_ids"], prefix_ids + [7, 8]) + self.assertEqual(second_call["delta_token_ids"], [8]) # detect should only run once across the whole stream. self.assertEqual(len(parser.detect_calls), 1) From 2d06f3ee5da89d15fc9fc68af9b8a2d570e3c440 Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Fri, 22 May 2026 14:52:31 +0800 Subject: [PATCH 08/10] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E6=9D=A1=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastdeploy/entrypoints/openai/protocol.py | 22 --- .../tool_parsers/abstract_tool_parser.py | 24 +-- fastdeploy/input/base_processor.py | 70 +++----- tests/input/test_text_processor.py | 158 +++--------------- 4 files changed, 48 insertions(+), 226 deletions(-) diff --git a/fastdeploy/entrypoints/openai/protocol.py b/fastdeploy/entrypoints/openai/protocol.py index c25ade1a38a..82cdd26d92d 100644 --- a/fastdeploy/entrypoints/openai/protocol.py +++ b/fastdeploy/entrypoints/openai/protocol.py @@ -242,22 +242,6 @@ class ChatCompletionToolsParam(BaseModel): function: FunctionDefinition -class ChatCompletionNamedFunction(BaseModel): - """Named function for ``tool_choice`` when forcing a specific tool.""" - - name: str - - -class ChatCompletionNamedToolChoiceParam(BaseModel): - """OpenAI-compatible named tool choice — forces the model to call a - specific tool by name. Used as one of the values of - :attr:`ChatCompletionRequest.tool_choice`. - """ - - function: ChatCompletionNamedFunction - type: Literal["function"] = "function" - - class ChatMessage(BaseModel): """ Chat message. @@ -684,12 +668,6 @@ class ChatCompletionRequest(BaseModel): # https://platform.openai.com/docs/api-reference/chat/create messages: Union[List[Any], List[int]] tools: Optional[List[ChatCompletionToolsParam]] = None - tool_choice: Optional[ - Union[ - Literal["none", "auto", "required"], - ChatCompletionNamedToolChoiceParam, - ] - ] = "none" model: Optional[str] = "default" frequency_penalty: Optional[float] = Field(None, le=2, ge=-2) logprobs: Optional[bool] = False diff --git a/fastdeploy/entrypoints/openai/tool_parsers/abstract_tool_parser.py b/fastdeploy/entrypoints/openai/tool_parsers/abstract_tool_parser.py index c0e1367f086..461f702cd1b 100644 --- a/fastdeploy/entrypoints/openai/tool_parsers/abstract_tool_parser.py +++ b/fastdeploy/entrypoints/openai/tool_parsers/abstract_tool_parser.py @@ -75,23 +75,13 @@ def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionReques return request def detect_tool_prefix(self, prompt: str) -> str: - """Detect a tool-call prefix that the chat template injected at the tail - of the rendered prompt to force tool output (named-tool ``tool_choice`` - or ``chat_template_kwargs.options.tool_choice.mode == "force"``). - - The check is generic: find the **last** occurrence of - :attr:`tool_call_start_token` in ``prompt`` and, if it is **not** closed - by a subsequent :attr:`tool_call_end_token`, treat the substring from - that position to the end of the prompt as the injected prefix. The - injected prefix must reach the very end of the prompt (modulo trailing - whitespace) — anything else is treated as historical / unrelated and - we conservatively return an empty string. - - Returns ``""`` for parsers that have not declared their sentinel tokens - or for prompts where no such prefix is detected. - - Subclasses with non-paired tag formats (e.g. a single sentinel without - a closing counterpart) may override this method. + """Detect the tool-call prefix injected at the tail of the rendered + prompt by a forced ``tool_choice``. + + Finds the **last** :attr:`tool_call_start_token` in ``prompt`` that is + not closed by a later :attr:`tool_call_end_token` and reaches the + prompt end (modulo trailing whitespace). Returns ``""`` otherwise. + Subclasses with non-paired tag formats may override. """ start = self.tool_call_start_token if not start or not prompt: diff --git a/fastdeploy/input/base_processor.py b/fastdeploy/input/base_processor.py index 903fec68596..4baad8ee22e 100644 --- a/fastdeploy/input/base_processor.py +++ b/fastdeploy/input/base_processor.py @@ -57,32 +57,6 @@ _SAMPLING_EPS = 1e-5 -def _is_forced_tool_choice(request) -> bool: - """Return True iff the chat template should inject a forced tool-call - prefix into the prompt. Two recognized triggers: - - 1. ``request.tool_choice`` is a named-tool choice (pydantic model with - ``type == "function"``). Plain ``"required"`` does NOT trigger. - 2. ``request.chat_template_kwargs.options.tool_choice.mode == "force"``. - """ - if request is None: - return False - - tool_choice = getattr(request, "tool_choice", None) - # Named-tool choices are pydantic ``ChatCompletionNamedToolChoiceParam`` - # objects (``type == "function"``); plain string values such as - # ``"required"`` / ``"auto"`` / ``"none"`` are skipped here. - if not isinstance(tool_choice, str) and getattr(tool_choice, "type", None) == "function": - return True - - chat_template_kwargs = getattr(request, "chat_template_kwargs", None) or {} - options = chat_template_kwargs.get("options") if isinstance(chat_template_kwargs, dict) else None - inner = options.get("tool_choice") if isinstance(options, dict) else None - if isinstance(inner, dict) and inner.get("mode") == "force": - return True - return False - - class BaseTextProcessor(ABC): """Abstract base class shared by all text / VL processors. @@ -369,10 +343,9 @@ def process_response_dict_normal(self, response_dict, **kwargs): if self.tool_parser_obj: tool_parser = self.tool_parser_obj(self.tokenizer) parser_input = full_text - if _is_forced_tool_choice(request): - self._prepare_tool_prefix(tool_parser, kwargs.get("prompt_tokens")) - if tool_parser._tool_prefix: - parser_input = tool_parser._tool_prefix + full_text + self._prepare_tool_prefix(tool_parser, kwargs.get("prompt_tokens")) + if tool_parser._tool_prefix: + parser_input = tool_parser._tool_prefix + full_text tool_call_info = tool_parser.extract_tool_calls(parser_input, request) if tool_call_info.tools_called: response_dict["outputs"]["tool_calls"] = tool_call_info.tool_calls @@ -433,26 +406,25 @@ def process_response_dict_streaming(self, response_dict, **kwargs): stream_previous_token_ids = previous_token_ids stream_current_token_ids = previous_token_ids + token_ids stream_delta_token_ids = token_ids - if _is_forced_tool_choice(request): - self._prepare_tool_prefix(tool_parser, kwargs.get("prompt_tokens")) - prefix = tool_parser._tool_prefix - prefix_ids = tool_parser._tool_prefix_token_ids - # Splice the injected prefix back into both text and token-id - # streaming args so parsers that gate on either form (e.g. - # ``Ernie45VLThinkingToolParser`` checks - # ``tool_call_start_token_id in current_token_ids``) work - # unchanged. ``delta_*`` only spliced on the first call. - if prefix: - stream_previous = prefix + stream_previous - stream_current = prefix + stream_current + self._prepare_tool_prefix(tool_parser, kwargs.get("prompt_tokens")) + prefix = tool_parser._tool_prefix + prefix_ids = tool_parser._tool_prefix_token_ids + # Splice the injected prefix back into both text and token-id + # streaming args so parsers that gate on either form (e.g. + # ``Ernie45VLThinkingToolParser`` checks + # ``tool_call_start_token_id in current_token_ids``) work + # unchanged. ``delta_*`` only spliced on the first call. + if prefix: + stream_previous = prefix + stream_previous + stream_current = prefix + stream_current + if prefix_ids: + stream_previous_token_ids = list(prefix_ids) + list(stream_previous_token_ids) + stream_current_token_ids = list(prefix_ids) + list(stream_current_token_ids) + if not tool_parser._tool_prefix_injected_to_delta: + stream_delta = prefix + stream_delta if prefix_ids: - stream_previous_token_ids = list(prefix_ids) + list(stream_previous_token_ids) - stream_current_token_ids = list(prefix_ids) + list(stream_current_token_ids) - if not tool_parser._tool_prefix_injected_to_delta: - stream_delta = prefix + stream_delta - if prefix_ids: - stream_delta_token_ids = list(prefix_ids) + list(stream_delta_token_ids) - tool_parser._tool_prefix_injected_to_delta = True + stream_delta_token_ids = list(prefix_ids) + list(stream_delta_token_ids) + tool_parser._tool_prefix_injected_to_delta = True tool_call_delta_message = tool_parser.extract_tool_calls_streaming( stream_previous, stream_current, diff --git a/tests/input/test_text_processor.py b/tests/input/test_text_processor.py index 20661d8b0ab..0efa4e9f7fc 100644 --- a/tests/input/test_text_processor.py +++ b/tests/input/test_text_processor.py @@ -573,7 +573,7 @@ def test_process_response_with_reasoning_and_tools(self): } processed = processor.process_response_dict( - response, stream=False, request=SimpleNamespace(tool_choice="none") + response, stream=False, request=SimpleNamespace(chat_template_kwargs=None) ) self.assertEqual(processed["outputs"]["reasoning_content"], "think") self.assertEqual(processed["outputs"]["tool_calls"], ["tool"]) @@ -602,7 +602,7 @@ def test_process_response_streaming_with_reasoning_and_tools(self): } result = processor.process_response_dict_streaming( - response, enable_thinking=True, request=SimpleNamespace(tool_choice="none") + response, enable_thinking=True, request=SimpleNamespace(chat_template_kwargs=None) ) self.assertEqual(result["outputs"]["completion_tokens"], "7") self.assertEqual(result["outputs"]["text"], "tool-text") @@ -622,7 +622,7 @@ def test_process_response_dict_normal_with_reasoning(self): } result = processor.process_response_dict_normal( - response, enable_thinking=True, request=SimpleNamespace(tool_choice="none") + response, enable_thinking=True, request=SimpleNamespace(chat_template_kwargs=None) ) self.assertEqual(result["outputs"]["completion_tokens"], "7") self.assertEqual(result["outputs"]["reasoning_content"], "because") @@ -761,72 +761,6 @@ def custom_convert(tokens): self.assertEqual(processor.update_bad_words(["combo", "oversize"], []), []) -class IsForcedToolChoiceTest(unittest.TestCase): - """Tests for the module-level ``_is_forced_tool_choice`` helper. - - The helper takes a request-like object (something with ``tool_choice`` - and ``chat_template_kwargs`` attributes) and returns whether the chat - template will inject a tool-call prefix. - """ - - def setUp(self): - from fastdeploy.input import base_processor - - self._is_forced = base_processor._is_forced_tool_choice - - def _req(self, *, tool_choice=None, chat_template_kwargs=None): - return SimpleNamespace( - tool_choice=tool_choice, - chat_template_kwargs=chat_template_kwargs, - ) - - def test_string_tool_choice_never_forces(self): - # Plain string tool_choice values do NOT cause the chat template to - # inject a tool-call prefix, even when the value is ``"required"``. - self.assertFalse(self._is_forced(self._req(tool_choice="required"))) - self.assertFalse(self._is_forced(self._req(tool_choice="auto"))) - self.assertFalse(self._is_forced(self._req(tool_choice="none"))) - self.assertFalse(self._is_forced(self._req(tool_choice=""))) - - def test_pydantic_named_tool_choice(self): - named = SimpleNamespace(type="function", function=SimpleNamespace(name="f")) - self.assertTrue(self._is_forced(self._req(tool_choice=named))) - - def test_pydantic_other_type(self): - self.assertFalse(self._is_forced(self._req(tool_choice=SimpleNamespace(type="other")))) - self.assertFalse(self._is_forced(self._req(tool_choice=SimpleNamespace()))) - - def test_no_tool_choice_no_options(self): - self.assertFalse(self._is_forced(self._req())) - - def test_chat_template_options_force_mode(self): - kwargs = { - "options": { - "tool_choice": {"mode": "force", "name": "get_current_weather"}, - } - } - self.assertTrue(self._is_forced(self._req(chat_template_kwargs=kwargs))) - - def test_chat_template_options_non_force_mode(self): - kwargs = {"options": {"tool_choice": {"mode": "auto"}}} - self.assertFalse(self._is_forced(self._req(chat_template_kwargs=kwargs))) - - def test_chat_template_options_missing_tool_choice(self): - self.assertFalse(self._is_forced(self._req(chat_template_kwargs={"options": {}}))) - self.assertFalse(self._is_forced(self._req(chat_template_kwargs={}))) - - def test_chat_template_options_malformed(self): - # Non-dict options/inner must be tolerated (no crash, returns False). - self.assertFalse(self._is_forced(self._req(chat_template_kwargs={"options": "x"}))) - self.assertFalse(self._is_forced(self._req(chat_template_kwargs={"options": {"tool_choice": "x"}}))) - - def test_tool_choice_takes_priority_over_options(self): - kwargs = {"options": {"tool_choice": {"mode": "force"}}} - # Named-tool pydantic choice combined with options.force still forces. - named = SimpleNamespace(type="function", function=SimpleNamespace(name="f")) - self.assertTrue(self._is_forced(self._req(tool_choice=named, chat_template_kwargs=kwargs))) - - class _RecordingToolParser: """Minimal tool parser that records inputs and exposes the prefix-state fields the serving layer reads/writes.""" @@ -885,8 +819,9 @@ def extract_tool_calls_streaming( class ToolPrefixCompensationTest(unittest.TestCase): """Tests for the forced-tool-call prefix compensation logic in - ``BaseTextProcessor`` (named-tool ``tool_choice`` and - ``chat_template_kwargs.options.tool_choice.mode == "force"``).""" + ``BaseTextProcessor``. Splicing is driven entirely by whether the + rendered prompt ends with an unclosed tool-call start token, not by + request parameter introspection.""" def setUp(self): module, cleanup = _import_text_processor() @@ -928,7 +863,9 @@ def test_prepare_tool_prefix_handles_exception(self): self.assertTrue(parser._tool_prefix_computed) self.assertEqual(parser._tool_prefix, "") - def test_normal_path_splices_prefix_when_required(self): + def test_normal_path_splices_prefix_when_prompt_has_prefix(self): + """Prompt ending with an unclosed tool-call start triggers splicing, + regardless of how the user requested it.""" processor = self.processor parser = _RecordingToolParser(processor.tokenizer, tool_prefix="") processor.tool_parser_obj = self._make_parser_factory(parser) @@ -938,14 +875,9 @@ def test_normal_path_splices_prefix_when_required(self): "finished": True, "outputs": {"token_ids": [7, processor.tokenizer.eos_token_id]}, } - # Named-tool pydantic choice triggers prefix injection. - request = SimpleNamespace( - tool_choice=SimpleNamespace(type="function", function=SimpleNamespace(name="f")), - ) - processor.process_response_dict_normal( response, - request=request, + request=SimpleNamespace(chat_template_kwargs=None), prompt_tokens="user msg\n", ) self.assertEqual(len(parser.extract_calls), 1) @@ -953,7 +885,8 @@ def test_normal_path_splices_prefix_when_required(self): self.assertTrue(parser.extract_calls[0].startswith("")) self.assertEqual(response["outputs"]["tool_calls"], ["tc"]) - def test_normal_path_no_splice_when_not_required(self): + def test_normal_path_no_splice_when_prompt_lacks_prefix(self): + """No prefix in prompt tail => detect returns "" => no splice.""" processor = self.processor parser = _RecordingToolParser(processor.tokenizer, tool_prefix="") processor.tool_parser_obj = self._make_parser_factory(parser) @@ -963,70 +896,20 @@ def test_normal_path_no_splice_when_not_required(self): "finished": True, "outputs": {"token_ids": [7, processor.tokenizer.eos_token_id]}, } - request = SimpleNamespace(tool_choice="auto") - processor.process_response_dict_normal( response, - request=request, - prompt_tokens="user msg\n", + request=SimpleNamespace(chat_template_kwargs=None), + prompt_tokens="user msg without sentinel", ) - # detect_tool_prefix must NOT be called for non-forced choices. - self.assertEqual(parser.detect_calls, []) + # detect_tool_prefix is called, but returns "" => no prefix prepended. + self.assertEqual(len(parser.detect_calls), 1) self.assertFalse(parser.extract_calls[0].startswith("")) - def test_normal_path_named_tool_choice_pydantic(self): - """A pydantic ``ChatCompletionNamedToolChoiceParam`` (duck-typed via - ``type='function'``) must also trigger prefix splicing.""" - processor = self.processor - parser = _RecordingToolParser(processor.tokenizer, tool_prefix="") - processor.tool_parser_obj = self._make_parser_factory(parser) - - response = { - "request_id": "req-named", - "finished": True, - "outputs": {"token_ids": [7, processor.tokenizer.eos_token_id]}, - } - request = SimpleNamespace(tool_choice=SimpleNamespace(type="function", function=SimpleNamespace(name="f"))) - - processor.process_response_dict_normal( - response, - request=request, - prompt_tokens="user msg\n", - ) - self.assertTrue(parser.extract_calls[0].startswith("")) - - def test_normal_path_chat_template_force_mode(self): - """Forcing through ``chat_template_kwargs.options.tool_choice.mode`` - must also trigger prefix splicing even when ``tool_choice`` is unset - (the default ``"none"``).""" - processor = self.processor - parser = _RecordingToolParser(processor.tokenizer, tool_prefix="") - processor.tool_parser_obj = self._make_parser_factory(parser) - - response = { - "request_id": "req-cti", - "finished": True, - "outputs": {"token_ids": [7, processor.tokenizer.eos_token_id]}, - } - request = SimpleNamespace( - tool_choice="none", - chat_template_kwargs={"options": {"tool_choice": {"mode": "force", "name": "get_current_weather"}}}, - ) - - processor.process_response_dict_normal( - response, - request=request, - prompt_tokens="user msg\n", - ) - self.assertTrue(parser.extract_calls[0].startswith("")) - def test_streaming_path_splices_prefix_only_on_first_delta(self): processor = self.processor parser = _RecordingToolParser(processor.tokenizer, tool_prefix="") processor.tool_parser_obj = self._make_parser_factory(parser) - request = SimpleNamespace( - tool_choice=SimpleNamespace(type="function", function=SimpleNamespace(name="f")), - ) + request = SimpleNamespace(chat_template_kwargs=None) prompt_tokens = "user msg\n" # First chunk @@ -1070,12 +953,11 @@ def test_streaming_path_splices_prefix_only_on_first_delta(self): def test_streaming_path_no_splice_when_no_prefix_detected(self): processor = self.processor - # Empty configured prefix => detect returns "" even when forced. + # Empty configured prefix => detect returns "" even when prompt looks + # like a forced rendering. parser = _RecordingToolParser(processor.tokenizer, tool_prefix="") processor.tool_parser_obj = self._make_parser_factory(parser) - request = SimpleNamespace( - tool_choice=SimpleNamespace(type="function", function=SimpleNamespace(name="f")), - ) + request = SimpleNamespace(chat_template_kwargs=None) first = { "finished": False, From 2b12c9a3fbe19a015c160b9cfa2c639d7a75949e Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Fri, 22 May 2026 15:30:55 +0800 Subject: [PATCH 09/10] fix review --- fastdeploy/engine/common_engine.py | 6 +++++- fastdeploy/input/base_processor.py | 19 +++++++++++++++---- tests/engine/test_common_engine.py | 13 +++++++++---- 3 files changed, 29 insertions(+), 9 deletions(-) diff --git a/fastdeploy/engine/common_engine.py b/fastdeploy/engine/common_engine.py index d3f27122dba..494d2248380 100644 --- a/fastdeploy/engine/common_engine.py +++ b/fastdeploy/engine/common_engine.py @@ -1909,7 +1909,11 @@ def _send_error_response(self, request_id, error_msg, error_code: int = 500, wor def _decode_token(self, token_ids, req_id, is_end): delta_text = "" if envs.FD_ENABLE_RETURN_TEXT: - delta_text, cum_tokens, _ = self.data_processor.ids2tokens(token_ids, req_id) + delta_text, previous_token_ids, _ = self.data_processor.ids2tokens(token_ids, req_id) + # Reconstruct the post-extend cumulative list from the pre-delta + # snapshot + this call's input — ``ids2tokens`` only returns the + # snapshot to keep its return values aliasing-free. + cum_tokens = previous_token_ids + list(token_ids) if delta_text != "": prefix_offset = self.data_processor.decode_status[req_id][0] read_offset = self.data_processor.decode_status[req_id][1] diff --git a/fastdeploy/input/base_processor.py b/fastdeploy/input/base_processor.py index 4baad8ee22e..21c30fb19d7 100644 --- a/fastdeploy/input/base_processor.py +++ b/fastdeploy/input/base_processor.py @@ -214,9 +214,16 @@ def ids2tokens(self, token_id, task_id): Returns: (delta_text, previous_token_ids, previous_texts) - Both the HF and the PaddleFormers/ERNIE tokeniser paths return the - same tuple shape. The HF path sets ``previous_token_ids`` to ``[]`` - since it does not expose per-token ids during batch-decode. + ``previous_token_ids`` and ``previous_texts`` are **snapshots of the + accumulated state BEFORE this call's tokens were appended** — + symmetric pre-delta views of what the caller had decoded so far. + Both are owned by the caller (no aliasing of internal state). + + Callers that need the post-extend cumulative list should reconstruct + it locally via ``previous_token_ids + token_id``. + + The HF path returns ``[]`` for ``previous_token_ids`` since it does + not expose per-token ids during batch-decode. """ if envs.FD_USE_HF_TOKENIZER: if task_id not in self.decode_status: @@ -235,7 +242,9 @@ def ids2tokens(self, token_id, task_id): status[2] = decode_str[0] else: new_str = "" - # Return consistent three-tuple; previous_token_ids not available. + # NOTE: HF path historically returns the post-delta full string + # here, inconsistent with the non-HF branch (which returns the + # pre-delta snapshot). Preserved as-is to avoid behavior change. return new_str, [], status[2] else: if task_id not in self.decode_status: @@ -243,6 +252,8 @@ def ids2tokens(self, token_id, task_id): self.decode_status[task_id] = [0, 0, [], ""] status = self.decode_status[task_id] previous_texts = status[3] + # Snapshot BEFORE extend so the returned list is owned by the + # caller and symmetric with ``previous_texts``. previous_token_ids = list(status[2]) status[2].extend(token_id) decode_str, prefix_offset, read_offset = self.tokenizer.decode_token(status[2], status[0], status[1]) diff --git a/tests/engine/test_common_engine.py b/tests/engine/test_common_engine.py index a3487133bcb..3f05797a37a 100644 --- a/tests/engine/test_common_engine.py +++ b/tests/engine/test_common_engine.py @@ -752,7 +752,9 @@ def __init__(self): self.decode_status = {"rid": (0, 2)} def ids2tokens(self, token_ids, req_id): - return "hi", [101, 102], None + # previous_token_ids snapshot is empty (first call); engine + # reconstructs cum = previous + input = [101, 102]. + return "hi", [], None eng.data_processor = DummyProcessor() @@ -782,7 +784,8 @@ def __init__(self): self.decode_status = {"rid": (0, 1)} def ids2tokens(self, token_ids, req_id): - return "", [7], None + # previous snapshot is empty; cum becomes [7]. + return "", [], None eng.data_processor = DummyProcessor() @@ -1975,7 +1978,8 @@ def __init__(self): self.decode_status = {"rid": (0, 2)} def ids2tokens(self, token_ids, req_id): - return "hi", [1, 2], None + # previous snapshot empty; cum = [] + [1, 2] = [1, 2]. + return "hi", [], None eng.data_processor = DummyProcessor() @@ -3453,7 +3457,8 @@ def __init__(self): self.decode_status = {"tok-req": (1, 3)} def ids2tokens(self, token_ids, req_id): - return "hello", [10, 20, 30], None + # previous snapshot empty; cum = [] + [10, 20, 30]. + return "hello", [], None eng.data_processor = DummyProcessor() From 0cafd3e8d1a26433608b23e6093f06ef0925c659 Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Fri, 22 May 2026 17:38:37 +0800 Subject: [PATCH 10/10] fix unit test --- tests/input/test_text_processor.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/input/test_text_processor.py b/tests/input/test_text_processor.py index 0efa4e9f7fc..940fe51ec46 100644 --- a/tests/input/test_text_processor.py +++ b/tests/input/test_text_processor.py @@ -332,6 +332,13 @@ def create_dummy_tool_parser(tokenizer, content="tool-text"): class DummyToolParser: def __init__(self, tokenizer): self.tokenizer = tokenizer + self._tool_prefix = "" + self._tool_prefix_token_ids = [] + self._tool_prefix_computed = False + self._tool_prefix_injected_to_delta = False + + def detect_tool_prefix(self, prompt): + return "" def extract_tool_calls(self, full_text, response_dict): # 模拟工具调用解析,返回固定的工具调用数据用于测试