From 6b2f806181ab9fcf97f9c73cedfa148a8a3ee4fe Mon Sep 17 00:00:00 2001
From: luukunn <981429396@qq.com>
Date: Wed, 20 May 2026 20:07:51 +0800
Subject: [PATCH 01/10] Add tool_choice field and forced tool-call prefix detection

---
 fastdeploy/entrypoints/openai/protocol.py     |  22 ++
 .../tool_parsers/abstract_tool_parser.py      |  62 +++++
 fastdeploy/input/base_processor.py            |  72 +++++-
 .../tool_parsers/test_abstract_tool_parser.py |  99 ++++++++
 tests/input/test_text_processor.py            | 214 ++++++++++++++++++
 5 files changed, 465 insertions(+), 4 deletions(-)
 create mode 100644 tests/entrypoints/openai/tool_parsers/test_abstract_tool_parser.py

diff --git a/fastdeploy/entrypoints/openai/protocol.py b/fastdeploy/entrypoints/openai/protocol.py
index 82cdd26d92d..c25ade1a38a 100644
--- a/fastdeploy/entrypoints/openai/protocol.py
+++ b/fastdeploy/entrypoints/openai/protocol.py
@@ -242,6 +242,22 @@ class ChatCompletionToolsParam(BaseModel):
     function: FunctionDefinition
 
 
+class ChatCompletionNamedFunction(BaseModel):
+    """Named function for ``tool_choice`` when forcing a specific tool."""
+
+    name: str
+
+
+class ChatCompletionNamedToolChoiceParam(BaseModel):
+    """OpenAI-compatible named tool choice — forces the model to call a
+    specific tool by name. Used as one of the values of
+    :attr:`ChatCompletionRequest.tool_choice`.
+    """
+
+    function: ChatCompletionNamedFunction
+    type: Literal["function"] = "function"
+
+
 class ChatMessage(BaseModel):
     """
     Chat message.
@@ -668,6 +684,12 @@ class ChatCompletionRequest(BaseModel):
     # https://platform.openai.com/docs/api-reference/chat/create
     messages: Union[List[Any], List[int]]
     tools: Optional[List[ChatCompletionToolsParam]] = None
+    tool_choice: Optional[
+        Union[
+            Literal["none", "auto", "required"],
+            ChatCompletionNamedToolChoiceParam,
+        ]
+    ] = "none"
     model: Optional[str] = "default"
     frequency_penalty: Optional[float] = Field(None, le=2, ge=-2)
     logprobs: Optional[bool] = False
diff --git a/fastdeploy/entrypoints/openai/tool_parsers/abstract_tool_parser.py b/fastdeploy/entrypoints/openai/tool_parsers/abstract_tool_parser.py
index 906483f445a..641f56dc82b 100644
--- a/fastdeploy/entrypoints/openai/tool_parsers/abstract_tool_parser.py
+++ b/fastdeploy/entrypoints/openai/tool_parsers/abstract_tool_parser.py
@@ -34,6 +34,14 @@ class ToolParser:
     derived classes.
     """
 
+    # Subclasses should override these with the literal tool-call sentinel
+    # tokens they recognize (e.g. ``"<tool_call>"`` / ``"</tool_call>"``).
+    # Used by :meth:`detect_tool_prefix` to support ``tool_choice=required``
+    # style prompt-prefix injection. Empty defaults make the detection a no-op
+    # for parsers that have not opted in.
+    tool_call_start_token: str = ""
+    tool_call_end_token: str = ""
+
     def __init__(self, tokenizer):
         self.prev_tool_call_arr: list[dict] = []
         # the index of the tool call that is currently being parsed
@@ -43,6 +51,21 @@ def __init__(self, tokenizer):
 
         self.model_tokenizer = tokenizer
 
+        # Per-request tool-prefix state, populated by the serving layer when
+        # ``tool_choice=required`` (or similar) causes a tool-call prefix to be
+        # appended to the rendered prompt by the chat template. The parser
+        # itself does not compute these — the serving layer calls
+        # :meth:`detect_tool_prefix` and stashes the result here.
+        self._tool_prefix: str = ""
+        # Idempotency flag: the serving layer may invoke its preparation hook
+        # once per streaming chunk, but the prefix only needs to be computed
+        # once per request. Set to ``True`` after the first computation.
+        self._tool_prefix_computed: bool = False
+        # Whether the prefix has already been spliced into ``delta_text`` for
+        # the streaming path. Only the first streaming call needs the splice;
+        # subsequent calls keep ``delta_text`` untouched.
+        self._tool_prefix_injected_to_delta: bool = False
+
     @cached_property
     def vocab(self) -> dict[str, int]:
         # NOTE: Only PreTrainedTokenizerFast is guaranteed to have .vocab
@@ -55,6 +78,45 @@ def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionReques
         """
         return request
 
+    def detect_tool_prefix(self, prompt: str) -> str:
+        """Detect a tool-call prefix that the chat template injected at the tail
+        of the rendered prompt to force tool output (``tool_choice=required``).
+
+        The check is generic: find the **last** occurrence of
+        :attr:`tool_call_start_token` in ``prompt`` and, if it is **not** closed
+        by a subsequent :attr:`tool_call_end_token`, treat the substring from
+        that position to the end of the prompt as the injected prefix. The
+        tail is returned verbatim (including any trailing whitespace) and is
+        not validated further; a closed last start token is treated as
+        historical / unrelated and yields an empty string.
+
+        Returns ``""`` for parsers that have not declared their sentinel tokens
+        or for prompts where no such prefix is detected.
+
+        Subclasses with non-paired tag formats (e.g. a single sentinel without
+        a closing counterpart) may override this method.
+        """
+        start = self.tool_call_start_token
+        if not start or not prompt:
+            return ""
+
+        last_start = prompt.rfind(start)
+        if last_start == -1:
+            return ""
+
+        end = self.tool_call_end_token
+        if end and prompt.find(end, last_start + len(start)) != -1:
+            # The last start token is closed — this is a historical, completed
+            # tool-call (e.g. from a previous assistant turn), not an injected
+            # forced prefix.
+            return ""
+
+        # By construction, ``prompt[last_start:]`` reaches the end of the
+        # prompt. We treat the whole tail as the injected prefix. Subclasses
+        # whose chat templates place additional content after the prefix can
+        # override this method to apply stricter validation.
+        return prompt[last_start:]
+
     def extract_tool_calls(self, model_output: str, request: ChatCompletionRequest) -> ExtractedToolCallInformation:
         """
         Static method that should be implemented for extracting tool calls from
diff --git a/fastdeploy/input/base_processor.py b/fastdeploy/input/base_processor.py
index c65e0c42cf4..0f655f60bcd 100644
--- a/fastdeploy/input/base_processor.py
+++ b/fastdeploy/input/base_processor.py
@@ -57,6 +57,21 @@
 _SAMPLING_EPS = 1e-5
 
 
+def _is_forced_tool_choice(tool_choice) -> bool:
+    """Return True iff ``tool_choice`` requires the chat template to inject
+    a tool-call prefix into the prompt — i.e. ``"required"`` or a named-tool
+    choice (``{"type": "function", "function": {...}}``).
+
+    By the time this runs, the request has already been dumped to dict form,
+    so ``tool_choice`` is either a string or a dict.
+    """
+    if isinstance(tool_choice, str):
+        return tool_choice == "required"
+    if isinstance(tool_choice, dict):
+        return tool_choice.get("type") == "function"
+    return False
+
+
 class BaseTextProcessor(ABC):
     """Abstract base class shared by all text / VL processors.
 
@@ -266,6 +281,32 @@ def process_response_dict(self, response_dict, **kwargs):
         else:
             return self.process_response_dict_normal(response_dict, **kwargs)
 
+    def _prepare_tool_prefix(self, tool_parser, request):
+        """Compute and cache on ``tool_parser`` the tool-call prefix that the
+        chat template may have injected at the tail of the rendered prompt
+        (e.g. for ``tool_choice=required``).
+
+        The detection is delegated to the parser itself
+        (:meth:`ToolParser.detect_tool_prefix`) so each parser controls
+        which sentinel tokens it recognizes. We compute once per parser
+        instance — for non-streaming a fresh instance is created per request,
+        for streaming the instance is cached per ``request_id``.
+        """
+        if tool_parser._tool_prefix_computed:
+            return
+        tool_parser._tool_prefix_computed = True
+        tool_parser._tool_prefix = ""
+        if not request:
+            return
+        prompt_str = request.get("prompt_tokens")
+        if not prompt_str or not isinstance(prompt_str, str):
+            return
+        try:
+            tool_parser._tool_prefix = tool_parser.detect_tool_prefix(prompt_str) or ""
+        except Exception:
+            data_processor_logger.exception("detect_tool_prefix failed; falling back to empty prefix")
+            tool_parser._tool_prefix = ""
+
     def process_response_dict_normal(self, response_dict, **kwargs):
         """Accumulate tokens and build the full completion text (non-streaming)."""
         token_ids = response_dict["outputs"]["token_ids"]
@@ -300,7 +341,12 @@ def process_response_dict_normal(self, response_dict, **kwargs):
 
             if self.tool_parser_obj:
                 tool_parser = self.tool_parser_obj(self.tokenizer)
-                tool_call_info = tool_parser.extract_tool_calls(full_text, request)
+                parser_input = full_text
+                if _is_forced_tool_choice(request.get("tool_choice")):
+                    self._prepare_tool_prefix(tool_parser, request)
+                    if tool_parser._tool_prefix:
+                        parser_input = tool_parser._tool_prefix + full_text
+                tool_call_info = tool_parser.extract_tool_calls(parser_input, request)
                 if tool_call_info.tools_called:
                     response_dict["outputs"]["tool_calls"] = tool_call_info.tool_calls
 
@@ -354,10 +400,28 @@ def process_response_dict_streaming(self, response_dict, **kwargs):
             if req_id not in self.tool_parser_dict:
                 self.tool_parser_dict[req_id] = self.tool_parser_obj(self.tokenizer)
             tool_parser = self.tool_parser_dict[req_id]
+            stream_previous = previous_texts
+            stream_current = previous_texts + delta_text
+            stream_delta = delta_text
+            if _is_forced_tool_choice(request.get("tool_choice")):
+                self._prepare_tool_prefix(tool_parser, request)
+                prefix = tool_parser._tool_prefix
+                # When the chat template injected a forced tool-call prefix into
+                # the prompt, the model output starts mid-tool-call. We splice
+                # the prefix back into the streaming arguments so the parser
+                # sees a complete sequence and its existing state machine works
+                # unchanged. ``delta_text`` only needs the splice on the first
+                # call so the parser's start-token detection fires once.
+                if prefix:
+                    stream_previous = prefix + stream_previous
+                    stream_current = prefix + stream_current
+                    if not tool_parser._tool_prefix_injected_to_delta:
+                        stream_delta = prefix + stream_delta
+                        tool_parser._tool_prefix_injected_to_delta = True
             tool_call_delta_message = tool_parser.extract_tool_calls_streaming(
-                previous_texts,
-                previous_texts + delta_text,
-                delta_text,
+                stream_previous,
+                stream_current,
+                stream_delta,
                 previous_token_ids,
                 previous_token_ids + token_ids,
                 token_ids,
diff --git a/tests/entrypoints/openai/tool_parsers/test_abstract_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_abstract_tool_parser.py
new file mode 100644
index 00000000000..d8fd3acec0f
--- /dev/null
+++ b/tests/entrypoints/openai/tool_parsers/test_abstract_tool_parser.py
@@ -0,0 +1,99 @@
+"""
+# Copyright (c) 2025  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+import unittest
+
+from fastdeploy.entrypoints.openai.tool_parsers.abstract_tool_parser import ToolParser
+
+
+class _DummyTokenizer:
+    def get_vocab(self):
+        return {}
+
+
+class _PairedTagParser(ToolParser):
+    """A concrete parser declaring paired sentinel tokens for testing."""
+
+    tool_call_start_token = "<tool_call>"
+    tool_call_end_token = "</tool_call>"
+
+
+class _NoSentinelParser(ToolParser):
+    """A parser that did not opt in to prefix detection."""
+
+
+class TestDetectToolPrefix(unittest.TestCase):
+    def setUp(self):
+        self.tokenizer = _DummyTokenizer()
+        self.parser = _PairedTagParser(self.tokenizer)
+
+    def test_initial_state(self):
+        self.assertEqual(self.parser._tool_prefix, "")
+        self.assertFalse(self.parser._tool_prefix_computed)
+        self.assertFalse(self.parser._tool_prefix_injected_to_delta)
+
+    def test_empty_prompt_returns_empty(self):
+        self.assertEqual(self.parser.detect_tool_prefix(""), "")
+
+    def test_no_start_token_returns_empty(self):
+        self.assertEqual(
+            self.parser.detect_tool_prefix("user: hello\nassistant: hi"),
+            "",
+        )
+
+    def test_parser_without_sentinel_returns_empty(self):
+        parser = _NoSentinelParser(self.tokenizer)
+        self.assertEqual(
+            parser.detect_tool_prefix("anything <tool_call> here"),
+            "",
+        )
+
+    def test_trailing_start_token_only(self):
+        prompt = "user: q\n<tool_call>"
+        self.assertEqual(self.parser.detect_tool_prefix(prompt), "<tool_call>")
+
+    def test_trailing_start_with_invoke_prefix(self):
+        prompt = "history\n<tool_call><invoke name="
+        self.assertEqual(
+            self.parser.detect_tool_prefix(prompt),
+            "<tool_call><invoke name=",
+        )
+
+    def test_history_closed_tool_call_no_injection(self):
+        prompt = "<tool_call>{...}</tool_call>\nuser: next"
+        self.assertEqual(self.parser.detect_tool_prefix(prompt), "")
+
+    def test_history_closed_plus_new_injected_prefix(self):
+        prompt = "<tool_call>{a:1}</tool_call>\n<tool_call><invoke name="
+        self.assertEqual(
+            self.parser.detect_tool_prefix(prompt),
+            "<tool_call><invoke name=",
+        )
+
+    def test_multiple_closed_history_no_injection(self):
+        prompt = "<tool_call>{a:1}</tool_call>\n" "<tool_call>{b:2}</tool_call>\n" "assistant: done"
+        self.assertEqual(self.parser.detect_tool_prefix(prompt), "")
+
+    def test_trailing_whitespace_after_start(self):
+        prompt = "history\n<tool_call>   "
+        self.assertEqual(
+            self.parser.detect_tool_prefix(prompt),
+            "<tool_call>   ",
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/input/test_text_processor.py b/tests/input/test_text_processor.py
index ebb4c9ff127..cf143aaef0b 100644
--- a/tests/input/test_text_processor.py
+++ b/tests/input/test_text_processor.py
@@ -753,5 +753,219 @@ def custom_convert(tokens):
         self.assertEqual(processor.update_bad_words(["combo", "oversize"], []), [])
 
 
+class IsForcedToolChoiceTest(unittest.TestCase):
+    """Tests for the module-level ``_is_forced_tool_choice`` helper."""
+
+    def setUp(self):
+        from fastdeploy.input import base_processor
+
+        self._is_forced = base_processor._is_forced_tool_choice
+
+    def test_required_string(self):
+        self.assertTrue(self._is_forced("required"))
+
+    def test_other_strings(self):
+        self.assertFalse(self._is_forced("auto"))
+        self.assertFalse(self._is_forced("none"))
+        self.assertFalse(self._is_forced(""))
+
+    def test_named_function_dict(self):
+        self.assertTrue(self._is_forced({"type": "function", "function": {"name": "f"}}))
+
+    def test_dict_without_function_type(self):
+        self.assertFalse(self._is_forced({"type": "other"}))
+        self.assertFalse(self._is_forced({}))
+
+    def test_none_and_other_types(self):
+        self.assertFalse(self._is_forced(None))
+        self.assertFalse(self._is_forced(123))
+        self.assertFalse(self._is_forced(["required"]))
+
+
+class _RecordingToolParser:
+    """Minimal tool parser that records inputs and exposes the prefix-state
+    fields the serving layer reads/writes."""
+
+    def __init__(self, tokenizer, tool_prefix="<tool_call>", detect_raises=False):
+        self.tokenizer = tokenizer
+        self._configured_prefix = tool_prefix
+        self._detect_raises = detect_raises
+        self._tool_prefix = ""
+        self._tool_prefix_computed = False
+        self._tool_prefix_injected_to_delta = False
+        self.detect_calls = []
+        self.extract_calls = []
+        self.streaming_calls = []
+
+    def detect_tool_prefix(self, prompt):
+        self.detect_calls.append(prompt)
+        if self._detect_raises:
+            raise RuntimeError("boom")
+        return self._configured_prefix if prompt and prompt.endswith(self._configured_prefix) else ""
+
+    def extract_tool_calls(self, model_output, request):
+        self.extract_calls.append(model_output)
+        return SimpleNamespace(tools_called=True, tool_calls=["tc"])
+
+    def extract_tool_calls_streaming(
+        self,
+        previous_text,
+        current_text,
+        delta_text,
+        previous_token_ids,
+        current_token_ids,
+        delta_token_ids,
+        request,
+    ):
+        self.streaming_calls.append(
+            {
+                "previous_text": previous_text,
+                "current_text": current_text,
+                "delta_text": delta_text,
+            }
+        )
+        tool_calls = [
+            DeltaToolCall(
+                index=0,
+                type="function",
+                id="x",
+                function=DeltaFunctionCall(name="t").model_dump(exclude_none=True),
+            )
+        ]
+        return DeltaMessage(tool_calls=tool_calls, content="c")
+
+
+class ToolPrefixCompensationTest(unittest.TestCase):
+    """Tests for the ``tool_choice=required`` prefix compensation logic in
+    ``BaseTextProcessor``."""
+
+    def setUp(self):
+        module, cleanup = _import_text_processor()
+        self.text_processor_module = module
+        self.addCleanup(cleanup)
+        self.processor = module.TextProcessor("stub-model")
+
+    def _make_parser_factory(self, parser):
+        return lambda tokenizer: parser
+
+    def test_prepare_tool_prefix_idempotent(self):
+        parser = _RecordingToolParser(self.processor.tokenizer)
+        request = {"prompt_tokens": "history\n<tool_call>"}
+
+        self.processor._prepare_tool_prefix(parser, request)
+        self.assertTrue(parser._tool_prefix_computed)
+        self.assertEqual(parser._tool_prefix, "<tool_call>")
+        self.assertEqual(len(parser.detect_calls), 1)
+
+        # Second call must not invoke detect again.
+        self.processor._prepare_tool_prefix(parser, request)
+        self.assertEqual(len(parser.detect_calls), 1)
+
+    def test_prepare_tool_prefix_no_prompt(self):
+        parser = _RecordingToolParser(self.processor.tokenizer)
+        self.processor._prepare_tool_prefix(parser, {})
+        self.assertTrue(parser._tool_prefix_computed)
+        self.assertEqual(parser._tool_prefix, "")
+        self.assertEqual(parser.detect_calls, [])
+
+    def test_prepare_tool_prefix_handles_exception(self):
+        parser = _RecordingToolParser(self.processor.tokenizer, detect_raises=True)
+        request = {"prompt_tokens": "history\n<tool_call>"}
+        self.processor._prepare_tool_prefix(parser, request)
+        self.assertTrue(parser._tool_prefix_computed)
+        self.assertEqual(parser._tool_prefix, "")
+
+    def test_normal_path_splices_prefix_when_required(self):
+        processor = self.processor
+        parser = _RecordingToolParser(processor.tokenizer, tool_prefix="<tool_call>")
+        processor.tool_parser_obj = self._make_parser_factory(parser)
+
+        response = {
+            "request_id": "req-normal",
+            "finished": True,
+            "outputs": {"token_ids": [7, processor.tokenizer.eos_token_id]},
+        }
+        request = {
+            "tool_choice": "required",
+            "prompt_tokens": "user msg\n<tool_call>",
+        }
+
+        processor.process_response_dict_normal(response, request=request)
+        self.assertEqual(len(parser.extract_calls), 1)
+        # Model output is "7" after decoding token 7; prefix must be prepended.
+        self.assertTrue(parser.extract_calls[0].startswith("<tool_call>"))
+        self.assertEqual(response["outputs"]["tool_calls"], ["tc"])
+
+    def test_normal_path_no_splice_when_not_required(self):
+        processor = self.processor
+        parser = _RecordingToolParser(processor.tokenizer, tool_prefix="<tool_call>")
+        processor.tool_parser_obj = self._make_parser_factory(parser)
+
+        response = {
+            "request_id": "req-auto",
+            "finished": True,
+            "outputs": {"token_ids": [7, processor.tokenizer.eos_token_id]},
+        }
+        request = {"tool_choice": "auto", "prompt_tokens": "user msg\n<tool_call>"}
+
+        processor.process_response_dict_normal(response, request=request)
+        # detect_tool_prefix must NOT be called for non-forced choices.
+        self.assertEqual(parser.detect_calls, [])
+        self.assertFalse(parser.extract_calls[0].startswith("<tool_call>"))
+
+    def test_streaming_path_splices_prefix_only_on_first_delta(self):
+        processor = self.processor
+        parser = _RecordingToolParser(processor.tokenizer, tool_prefix="<tool_call>")
+        processor.tool_parser_obj = self._make_parser_factory(parser)
+        request = {
+            "tool_choice": "required",
+            "prompt_tokens": "user msg\n<tool_call>",
+        }
+
+        # First chunk
+        first = {
+            "finished": False,
+            "request_id": "stream-req",
+            "outputs": {"token_ids": [7]},
+        }
+        processor.process_response_dict_streaming(first, request=request)
+        first_call = parser.streaming_calls[0]
+        # delta_text decodes to "7"; previous="" current="7"
+        self.assertEqual(first_call["previous_text"], "<tool_call>")
+        self.assertEqual(first_call["current_text"], "<tool_call>7")
+        self.assertEqual(first_call["delta_text"], "<tool_call>7")
+        self.assertTrue(parser._tool_prefix_injected_to_delta)
+
+        # Second chunk: delta must NOT be re-spliced, but previous/current are.
+        second = {
+            "finished": True,
+            "request_id": "stream-req",
+            "outputs": {"token_ids": [8, processor.tokenizer.eos_token_id]},
+        }
+        processor.process_response_dict_streaming(second, request=request)
+        second_call = parser.streaming_calls[1]
+        self.assertEqual(second_call["previous_text"], "<tool_call>7")
+        self.assertEqual(second_call["current_text"], "<tool_call>78")
+        self.assertEqual(second_call["delta_text"], "8")  # no extra prefix splice
+        # detect should only run once across the whole stream.
+        self.assertEqual(len(parser.detect_calls), 1)
+
+    def test_streaming_path_no_splice_when_no_prefix_detected(self):
+        processor = self.processor
+        # Empty configured prefix => detect returns "" even with required.
+        parser = _RecordingToolParser(processor.tokenizer, tool_prefix="")
+        processor.tool_parser_obj = self._make_parser_factory(parser)
+        request = {"tool_choice": "required", "prompt_tokens": "no sentinel"}
+
+        first = {
+            "finished": False,
+            "request_id": "stream-noprefix",
+            "outputs": {"token_ids": [7]},
+        }
+        processor.process_response_dict_streaming(first, request=request)
+        self.assertEqual(parser.streaming_calls[0]["delta_text"], "7")
+        self.assertFalse(parser._tool_prefix_injected_to_delta)
+
+
 if __name__ == "__main__":
     unittest.main()

From 47a7e23fddcdc0ad84c5e68d465797c7d208a2ec Mon Sep 17 00:00:00 2001
From: luukunn <981429396@qq.com>
Date: Thu, 21 May 2026 16:49:32 +0800
Subject: [PATCH 02/10] Pass prompt_tokens to response processing for tool-prefix detection

---
 .../entrypoints/openai/response_processors.py |  3 +-
 fastdeploy/entrypoints/openai/serving_chat.py |  2 +
 fastdeploy/input/base_processor.py            | 32 ++++----
 tests/input/test_text_processor.py            | 82 +++++++++++++------
 4 files changed, 73 insertions(+), 46 deletions(-)

diff --git a/fastdeploy/entrypoints/openai/response_processors.py b/fastdeploy/entrypoints/openai/response_processors.py
index ffaaf0f4aa5..b0c9e6adcd1 100644
--- a/fastdeploy/entrypoints/openai/response_processors.py
+++ b/fastdeploy/entrypoints/openai/response_processors.py
@@ -72,7 +72,7 @@ def accumulate_token_ids(self, request_output):
             else:
                 self._multipart_buffer.append({"decode_type": decode_type, "request_output": request_output})
 
-    async def process_response_chat(self, request_outputs, stream, include_stop_str_in_output, request):
+    async def process_response_chat(self, request_outputs, stream, include_stop_str_in_output, request, prompt_tokens):
         """
         Process a list of responses into a generator that yields each processed response as it's generated.
         Args:
@@ -101,6 +101,7 @@ async def process_response_chat(self, request_outputs, stream, include_stop_str_
                                 audio_tokens=all_audio_tokens,
                                 tts=tts,
                                 request=request,
+                                prompt_tokens=prompt_tokens,
                             )
                         else:
                             response = self.data_processor.process_response_dict(
diff --git a/fastdeploy/entrypoints/openai/serving_chat.py b/fastdeploy/entrypoints/openai/serving_chat.py
index d6429521f05..25b77220d27 100644
--- a/fastdeploy/entrypoints/openai/serving_chat.py
+++ b/fastdeploy/entrypoints/openai/serving_chat.py
@@ -317,6 +317,7 @@ async def chat_completion_stream_generator(
                     stream=True,
                     include_stop_str_in_output=include_stop_str_in_output,
                     request=request,
+                    prompt_tokens=prompt_tokens,
                 )
 
                 async for res in generator:
@@ -650,6 +651,7 @@ async def chat_completion_full_generator(
                     stream=False,
                     include_stop_str_in_output=include_stop_str_in_output,
                     request=request,
+                    prompt_tokens=prompt_tokens,
                 )
                 async for data in generator:
                     idx = get_choice_index(data["request_id"])
diff --git a/fastdeploy/input/base_processor.py b/fastdeploy/input/base_processor.py
index 0f655f60bcd..019635e160e 100644
--- a/fastdeploy/input/base_processor.py
+++ b/fastdeploy/input/base_processor.py
@@ -60,16 +60,13 @@
 def _is_forced_tool_choice(tool_choice) -> bool:
     """Return True iff ``tool_choice`` requires the chat template to inject
     a tool-call prefix into the prompt — i.e. ``"required"`` or a named-tool
-    choice (``{"type": "function", "function": {...}}``).
-
-    By the time this runs, the request has already been dumped to dict form,
-    so ``tool_choice`` is either a string or a dict.
+    choice (a ``ChatCompletionNamedToolChoiceParam`` pydantic model).
     """
     if isinstance(tool_choice, str):
         return tool_choice == "required"
-    if isinstance(tool_choice, dict):
-        return tool_choice.get("type") == "function"
-    return False
+    # Duck-type the pydantic ``ChatCompletionNamedToolChoiceParam`` via its
+    # ``type`` attribute to avoid importing the protocol module here.
+    return getattr(tool_choice, "type", None) == "function"
 
 
 class BaseTextProcessor(ABC):
@@ -281,12 +278,14 @@ def process_response_dict(self, response_dict, **kwargs):
         else:
             return self.process_response_dict_normal(response_dict, **kwargs)
 
-    def _prepare_tool_prefix(self, tool_parser, request):
+    def _prepare_tool_prefix(self, tool_parser, prompt_tokens):
         """Compute and cache on ``tool_parser`` the tool-call prefix that the
         chat template may have injected at the tail of the rendered prompt
         (e.g. for ``tool_choice=required``).
 
-        The detection is delegated to the parser itself
+        ``prompt_tokens`` is the rendered-prompt string passed in by the
+        serving layer (see ``response_processors.process_response_chat``).
+        The detection itself is delegated to the parser
         (:meth:`ToolParser.detect_tool_prefix`) so each parser controls
         which sentinel tokens it recognizes. We compute once per parser
         instance — for non-streaming a fresh instance is created per request,
@@ -296,13 +295,10 @@ def _prepare_tool_prefix(self, tool_parser, request):
             return
         tool_parser._tool_prefix_computed = True
         tool_parser._tool_prefix = ""
-        if not request:
-            return
-        prompt_str = request.get("prompt_tokens")
-        if not prompt_str or not isinstance(prompt_str, str):
+        if not prompt_tokens or not isinstance(prompt_tokens, str):
             return
         try:
-            tool_parser._tool_prefix = tool_parser.detect_tool_prefix(prompt_str) or ""
+            tool_parser._tool_prefix = tool_parser.detect_tool_prefix(prompt_tokens) or ""
         except Exception:
             data_processor_logger.exception("detect_tool_prefix failed; falling back to empty prefix")
             tool_parser._tool_prefix = ""
@@ -342,8 +338,8 @@ def process_response_dict_normal(self, response_dict, **kwargs):
             if self.tool_parser_obj:
                 tool_parser = self.tool_parser_obj(self.tokenizer)
                 parser_input = full_text
-                if _is_forced_tool_choice(request.get("tool_choice")):
-                    self._prepare_tool_prefix(tool_parser, request)
+                if _is_forced_tool_choice(request.tool_choice):
+                    self._prepare_tool_prefix(tool_parser, kwargs.get("prompt_tokens"))
                     if tool_parser._tool_prefix:
                         parser_input = tool_parser._tool_prefix + full_text
                 tool_call_info = tool_parser.extract_tool_calls(parser_input, request)
@@ -403,8 +399,8 @@ def process_response_dict_streaming(self, response_dict, **kwargs):
             stream_previous = previous_texts
             stream_current = previous_texts + delta_text
             stream_delta = delta_text
-            if _is_forced_tool_choice(request.get("tool_choice")):
-                self._prepare_tool_prefix(tool_parser, request)
+            if _is_forced_tool_choice(request.tool_choice):
+                self._prepare_tool_prefix(tool_parser, kwargs.get("prompt_tokens"))
                 prefix = tool_parser._tool_prefix
                 # When the chat template injected a forced tool-call prefix into
                 # the prompt, the model output starts mid-tool-call. We splice
diff --git a/tests/input/test_text_processor.py b/tests/input/test_text_processor.py
index cf143aaef0b..e3fd91782f5 100644
--- a/tests/input/test_text_processor.py
+++ b/tests/input/test_text_processor.py
@@ -769,17 +769,17 @@ def test_other_strings(self):
         self.assertFalse(self._is_forced("none"))
         self.assertFalse(self._is_forced(""))
 
-    def test_named_function_dict(self):
-        self.assertTrue(self._is_forced({"type": "function", "function": {"name": "f"}}))
+    def test_pydantic_named_tool_choice(self):
+        named = SimpleNamespace(type="function", function=SimpleNamespace(name="f"))
+        self.assertTrue(self._is_forced(named))
 
-    def test_dict_without_function_type(self):
-        self.assertFalse(self._is_forced({"type": "other"}))
-        self.assertFalse(self._is_forced({}))
+    def test_pydantic_other_type(self):
+        self.assertFalse(self._is_forced(SimpleNamespace(type="other")))
+        self.assertFalse(self._is_forced(SimpleNamespace()))
 
     def test_none_and_other_types(self):
         self.assertFalse(self._is_forced(None))
         self.assertFalse(self._is_forced(123))
-        self.assertFalse(self._is_forced(["required"]))
 
 
 class _RecordingToolParser:
@@ -850,28 +850,32 @@ def _make_parser_factory(self, parser):
 
     def test_prepare_tool_prefix_idempotent(self):
         parser = _RecordingToolParser(self.processor.tokenizer)
-        request = {"prompt_tokens": "history\n<tool_call>"}
+        prompt = "history\n<tool_call>"
 
-        self.processor._prepare_tool_prefix(parser, request)
+        self.processor._prepare_tool_prefix(parser, prompt)
         self.assertTrue(parser._tool_prefix_computed)
         self.assertEqual(parser._tool_prefix, "<tool_call>")
         self.assertEqual(len(parser.detect_calls), 1)
 
         # Second call must not invoke detect again.
-        self.processor._prepare_tool_prefix(parser, request)
+        self.processor._prepare_tool_prefix(parser, prompt)
         self.assertEqual(len(parser.detect_calls), 1)
 
     def test_prepare_tool_prefix_no_prompt(self):
         parser = _RecordingToolParser(self.processor.tokenizer)
-        self.processor._prepare_tool_prefix(parser, {})
+        self.processor._prepare_tool_prefix(parser, None)
         self.assertTrue(parser._tool_prefix_computed)
         self.assertEqual(parser._tool_prefix, "")
         self.assertEqual(parser.detect_calls, [])
 
+        parser2 = _RecordingToolParser(self.processor.tokenizer)
+        self.processor._prepare_tool_prefix(parser2, "")
+        self.assertEqual(parser2._tool_prefix, "")
+        self.assertEqual(parser2.detect_calls, [])
+
     def test_prepare_tool_prefix_handles_exception(self):
         parser = _RecordingToolParser(self.processor.tokenizer, detect_raises=True)
-        request = {"prompt_tokens": "history\n<tool_call>"}
-        self.processor._prepare_tool_prefix(parser, request)
+        self.processor._prepare_tool_prefix(parser, "history\n<tool_call>")
         self.assertTrue(parser._tool_prefix_computed)
         self.assertEqual(parser._tool_prefix, "")
 
@@ -885,12 +889,13 @@ def test_normal_path_splices_prefix_when_required(self):
             "finished": True,
             "outputs": {"token_ids": [7, processor.tokenizer.eos_token_id]},
         }
-        request = {
-            "tool_choice": "required",
-            "prompt_tokens": "user msg\n<tool_call>",
-        }
+        request = SimpleNamespace(tool_choice="required")
 
-        processor.process_response_dict_normal(response, request=request)
+        processor.process_response_dict_normal(
+            response,
+            request=request,
+            prompt_tokens="user msg\n<tool_call>",
+        )
         self.assertEqual(len(parser.extract_calls), 1)
         # Model output is "7" after decoding token 7; prefix must be prepended.
         self.assertTrue(parser.extract_calls[0].startswith("<tool_call>"))
@@ -906,21 +911,44 @@ def test_normal_path_no_splice_when_not_required(self):
             "finished": True,
             "outputs": {"token_ids": [7, processor.tokenizer.eos_token_id]},
         }
-        request = {"tool_choice": "auto", "prompt_tokens": "user msg\n<tool_call>"}
+        request = SimpleNamespace(tool_choice="auto")
 
-        processor.process_response_dict_normal(response, request=request)
+        processor.process_response_dict_normal(
+            response,
+            request=request,
+            prompt_tokens="user msg\n<tool_call>",
+        )
         # detect_tool_prefix must NOT be called for non-forced choices.
         self.assertEqual(parser.detect_calls, [])
         self.assertFalse(parser.extract_calls[0].startswith("<tool_call>"))
 
-    def test_streaming_path_splices_prefix_only_on_first_delta(self):
+    def test_normal_path_named_tool_choice_pydantic(self):
+        """A pydantic ``ChatCompletionNamedToolChoiceParam`` (duck-typed via
+        ``type='function'``) must also trigger prefix splicing."""
         processor = self.processor
         parser = _RecordingToolParser(processor.tokenizer, tool_prefix="<tool_call>")
         processor.tool_parser_obj = self._make_parser_factory(parser)
-        request = {
-            "tool_choice": "required",
-            "prompt_tokens": "user msg\n<tool_call>",
+
+        response = {
+            "request_id": "req-named",
+            "finished": True,
+            "outputs": {"token_ids": [7, processor.tokenizer.eos_token_id]},
         }
+        request = SimpleNamespace(tool_choice=SimpleNamespace(type="function", function=SimpleNamespace(name="f")))
+
+        processor.process_response_dict_normal(
+            response,
+            request=request,
+            prompt_tokens="user msg\n<tool_call>",
+        )
+        self.assertTrue(parser.extract_calls[0].startswith("<tool_call>"))
+
+    def test_streaming_path_splices_prefix_only_on_first_delta(self):
+        processor = self.processor
+        parser = _RecordingToolParser(processor.tokenizer, tool_prefix="<tool_call>")
+        processor.tool_parser_obj = self._make_parser_factory(parser)
+        request = SimpleNamespace(tool_choice="required")
+        prompt_tokens = "user msg\n<tool_call>"
 
         # First chunk
         first = {
@@ -928,7 +956,7 @@ def test_streaming_path_splices_prefix_only_on_first_delta(self):
             "request_id": "stream-req",
             "outputs": {"token_ids": [7]},
         }
-        processor.process_response_dict_streaming(first, request=request)
+        processor.process_response_dict_streaming(first, request=request, prompt_tokens=prompt_tokens)
         first_call = parser.streaming_calls[0]
         # delta_text decodes to "7"; previous="" current="7"
         self.assertEqual(first_call["previous_text"], "<tool_call>")
@@ -942,7 +970,7 @@ def test_streaming_path_splices_prefix_only_on_first_delta(self):
             "request_id": "stream-req",
             "outputs": {"token_ids": [8, processor.tokenizer.eos_token_id]},
         }
-        processor.process_response_dict_streaming(second, request=request)
+        processor.process_response_dict_streaming(second, request=request, prompt_tokens=prompt_tokens)
         second_call = parser.streaming_calls[1]
         self.assertEqual(second_call["previous_text"], "<tool_call>7")
         self.assertEqual(second_call["current_text"], "<tool_call>78")
@@ -955,14 +983,14 @@ def test_streaming_path_no_splice_when_no_prefix_detected(self):
         # Empty configured prefix => detect returns "" even with required.
         parser = _RecordingToolParser(processor.tokenizer, tool_prefix="")
         processor.tool_parser_obj = self._make_parser_factory(parser)
-        request = {"tool_choice": "required", "prompt_tokens": "no sentinel"}
+        request = SimpleNamespace(tool_choice="required")
 
         first = {
             "finished": False,
             "request_id": "stream-noprefix",
             "outputs": {"token_ids": [7]},
         }
-        processor.process_response_dict_streaming(first, request=request)
+        processor.process_response_dict_streaming(first, request=request, prompt_tokens="no sentinel")
         self.assertEqual(parser.streaming_calls[0]["delta_text"], "7")
         self.assertFalse(parser._tool_prefix_injected_to_delta)
 

From 7041f47790f58388bbd82528e2786b81fab22e5e Mon Sep 17 00:00:00 2001
From: luukunn <981429396@qq.com>
Date: Thu, 21 May 2026 17:53:49 +0800
Subject: [PATCH 03/10] fix unit test: pass request to _is_forced_tool_choice and honor chat_template_kwargs force mode

---
 fastdeploy/input/base_processor.py | 33 +++++++---
 tests/input/test_text_processor.py | 97 +++++++++++++++++++++++++-----
 2 files changed, 108 insertions(+), 22 deletions(-)

diff --git a/fastdeploy/input/base_processor.py b/fastdeploy/input/base_processor.py
index 019635e160e..de906cdf552 100644
--- a/fastdeploy/input/base_processor.py
+++ b/fastdeploy/input/base_processor.py
@@ -57,16 +57,33 @@
 _SAMPLING_EPS = 1e-5
 
 
-def _is_forced_tool_choice(tool_choice) -> bool:
-    """Return True iff ``tool_choice`` requires the chat template to inject
-    a tool-call prefix into the prompt — i.e. ``"required"`` or a named-tool
-    choice (a ``ChatCompletionNamedToolChoiceParam`` pydantic model).
+def _is_forced_tool_choice(request) -> bool:
+    """Return True iff the request asks the chat template to inject a
+    tool-call prefix into the prompt. Two ways are recognized:
+
+    1. ``request.tool_choice == "required"`` or a named-tool choice (a
+       ``ChatCompletionNamedToolChoiceParam`` pydantic model with
+       ``type == "function"``).
+    2. ``request.chat_template_kwargs.options.tool_choice.mode == "force"``
+       — used by chat templates that drive forced tool calls through their
+       own ``options`` dict instead of the OpenAI-style ``tool_choice``
+       field.
     """
+    tool_choice = getattr(request, "tool_choice", None)
     if isinstance(tool_choice, str):
-        return tool_choice == "required"
+        if tool_choice == "required":
+            return True
     # Duck-type the pydantic ``ChatCompletionNamedToolChoiceParam`` via its
     # ``type`` attribute to avoid importing the protocol module here.
-    return getattr(tool_choice, "type", None) == "function"
+    elif getattr(tool_choice, "type", None) == "function":
+        return True
+
+    chat_template_kwargs = getattr(request, "chat_template_kwargs", None) or {}
+    options = chat_template_kwargs.get("options") if isinstance(chat_template_kwargs, dict) else None
+    inner = options.get("tool_choice") if isinstance(options, dict) else None
+    if isinstance(inner, dict) and inner.get("mode") == "force":
+        return True
+    return False
 
 
 class BaseTextProcessor(ABC):
@@ -338,7 +355,7 @@ def process_response_dict_normal(self, response_dict, **kwargs):
             if self.tool_parser_obj:
                 tool_parser = self.tool_parser_obj(self.tokenizer)
                 parser_input = full_text
-                if _is_forced_tool_choice(request.tool_choice):
+                if _is_forced_tool_choice(request):
                     self._prepare_tool_prefix(tool_parser, kwargs.get("prompt_tokens"))
                     if tool_parser._tool_prefix:
                         parser_input = tool_parser._tool_prefix + full_text
@@ -399,7 +416,7 @@ def process_response_dict_streaming(self, response_dict, **kwargs):
             stream_previous = previous_texts
             stream_current = previous_texts + delta_text
             stream_delta = delta_text
-            if _is_forced_tool_choice(request.tool_choice):
+            if _is_forced_tool_choice(request):
                 self._prepare_tool_prefix(tool_parser, kwargs.get("prompt_tokens"))
                 prefix = tool_parser._tool_prefix
                 # When the chat template injected a forced tool-call prefix into
diff --git a/tests/input/test_text_processor.py b/tests/input/test_text_processor.py
index e3fd91782f5..faf88ec03f4 100644
--- a/tests/input/test_text_processor.py
+++ b/tests/input/test_text_processor.py
@@ -130,6 +130,8 @@ def _create_dummy_modules():
         info=lambda *args, **kwargs: None,
         warning=lambda *args, **kwargs: None,
         debug=lambda *args, **kwargs: None,
+        exception=lambda *args, **kwargs: None,
+        error=lambda *args, **kwargs: None,
     )
 
     CHOICE_SEPARATOR = "::n::"
@@ -570,7 +572,9 @@ def test_process_response_with_reasoning_and_tools(self):
             "outputs": {"token_ids": [1, processor.tokenizer.eos_token_id]},
         }
 
-        processed = processor.process_response_dict(response, stream=False)
+        processed = processor.process_response_dict(
+            response, stream=False, request=SimpleNamespace(tool_choice="none")
+        )
         self.assertEqual(processed["outputs"]["reasoning_content"], "think")
         self.assertEqual(processed["outputs"]["tool_calls"], ["tool"])
 
@@ -597,7 +601,9 @@ def test_process_response_streaming_with_reasoning_and_tools(self):
             "outputs": {"token_ids": [7, processor.tokenizer.eos_token_id]},
         }
 
-        result = processor.process_response_dict_streaming(response, enable_thinking=True)
+        result = processor.process_response_dict_streaming(
+            response, enable_thinking=True, request=SimpleNamespace(tool_choice="none")
+        )
         self.assertEqual(result["outputs"]["completion_tokens"], "7")
         self.assertEqual(result["outputs"]["text"], "tool-text")
         self.assertEqual(result["outputs"]["reasoning_content"], "because")
@@ -615,7 +621,9 @@ def test_process_response_dict_normal_with_reasoning(self):
             "outputs": {"token_ids": [7, processor.tokenizer.eos_token_id]},
         }
 
-        result = processor.process_response_dict_normal(response, enable_thinking=True)
+        result = processor.process_response_dict_normal(
+            response, enable_thinking=True, request=SimpleNamespace(tool_choice="none")
+        )
         self.assertEqual(result["outputs"]["completion_tokens"], "7")
         self.assertEqual(result["outputs"]["reasoning_content"], "because")
         self.assertEqual(result["outputs"]["reasoning_token_num"], 1)
@@ -754,32 +762,68 @@ def custom_convert(tokens):
 
 
 class IsForcedToolChoiceTest(unittest.TestCase):
-    """Tests for the module-level ``_is_forced_tool_choice`` helper."""
+    """Tests for the module-level ``_is_forced_tool_choice`` helper.
+
+    The helper takes a request-like object (something with ``tool_choice``
+    and ``chat_template_kwargs`` attributes) and returns whether the chat
+    template will inject a tool-call prefix.
+    """
 
     def setUp(self):
         from fastdeploy.input import base_processor
 
         self._is_forced = base_processor._is_forced_tool_choice
 
+    def _req(self, *, tool_choice=None, chat_template_kwargs=None):
+        return SimpleNamespace(
+            tool_choice=tool_choice,
+            chat_template_kwargs=chat_template_kwargs,
+        )
+
     def test_required_string(self):
-        self.assertTrue(self._is_forced("required"))
+        self.assertTrue(self._is_forced(self._req(tool_choice="required")))
 
     def test_other_strings(self):
-        self.assertFalse(self._is_forced("auto"))
-        self.assertFalse(self._is_forced("none"))
-        self.assertFalse(self._is_forced(""))
+        self.assertFalse(self._is_forced(self._req(tool_choice="auto")))
+        self.assertFalse(self._is_forced(self._req(tool_choice="none")))
+        self.assertFalse(self._is_forced(self._req(tool_choice="")))
 
     def test_pydantic_named_tool_choice(self):
         named = SimpleNamespace(type="function", function=SimpleNamespace(name="f"))
-        self.assertTrue(self._is_forced(named))
+        self.assertTrue(self._is_forced(self._req(tool_choice=named)))
 
     def test_pydantic_other_type(self):
-        self.assertFalse(self._is_forced(SimpleNamespace(type="other")))
-        self.assertFalse(self._is_forced(SimpleNamespace()))
+        self.assertFalse(self._is_forced(self._req(tool_choice=SimpleNamespace(type="other"))))
+        self.assertFalse(self._is_forced(self._req(tool_choice=SimpleNamespace())))
 
-    def test_none_and_other_types(self):
-        self.assertFalse(self._is_forced(None))
-        self.assertFalse(self._is_forced(123))
+    def test_no_tool_choice_no_options(self):
+        self.assertFalse(self._is_forced(self._req()))
+
+    def test_chat_template_options_force_mode(self):
+        kwargs = {
+            "options": {
+                "tool_choice": {"mode": "force", "name": "get_current_weather"},
+            }
+        }
+        self.assertTrue(self._is_forced(self._req(chat_template_kwargs=kwargs)))
+
+    def test_chat_template_options_non_force_mode(self):
+        kwargs = {"options": {"tool_choice": {"mode": "auto"}}}
+        self.assertFalse(self._is_forced(self._req(chat_template_kwargs=kwargs)))
+
+    def test_chat_template_options_missing_tool_choice(self):
+        self.assertFalse(self._is_forced(self._req(chat_template_kwargs={"options": {}})))
+        self.assertFalse(self._is_forced(self._req(chat_template_kwargs={})))
+
+    def test_chat_template_options_malformed(self):
+        # Non-dict options/inner must be tolerated (no crash, returns False).
+        self.assertFalse(self._is_forced(self._req(chat_template_kwargs={"options": "x"})))
+        self.assertFalse(self._is_forced(self._req(chat_template_kwargs={"options": {"tool_choice": "x"}})))
+
+    def test_tool_choice_takes_priority_over_options(self):
+        kwargs = {"options": {"tool_choice": {"mode": "auto"}}}
+        # Even with non-force mode in options, an explicit "required" wins.
+        self.assertTrue(self._is_forced(self._req(tool_choice="required", chat_template_kwargs=kwargs)))
 
 
 class _RecordingToolParser:
@@ -943,6 +987,31 @@ def test_normal_path_named_tool_choice_pydantic(self):
         )
         self.assertTrue(parser.extract_calls[0].startswith("<tool_call>"))
 
+    def test_normal_path_chat_template_force_mode(self):
+        """Forcing through ``chat_template_kwargs.options.tool_choice.mode``
+        must also trigger prefix splicing even when ``tool_choice`` is unset
+        (the default ``"none"``)."""
+        processor = self.processor
+        parser = _RecordingToolParser(processor.tokenizer, tool_prefix="<tool_call>")
+        processor.tool_parser_obj = self._make_parser_factory(parser)
+
+        response = {
+            "request_id": "req-cti",
+            "finished": True,
+            "outputs": {"token_ids": [7, processor.tokenizer.eos_token_id]},
+        }
+        request = SimpleNamespace(
+            tool_choice="none",
+            chat_template_kwargs={"options": {"tool_choice": {"mode": "force", "name": "get_current_weather"}}},
+        )
+
+        processor.process_response_dict_normal(
+            response,
+            request=request,
+            prompt_tokens="user msg\n<tool_call>",
+        )
+        self.assertTrue(parser.extract_calls[0].startswith("<tool_call>"))
+
     def test_streaming_path_splices_prefix_only_on_first_delta(self):
         processor = self.processor
         parser = _RecordingToolParser(processor.tokenizer, tool_prefix="<tool_call>")

From 3017744f0dab25f35a44257805423c8a2566a02e Mon Sep 17 00:00:00 2001
From: luukunn <981429396@qq.com>
Date: Thu, 21 May 2026 19:22:53 +0800
Subject: [PATCH 04/10] fix: plain "required" tool_choice no longer triggers tool-prefix splicing

---
 fastdeploy/input/base_processor.py | 17 ++++++++---------
 tests/input/test_text_processor.py | 30 +++++++++++++++++++-----------
 2 files changed, 27 insertions(+), 20 deletions(-)

diff --git a/fastdeploy/input/base_processor.py b/fastdeploy/input/base_processor.py
index de906cdf552..1e0640e878f 100644
--- a/fastdeploy/input/base_processor.py
+++ b/fastdeploy/input/base_processor.py
@@ -61,21 +61,20 @@ def _is_forced_tool_choice(request) -> bool:
     """Return True iff the request asks the chat template to inject a
     tool-call prefix into the prompt. Two ways are recognized:
 
-    1. ``request.tool_choice == "required"`` or a named-tool choice (a
+    1. ``request.tool_choice`` is a named-tool choice (a
        ``ChatCompletionNamedToolChoiceParam`` pydantic model with
-       ``type == "function"``).
+       ``type == "function"``). The plain ``"required"`` string does NOT
+       trigger prefix injection in the chat template.
     2. ``request.chat_template_kwargs.options.tool_choice.mode == "force"``
        — used by chat templates that drive forced tool calls through their
        own ``options`` dict instead of the OpenAI-style ``tool_choice``
        field.
     """
-    tool_choice = getattr(request, "tool_choice", None)
-    if isinstance(tool_choice, str):
-        if tool_choice == "required":
-            return True
-    # Duck-type the pydantic ``ChatCompletionNamedToolChoiceParam`` via its
-    # ``type`` attribute to avoid importing the protocol module here.
-    elif getattr(tool_choice, "type", None) == "function":
+    tool_choice = request.tool_choice
+    # Named-tool choices are pydantic ``ChatCompletionNamedToolChoiceParam``
+    # objects (``type == "function"``); plain string values such as
+    # ``"required"`` / ``"auto"`` / ``"none"`` are skipped here.
+    if not isinstance(tool_choice, str) and getattr(tool_choice, "type", None) == "function":
         return True
 
     chat_template_kwargs = getattr(request, "chat_template_kwargs", None) or {}
diff --git a/tests/input/test_text_processor.py b/tests/input/test_text_processor.py
index faf88ec03f4..b75f457a7cb 100644
--- a/tests/input/test_text_processor.py
+++ b/tests/input/test_text_processor.py
@@ -780,10 +780,10 @@ def _req(self, *, tool_choice=None, chat_template_kwargs=None):
             chat_template_kwargs=chat_template_kwargs,
         )
 
-    def test_required_string(self):
-        self.assertTrue(self._is_forced(self._req(tool_choice="required")))
-
-    def test_other_strings(self):
+    def test_string_tool_choice_never_forces(self):
+        # Plain string tool_choice values do NOT cause the chat template to
+        # inject a tool-call prefix, even when the value is ``"required"``.
+        self.assertFalse(self._is_forced(self._req(tool_choice="required")))
         self.assertFalse(self._is_forced(self._req(tool_choice="auto")))
         self.assertFalse(self._is_forced(self._req(tool_choice="none")))
         self.assertFalse(self._is_forced(self._req(tool_choice="")))
@@ -821,9 +821,10 @@ def test_chat_template_options_malformed(self):
         self.assertFalse(self._is_forced(self._req(chat_template_kwargs={"options": {"tool_choice": "x"}})))
 
     def test_tool_choice_takes_priority_over_options(self):
-        kwargs = {"options": {"tool_choice": {"mode": "auto"}}}
-        # Even with non-force mode in options, an explicit "required" wins.
-        self.assertTrue(self._is_forced(self._req(tool_choice="required", chat_template_kwargs=kwargs)))
+        kwargs = {"options": {"tool_choice": {"mode": "force"}}}
+        # Named-tool pydantic choice combined with options.force still forces.
+        named = SimpleNamespace(type="function", function=SimpleNamespace(name="f"))
+        self.assertTrue(self._is_forced(self._req(tool_choice=named, chat_template_kwargs=kwargs)))
 
 
 class _RecordingToolParser:
@@ -933,7 +934,10 @@ def test_normal_path_splices_prefix_when_required(self):
             "finished": True,
             "outputs": {"token_ids": [7, processor.tokenizer.eos_token_id]},
         }
-        request = SimpleNamespace(tool_choice="required")
+        # Named-tool pydantic choice triggers prefix injection.
+        request = SimpleNamespace(
+            tool_choice=SimpleNamespace(type="function", function=SimpleNamespace(name="f")),
+        )
 
         processor.process_response_dict_normal(
             response,
@@ -1016,7 +1020,9 @@ def test_streaming_path_splices_prefix_only_on_first_delta(self):
         processor = self.processor
         parser = _RecordingToolParser(processor.tokenizer, tool_prefix="<tool_call>")
         processor.tool_parser_obj = self._make_parser_factory(parser)
-        request = SimpleNamespace(tool_choice="required")
+        request = SimpleNamespace(
+            tool_choice=SimpleNamespace(type="function", function=SimpleNamespace(name="f")),
+        )
         prompt_tokens = "user msg\n<tool_call>"
 
         # First chunk
@@ -1049,10 +1055,12 @@ def test_streaming_path_splices_prefix_only_on_first_delta(self):
 
     def test_streaming_path_no_splice_when_no_prefix_detected(self):
         processor = self.processor
-        # Empty configured prefix => detect returns "" even with required.
+        # Empty configured prefix => detect returns "" even when forced.
         parser = _RecordingToolParser(processor.tokenizer, tool_prefix="")
         processor.tool_parser_obj = self._make_parser_factory(parser)
-        request = SimpleNamespace(tool_choice="required")
+        request = SimpleNamespace(
+            tool_choice=SimpleNamespace(type="function", function=SimpleNamespace(name="f")),
+        )
 
         first = {
             "finished": False,

From 2ac3eb99cdf9161fd185a2707272a49761f08d15 Mon Sep 17 00:00:00 2001
From: luukunn <981429396@qq.com>
Date: Thu, 21 May 2026 19:39:35 +0800
Subject: [PATCH 05/10] fix review: splice forced tool prefix into streaming token ids

---
 .../tool_parsers/abstract_tool_parser.py      | 17 +++-----
 fastdeploy/input/base_processor.py            | 43 +++++++++++++++----
 tests/input/test_text_processor.py            | 15 +++++++
 3 files changed, 55 insertions(+), 20 deletions(-)

diff --git a/fastdeploy/entrypoints/openai/tool_parsers/abstract_tool_parser.py b/fastdeploy/entrypoints/openai/tool_parsers/abstract_tool_parser.py
index 641f56dc82b..83d3ab4a924 100644
--- a/fastdeploy/entrypoints/openai/tool_parsers/abstract_tool_parser.py
+++ b/fastdeploy/entrypoints/openai/tool_parsers/abstract_tool_parser.py
@@ -51,19 +51,14 @@ def __init__(self, tokenizer):
 
         self.model_tokenizer = tokenizer
 
-        # Per-request tool-prefix state, populated by the serving layer when
-        # ``tool_choice=required`` (or similar) causes a tool-call prefix to be
-        # appended to the rendered prompt by the chat template. The parser
-        # itself does not compute these — the serving layer calls
-        # :meth:`detect_tool_prefix` and stashes the result here.
+        # Per-request tool-prefix state populated by the serving layer when
+        # the chat template injects a forced tool-call prefix into the prompt.
         self._tool_prefix: str = ""
-        # Idempotency flag: the serving layer may invoke its preparation hook
-        # once per streaming chunk, but the prefix only needs to be computed
-        # once per request. Set to ``True`` after the first computation.
+        self._tool_prefix_token_ids: list[int] = []
+        # Set after the prefix is computed once for this request.
         self._tool_prefix_computed: bool = False
-        # Whether the prefix has already been spliced into ``delta_text`` for
-        # the streaming path. Only the first streaming call needs the splice;
-        # subsequent calls keep ``delta_text`` untouched.
+        # Set after the prefix has been spliced into the streaming delta
+        # (only the first chunk needs it).
         self._tool_prefix_injected_to_delta: bool = False
 
     @cached_property
diff --git a/fastdeploy/input/base_processor.py b/fastdeploy/input/base_processor.py
index 1e0640e878f..896a27707ce 100644
--- a/fastdeploy/input/base_processor.py
+++ b/fastdeploy/input/base_processor.py
@@ -311,13 +311,26 @@ def _prepare_tool_prefix(self, tool_parser, prompt_tokens):
             return
         tool_parser._tool_prefix_computed = True
         tool_parser._tool_prefix = ""
+        tool_parser._tool_prefix_token_ids = []
         if not prompt_tokens or not isinstance(prompt_tokens, str):
             return
         try:
-            tool_parser._tool_prefix = tool_parser.detect_tool_prefix(prompt_tokens) or ""
+            prefix = tool_parser.detect_tool_prefix(prompt_tokens) or ""
         except Exception:
             data_processor_logger.exception("detect_tool_prefix failed; falling back to empty prefix")
-            tool_parser._tool_prefix = ""
+            return
+        tool_parser._tool_prefix = prefix
+        if not prefix:
+            return
+        # Encode the prefix into token ids so the streaming path can also
+        # splice ``previous/current/delta_token_ids`` — some parsers gate on
+        # ``tool_call_start_token_id in current_token_ids`` rather than on
+        # text (e.g. ``Ernie45VLThinkingToolParser``).
+        try:
+            tool_parser._tool_prefix_token_ids = list(self.tokenizer.encode(prefix, add_special_tokens=False))
+        except Exception:
+            data_processor_logger.exception("encode tool prefix to token ids failed; token-id splice disabled")
+            tool_parser._tool_prefix_token_ids = []
 
     def process_response_dict_normal(self, response_dict, **kwargs):
         """Accumulate tokens and build the full completion text (non-streaming)."""
@@ -415,28 +428,40 @@ def process_response_dict_streaming(self, response_dict, **kwargs):
             stream_previous = previous_texts
             stream_current = previous_texts + delta_text
             stream_delta = delta_text
+            stream_previous_token_ids = previous_token_ids
+            stream_current_token_ids = previous_token_ids + token_ids
+            stream_delta_token_ids = token_ids
             if _is_forced_tool_choice(request):
                 self._prepare_tool_prefix(tool_parser, kwargs.get("prompt_tokens"))
                 prefix = tool_parser._tool_prefix
+                prefix_ids = tool_parser._tool_prefix_token_ids
                 # When the chat template injected a forced tool-call prefix into
                 # the prompt, the model output starts mid-tool-call. We splice
-                # the prefix back into the streaming arguments so the parser
-                # sees a complete sequence and its existing state machine works
-                # unchanged. ``delta_text`` only needs the splice on the first
-                # call so the parser's start-token detection fires once.
+                # the prefix back into both the text and token-id streaming
+                # arguments so parsers that gate on either form (e.g.
+                # ``Ernie45VLThinkingToolParser`` checks
+                # ``tool_call_start_token_id in current_token_ids``) see a
+                # complete sequence and their existing state machines work
+                # unchanged. The ``delta_*`` forms only need the splice on the
+                # first call so the parser's start detection fires once.
                 if prefix:
                     stream_previous = prefix + stream_previous
                     stream_current = prefix + stream_current
+                    if prefix_ids:
+                        stream_previous_token_ids = list(prefix_ids) + list(stream_previous_token_ids)
+                        stream_current_token_ids = list(prefix_ids) + list(stream_current_token_ids)
                     if not tool_parser._tool_prefix_injected_to_delta:
                         stream_delta = prefix + stream_delta
+                        if prefix_ids:
+                            stream_delta_token_ids = list(prefix_ids) + list(stream_delta_token_ids)
                         tool_parser._tool_prefix_injected_to_delta = True
             tool_call_delta_message = tool_parser.extract_tool_calls_streaming(
                 stream_previous,
                 stream_current,
                 stream_delta,
-                previous_token_ids,
-                previous_token_ids + token_ids,
-                token_ids,
+                stream_previous_token_ids,
+                stream_current_token_ids,
+                stream_delta_token_ids,
                 request,
             )
             if tool_call_delta_message:
diff --git a/tests/input/test_text_processor.py b/tests/input/test_text_processor.py
index b75f457a7cb..d1329c6d01b 100644
--- a/tests/input/test_text_processor.py
+++ b/tests/input/test_text_processor.py
@@ -867,6 +867,9 @@ def extract_tool_calls_streaming(
                 "previous_text": previous_text,
                 "current_text": current_text,
                 "delta_text": delta_text,
+                "previous_token_ids": list(previous_token_ids),
+                "current_token_ids": list(current_token_ids),
+                "delta_token_ids": list(delta_token_ids),
             }
         )
         tool_calls = [
@@ -1037,7 +1040,13 @@ def test_streaming_path_splices_prefix_only_on_first_delta(self):
         self.assertEqual(first_call["previous_text"], "<tool_call>")
         self.assertEqual(first_call["current_text"], "<tool_call>7")
         self.assertEqual(first_call["delta_text"], "<tool_call>7")
+        # token_ids must be spliced too — DummyTokenizer.encode("<tool_call>") -> [11].
+        prefix_ids = [11]
+        self.assertEqual(first_call["previous_token_ids"], prefix_ids)
+        self.assertEqual(first_call["current_token_ids"], prefix_ids + [7])
+        self.assertEqual(first_call["delta_token_ids"], prefix_ids + [7])
         self.assertTrue(parser._tool_prefix_injected_to_delta)
+        self.assertEqual(parser._tool_prefix_token_ids, prefix_ids)
 
         # Second chunk: delta must NOT be re-spliced, but previous/current are.
         second = {
@@ -1050,6 +1059,12 @@ def test_streaming_path_splices_prefix_only_on_first_delta(self):
         self.assertEqual(second_call["previous_text"], "<tool_call>7")
         self.assertEqual(second_call["current_text"], "<tool_call>78")
         self.assertEqual(second_call["delta_text"], "8")  # no extra prefix splice
+        self.assertEqual(second_call["previous_token_ids"], prefix_ids + [7])
+        self.assertEqual(
+            second_call["current_token_ids"],
+            prefix_ids + [7, 8, processor.tokenizer.eos_token_id],
+        )
+        self.assertEqual(second_call["delta_token_ids"], [8, processor.tokenizer.eos_token_id])
         # detect should only run once across the whole stream.
         self.assertEqual(len(parser.detect_calls), 1)
 

From 01eb97c6bc8cd3be7cb4ef0e65e4e75a2ebb5af2 Mon Sep 17 00:00:00 2001
From: luukunn <981429396@qq.com>
Date: Thu, 21 May 2026 20:00:14 +0800
Subject: [PATCH 06/10] fix review

---
 fastdeploy/entrypoints/openai/response_processors.py | 11 ++++++++++-
 fastdeploy/input/base_processor.py                   |  5 ++++-
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/fastdeploy/entrypoints/openai/response_processors.py b/fastdeploy/entrypoints/openai/response_processors.py
index b0c9e6adcd1..2cfef290201 100644
--- a/fastdeploy/entrypoints/openai/response_processors.py
+++ b/fastdeploy/entrypoints/openai/response_processors.py
@@ -72,7 +72,9 @@ def accumulate_token_ids(self, request_output):
             else:
                 self._multipart_buffer.append({"decode_type": decode_type, "request_output": request_output})
 
-    async def process_response_chat(self, request_outputs, stream, include_stop_str_in_output, request, prompt_tokens):
+    async def process_response_chat(
+        self, request_outputs, stream, include_stop_str_in_output, request, prompt_tokens=None
+    ):
         """
         Process a list of responses into a generator that yields each processed response as it's generated.
         Args:
@@ -111,6 +113,7 @@ async def process_response_chat(self, request_outputs, stream, include_stop_str_
                                 audio_tokens=all_audio_tokens,
                                 tts=tts,
                                 request=request,
+                                prompt_tokens=prompt_tokens,
                             )
                         yield response
                     elif decode_type == 2:  # audio
@@ -129,6 +132,7 @@ async def process_response_chat(self, request_outputs, stream, include_stop_str_
                             stream=stream,
                             include_stop_str_in_output=include_stop_str_in_output,
                             request=request,
+                            prompt_tokens=prompt_tokens,
                         )
                     else:
                         response = self.data_processor.process_response_dict(
@@ -136,6 +140,7 @@ async def process_response_chat(self, request_outputs, stream, include_stop_str_
                             stream=stream,
                             include_stop_str_in_output=include_stop_str_in_output,
                             request=request,
+                            prompt_tokens=prompt_tokens,
                         )
                     yield response
             elif stream:
@@ -169,6 +174,7 @@ async def process_response_chat(self, request_outputs, stream, include_stop_str_
                             stream=stream,
                             include_stop_str_in_output=include_stop_str_in_output,
                             request=request,
+                            prompt_tokens=prompt_tokens,
                         )
                     else:
                         self.data_processor.process_response_dict(
@@ -176,6 +182,7 @@ async def process_response_chat(self, request_outputs, stream, include_stop_str_
                             stream=stream,
                             include_stop_str_in_output=include_stop_str_in_output,
                             request=request,
+                            prompt_tokens=prompt_tokens,
                         )
                     text = {"type": "text", "text": request_output["outputs"]["text"]}
                     request_output["outputs"]["multipart"] = [text]
@@ -198,6 +205,7 @@ async def process_response_chat(self, request_outputs, stream, include_stop_str_
                                     stream=False,
                                     include_stop_str_in_output=include_stop_str_in_output,
                                     request=request,
+                                    prompt_tokens=prompt_tokens,
                                 )
                             else:
                                 self.data_processor.process_response_dict(
@@ -205,6 +213,7 @@ async def process_response_chat(self, request_outputs, stream, include_stop_str_
                                     stream=stream,
                                     include_stop_str_in_output=include_stop_str_in_output,
                                     request=request,
+                                    prompt_tokens=prompt_tokens,
                                 )
                             text = {"type": "text", "text": part["request_output"]["outputs"]["text"]}
                             multipart.append(text)
diff --git a/fastdeploy/input/base_processor.py b/fastdeploy/input/base_processor.py
index 896a27707ce..7ffc34db97d 100644
--- a/fastdeploy/input/base_processor.py
+++ b/fastdeploy/input/base_processor.py
@@ -70,7 +70,10 @@ def _is_forced_tool_choice(request) -> bool:
        own ``options`` dict instead of the OpenAI-style ``tool_choice``
        field.
     """
-    tool_choice = request.tool_choice
+    if request is None:
+        return False
+
+    tool_choice = getattr(request, "tool_choice", None)
     # Named-tool choices are pydantic ``ChatCompletionNamedToolChoiceParam``
     # objects (``type == "function"``); plain string values such as
     # ``"required"`` / ``"auto"`` / ``"none"`` are skipped here.

From 7c5af98ba8c1e11b8e5020f49d78cfc5994e492c Mon Sep 17 00:00:00 2001
From: luukunn <981429396@qq.com>
Date: Fri, 22 May 2026 11:13:32 +0800
Subject: [PATCH 07/10] fix unit test

---
 .../tool_parsers/abstract_tool_parser.py      | 10 ++-
 fastdeploy/input/base_processor.py            | 85 +++++++++----------
 .../entrypoints/openai/test_finish_reason.py  |  8 +-
 .../openai/test_max_streaming_tokens.py       |  8 +-
 tests/input/test_text_processor.py            | 14 +--
 5 files changed, 65 insertions(+), 60 deletions(-)

diff --git a/fastdeploy/entrypoints/openai/tool_parsers/abstract_tool_parser.py b/fastdeploy/entrypoints/openai/tool_parsers/abstract_tool_parser.py
index 83d3ab4a924..c0e1367f086 100644
--- a/fastdeploy/entrypoints/openai/tool_parsers/abstract_tool_parser.py
+++ b/fastdeploy/entrypoints/openai/tool_parsers/abstract_tool_parser.py
@@ -36,9 +36,10 @@ class ToolParser:
 
     # Subclasses should override these with the literal tool-call sentinel
     # tokens they recognize (e.g. ``"<tool_call>"`` / ``"</tool_call>"``).
-    # Used by :meth:`detect_tool_prefix` to support ``tool_choice=required``
-    # style prompt-prefix injection. Empty defaults make the detection a no-op
-    # for parsers that have not opted in.
+    # Used by :meth:`detect_tool_prefix` to support forced tool-call prompt
+    # prefix injection (named-tool ``tool_choice`` or
+    # ``chat_template_kwargs.options.tool_choice.mode == "force"``). Empty
+    # defaults make the detection a no-op for parsers that have not opted in.
     tool_call_start_token: str = ""
     tool_call_end_token: str = ""
 
@@ -75,7 +76,8 @@ def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionReques
 
     def detect_tool_prefix(self, prompt: str) -> str:
         """Detect a tool-call prefix that the chat template injected at the tail
-        of the rendered prompt to force tool output (``tool_choice=required``).
+        of the rendered prompt to force tool output (named-tool ``tool_choice``
+        or ``chat_template_kwargs.options.tool_choice.mode == "force"``).
 
         The check is generic: find the **last** occurrence of
         :attr:`tool_call_start_token` in ``prompt`` and, if it is **not** closed
diff --git a/fastdeploy/input/base_processor.py b/fastdeploy/input/base_processor.py
index 7ffc34db97d..903fec68596 100644
--- a/fastdeploy/input/base_processor.py
+++ b/fastdeploy/input/base_processor.py
@@ -58,17 +58,12 @@
 
 
 def _is_forced_tool_choice(request) -> bool:
-    """Return True iff the request asks the chat template to inject a
-    tool-call prefix into the prompt. Two ways are recognized:
-
-    1. ``request.tool_choice`` is a named-tool choice (a
-       ``ChatCompletionNamedToolChoiceParam`` pydantic model with
-       ``type == "function"``). The plain ``"required"`` string does NOT
-       trigger prefix injection in the chat template.
-    2. ``request.chat_template_kwargs.options.tool_choice.mode == "force"``
-       — used by chat templates that drive forced tool calls through their
-       own ``options`` dict instead of the OpenAI-style ``tool_choice``
-       field.
+    """Return True iff the chat template should inject a forced tool-call
+    prefix into the prompt. Two recognized triggers:
+
+    1. ``request.tool_choice`` is a named-tool choice (pydantic model with
+       ``type == "function"``). Plain ``"required"`` does NOT trigger.
+    2. ``request.chat_template_kwargs.options.tool_choice.mode == "force"``.
     """
     if request is None:
         return False
@@ -169,6 +164,28 @@ def text2ids(self, text, max_model_len=None, **kwargs):
             )
         return tokens["input_ids"][0]
 
+    def _text_to_token_ids(self, text: str) -> list:
+        """Encode ``text`` to a ``list[int]``, shared by :meth:`messages2ids`
+        and :meth:`_prepare_tool_prefix`.
+
+        ``ernie4_5`` tokenizer hangs on long inputs via ``.encode()``, so it
+        goes through ``tokenize`` + ``convert_tokens_to_ids``. Other tokenizers
+        use ``.encode()`` and the result is normalized to a plain list.
+        """
+        if self.tokenizer_type == "ernie4_5":
+            # NOTE: the ernie4_5 tokenizer hangs on long inputs when .encode() is used.
+            return self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text))
+        token_ids = self.tokenizer.encode(text, add_special_tokens=False)
+        if hasattr(token_ids, "input_ids") or (isinstance(token_ids, dict) and "input_ids" in token_ids):
+            token_ids = token_ids["input_ids"]
+            if hasattr(token_ids, "ndim") and token_ids.ndim > 1:
+                token_ids = token_ids[0]
+        if hasattr(token_ids, "tolist"):
+            token_ids = token_ids.tolist()
+        if not isinstance(token_ids, list):
+            token_ids = list(token_ids)
+        return token_ids
+
     def messages2ids(self, request, **kwargs):
         """Convert a chat-template request into a token-ID list.
 
@@ -190,19 +207,7 @@ def messages2ids(self, request, **kwargs):
         )
         request["prompt_tokens"] = spliced_message
         req_id = request.get("request_id", None) if isinstance(request, dict) else None
-        if self.tokenizer_type == "ernie4_5":
-            # NOTE: ernie4_5 tokenizer will hang when meet long input when use .encode()
-            token_ids = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(spliced_message))
-        else:
-            token_ids = self.tokenizer.encode(spliced_message, add_special_tokens=False)
-            if hasattr(token_ids, "input_ids") or (isinstance(token_ids, dict) and "input_ids" in token_ids):
-                token_ids = token_ids["input_ids"]
-                if hasattr(token_ids, "ndim") and token_ids.ndim > 1:
-                    token_ids = token_ids[0]
-            if hasattr(token_ids, "tolist"):
-                token_ids = token_ids.tolist()
-            if not isinstance(token_ids, list):
-                token_ids = list(token_ids)
+        token_ids = self._text_to_token_ids(spliced_message)
         log_request(
             level=1,
             message="req_id:{req_id}, token_ids: {token_ids}",
@@ -264,12 +269,13 @@ def ids2tokens(self, token_id, task_id):
                 self.decode_status[task_id] = [0, 0, [], ""]
             status = self.decode_status[task_id]
             previous_texts = status[3]
+            previous_token_ids = list(status[2])
             status[2].extend(token_id)
             decode_str, prefix_offset, read_offset = self.tokenizer.decode_token(status[2], status[0], status[1])
             status[0] = prefix_offset
             status[1] = read_offset
             status[3] += decode_str
-            return decode_str, status[2], previous_texts
+            return decode_str, previous_token_ids, previous_texts
 
     # ------------------------------------------------------------------
     # Response processing
@@ -298,17 +304,10 @@ def process_response_dict(self, response_dict, **kwargs):
             return self.process_response_dict_normal(response_dict, **kwargs)
 
     def _prepare_tool_prefix(self, tool_parser, prompt_tokens):
-        """Compute and cache on ``tool_parser`` the tool-call prefix that the
-        chat template may have injected at the tail of the rendered prompt
-        (e.g. for ``tool_choice=required``).
-
-        ``prompt_tokens`` is the rendered-prompt string passed in by the
-        serving layer (see ``response_processors.process_response_chat``).
-        The detection itself is delegated to the parser
-        (:meth:`ToolParser.detect_tool_prefix`) so each parser controls
-        which sentinel tokens it recognizes. We compute once per parser
-        instance — for non-streaming a fresh instance is created per request,
-        for streaming the instance is cached per ``request_id``.
+        """Detect and cache on ``tool_parser`` the tool-call prefix that the
+        chat template injected at the tail of ``prompt_tokens`` (the rendered
+        prompt string from the serving layer). Computed once per parser
+        instance via the parser's :meth:`ToolParser.detect_tool_prefix`.
         """
         if tool_parser._tool_prefix_computed:
             return
@@ -330,7 +329,7 @@ def _prepare_tool_prefix(self, tool_parser, prompt_tokens):
         # ``tool_call_start_token_id in current_token_ids`` rather than on
         # text (e.g. ``Ernie45VLThinkingToolParser``).
         try:
-            tool_parser._tool_prefix_token_ids = list(self.tokenizer.encode(prefix, add_special_tokens=False))
+            tool_parser._tool_prefix_token_ids = self._text_to_token_ids(prefix)
         except Exception:
             data_processor_logger.exception("encode tool prefix to token ids failed; token-id splice disabled")
             tool_parser._tool_prefix_token_ids = []
@@ -438,15 +437,11 @@ def process_response_dict_streaming(self, response_dict, **kwargs):
                 self._prepare_tool_prefix(tool_parser, kwargs.get("prompt_tokens"))
                 prefix = tool_parser._tool_prefix
                 prefix_ids = tool_parser._tool_prefix_token_ids
-                # When the chat template injected a forced tool-call prefix into
-                # the prompt, the model output starts mid-tool-call. We splice
-                # the prefix back into both the text and token-id streaming
-                # arguments so parsers that gate on either form (e.g.
+                # Splice the injected prefix back into both text and token-id
+                # streaming args so parsers that gate on either form (e.g.
                 # ``Ernie45VLThinkingToolParser`` checks
-                # ``tool_call_start_token_id in current_token_ids``) see a
-                # complete sequence and their existing state machines work
-                # unchanged. The ``delta_*`` forms only need the splice on the
-                # first call so the parser's start detection fires once.
+                # ``tool_call_start_token_id in current_token_ids``) work
+                # unchanged. ``delta_*`` only spliced on the first call.
                 if prefix:
                     stream_previous = prefix + stream_previous
                     stream_current = prefix + stream_current
diff --git a/tests/entrypoints/openai/test_finish_reason.py b/tests/entrypoints/openai/test_finish_reason.py
index 067b80ca0e5..74ce54e21cd 100644
--- a/tests/entrypoints/openai/test_finish_reason.py
+++ b/tests/entrypoints/openai/test_finish_reason.py
@@ -262,7 +262,9 @@ async def test_chat_full_max_tokens(self, mock_data_logger, mock_processor_class
         mock_processor_instance = Mock()
         mock_processor_instance.enable_multimodal_content.return_value = True
 
-        async def mock_process_response_chat_async(response, stream, include_stop_str_in_output, request=None):
+        async def mock_process_response_chat_async(
+            response, stream, include_stop_str_in_output, request=None, prompt_tokens=None
+        ):
             yield response
 
         mock_processor_instance.process_response_chat = mock_process_response_chat_async
@@ -445,7 +447,9 @@ async def test_chat_stream_max_tokens(self, mock_api_logger, mock_processor_clas
         mock_processor_instance = Mock()
         mock_processor_instance.enable_multimodal_content.return_value = False
 
-        async def mock_process_response_chat_async(response, stream, include_stop_str_in_output, request=None):
+        async def mock_process_response_chat_async(
+            response, stream, include_stop_str_in_output, request=None, prompt_tokens=None
+        ):
             if isinstance(response, list):
                 for res in response:
                     yield res
diff --git a/tests/entrypoints/openai/test_max_streaming_tokens.py b/tests/entrypoints/openai/test_max_streaming_tokens.py
index 63db437cc5d..c2efcdd03a0 100644
--- a/tests/entrypoints/openai/test_max_streaming_tokens.py
+++ b/tests/entrypoints/openai/test_max_streaming_tokens.py
@@ -222,7 +222,9 @@ async def test_integration_with_chat_stream_generator(self, mock_processor_class
 
         mock_processor_instance = Mock()
 
-        async def mock_process_response_chat_single(response, stream, include_stop_str_in_output, request=None):
+        async def mock_process_response_chat_single(
+            response, stream, include_stop_str_in_output, request=None, prompt_tokens=None
+        ):
             yield response
 
         mock_processor_instance.process_response_chat = mock_process_response_chat_single
@@ -639,7 +641,9 @@ async def test_chat_stream_usage_fields(self, mock_response_processor, api_serve
 
         mock_processor_instance = Mock()
 
-        async def mock_process_response_chat(response, stream, include_stop_str_in_output, request=None):
+        async def mock_process_response_chat(
+            response, stream, include_stop_str_in_output, request=None, prompt_tokens=None
+        ):
             delta_msg_mock = Mock()
             delta_msg_mock.content = response["outputs"]["text"]
             if response["outputs"]["text"] == "a":
diff --git a/tests/input/test_text_processor.py b/tests/input/test_text_processor.py
index d1329c6d01b..20661d8b0ab 100644
--- a/tests/input/test_text_processor.py
+++ b/tests/input/test_text_processor.py
@@ -884,8 +884,9 @@ def extract_tool_calls_streaming(
 
 
 class ToolPrefixCompensationTest(unittest.TestCase):
-    """Tests for the ``tool_choice=required`` prefix compensation logic in
-    ``BaseTextProcessor``."""
+    """Tests for the forced-tool-call prefix compensation logic in
+    ``BaseTextProcessor`` (named-tool ``tool_choice`` and
+    ``chat_template_kwargs.options.tool_choice.mode == "force"``)."""
 
     def setUp(self):
         module, cleanup = _import_text_processor()
@@ -1059,12 +1060,11 @@ def test_streaming_path_splices_prefix_only_on_first_delta(self):
         self.assertEqual(second_call["previous_text"], "<tool_call>7")
         self.assertEqual(second_call["current_text"], "<tool_call>78")
         self.assertEqual(second_call["delta_text"], "8")  # no extra prefix splice
+        # ``is_end=True`` causes the eos token to be stripped before ids2tokens,
+        # so token_ids fed to the parser is just [8].
         self.assertEqual(second_call["previous_token_ids"], prefix_ids + [7])
-        self.assertEqual(
-            second_call["current_token_ids"],
-            prefix_ids + [7, 8, processor.tokenizer.eos_token_id],
-        )
-        self.assertEqual(second_call["delta_token_ids"], [8, processor.tokenizer.eos_token_id])
+        self.assertEqual(second_call["current_token_ids"], prefix_ids + [7, 8])
+        self.assertEqual(second_call["delta_token_ids"], [8])
         # detect should only run once across the whole stream.
         self.assertEqual(len(parser.detect_calls), 1)
 

From 2d06f3ee5da89d15fc9fc68af9b8a2d570e3c440 Mon Sep 17 00:00:00 2001
From: luukunn <981429396@qq.com>
Date: Fri, 22 May 2026 14:52:31 +0800
Subject: [PATCH 08/10] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E6=9D=A1=E4=BB=B6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 fastdeploy/entrypoints/openai/protocol.py     |  22 ---
 .../tool_parsers/abstract_tool_parser.py      |  24 +--
 fastdeploy/input/base_processor.py            |  70 +++-----
 tests/input/test_text_processor.py            | 158 +++---------------
 4 files changed, 48 insertions(+), 226 deletions(-)

diff --git a/fastdeploy/entrypoints/openai/protocol.py b/fastdeploy/entrypoints/openai/protocol.py
index c25ade1a38a..82cdd26d92d 100644
--- a/fastdeploy/entrypoints/openai/protocol.py
+++ b/fastdeploy/entrypoints/openai/protocol.py
@@ -242,22 +242,6 @@ class ChatCompletionToolsParam(BaseModel):
     function: FunctionDefinition
 
 
-class ChatCompletionNamedFunction(BaseModel):
-    """Named function for ``tool_choice`` when forcing a specific tool."""
-
-    name: str
-
-
-class ChatCompletionNamedToolChoiceParam(BaseModel):
-    """OpenAI-compatible named tool choice — forces the model to call a
-    specific tool by name. Used as one of the values of
-    :attr:`ChatCompletionRequest.tool_choice`.
-    """
-
-    function: ChatCompletionNamedFunction
-    type: Literal["function"] = "function"
-
-
 class ChatMessage(BaseModel):
     """
     Chat message.
@@ -684,12 +668,6 @@ class ChatCompletionRequest(BaseModel):
     # https://platform.openai.com/docs/api-reference/chat/create
     messages: Union[List[Any], List[int]]
     tools: Optional[List[ChatCompletionToolsParam]] = None
-    tool_choice: Optional[
-        Union[
-            Literal["none", "auto", "required"],
-            ChatCompletionNamedToolChoiceParam,
-        ]
-    ] = "none"
     model: Optional[str] = "default"
     frequency_penalty: Optional[float] = Field(None, le=2, ge=-2)
     logprobs: Optional[bool] = False
diff --git a/fastdeploy/entrypoints/openai/tool_parsers/abstract_tool_parser.py b/fastdeploy/entrypoints/openai/tool_parsers/abstract_tool_parser.py
index c0e1367f086..461f702cd1b 100644
--- a/fastdeploy/entrypoints/openai/tool_parsers/abstract_tool_parser.py
+++ b/fastdeploy/entrypoints/openai/tool_parsers/abstract_tool_parser.py
@@ -75,23 +75,13 @@ def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionReques
         return request
 
     def detect_tool_prefix(self, prompt: str) -> str:
-        """Detect a tool-call prefix that the chat template injected at the tail
-        of the rendered prompt to force tool output (named-tool ``tool_choice``
-        or ``chat_template_kwargs.options.tool_choice.mode == "force"``).
-
-        The check is generic: find the **last** occurrence of
-        :attr:`tool_call_start_token` in ``prompt`` and, if it is **not** closed
-        by a subsequent :attr:`tool_call_end_token`, treat the substring from
-        that position to the end of the prompt as the injected prefix. The
-        injected prefix must reach the very end of the prompt (modulo trailing
-        whitespace) — anything else is treated as historical / unrelated and
-        we conservatively return an empty string.
-
-        Returns ``""`` for parsers that have not declared their sentinel tokens
-        or for prompts where no such prefix is detected.
-
-        Subclasses with non-paired tag formats (e.g. a single sentinel without
-        a closing counterpart) may override this method.
+        """Detect the tool-call prefix injected at the tail of the rendered
+        prompt by a forced ``tool_choice``.
+
+        Finds the **last** :attr:`tool_call_start_token` in ``prompt`` that is
+        not closed by a later :attr:`tool_call_end_token` and reaches the
+        prompt end (modulo trailing whitespace). Returns ``""`` otherwise.
+        Subclasses with non-paired tag formats may override.
         """
         start = self.tool_call_start_token
         if not start or not prompt:
diff --git a/fastdeploy/input/base_processor.py b/fastdeploy/input/base_processor.py
index 903fec68596..4baad8ee22e 100644
--- a/fastdeploy/input/base_processor.py
+++ b/fastdeploy/input/base_processor.py
@@ -57,32 +57,6 @@
 _SAMPLING_EPS = 1e-5
 
 
-def _is_forced_tool_choice(request) -> bool:
-    """Return True iff the chat template should inject a forced tool-call
-    prefix into the prompt. Two recognized triggers:
-
-    1. ``request.tool_choice`` is a named-tool choice (pydantic model with
-       ``type == "function"``). Plain ``"required"`` does NOT trigger.
-    2. ``request.chat_template_kwargs.options.tool_choice.mode == "force"``.
-    """
-    if request is None:
-        return False
-
-    tool_choice = getattr(request, "tool_choice", None)
-    # Named-tool choices are pydantic ``ChatCompletionNamedToolChoiceParam``
-    # objects (``type == "function"``); plain string values such as
-    # ``"required"`` / ``"auto"`` / ``"none"`` are skipped here.
-    if not isinstance(tool_choice, str) and getattr(tool_choice, "type", None) == "function":
-        return True
-
-    chat_template_kwargs = getattr(request, "chat_template_kwargs", None) or {}
-    options = chat_template_kwargs.get("options") if isinstance(chat_template_kwargs, dict) else None
-    inner = options.get("tool_choice") if isinstance(options, dict) else None
-    if isinstance(inner, dict) and inner.get("mode") == "force":
-        return True
-    return False
-
-
 class BaseTextProcessor(ABC):
     """Abstract base class shared by all text / VL processors.
 
@@ -369,10 +343,9 @@ def process_response_dict_normal(self, response_dict, **kwargs):
             if self.tool_parser_obj:
                 tool_parser = self.tool_parser_obj(self.tokenizer)
                 parser_input = full_text
-                if _is_forced_tool_choice(request):
-                    self._prepare_tool_prefix(tool_parser, kwargs.get("prompt_tokens"))
-                    if tool_parser._tool_prefix:
-                        parser_input = tool_parser._tool_prefix + full_text
+                self._prepare_tool_prefix(tool_parser, kwargs.get("prompt_tokens"))
+                if tool_parser._tool_prefix:
+                    parser_input = tool_parser._tool_prefix + full_text
                 tool_call_info = tool_parser.extract_tool_calls(parser_input, request)
                 if tool_call_info.tools_called:
                     response_dict["outputs"]["tool_calls"] = tool_call_info.tool_calls
@@ -433,26 +406,25 @@ def process_response_dict_streaming(self, response_dict, **kwargs):
             stream_previous_token_ids = previous_token_ids
             stream_current_token_ids = previous_token_ids + token_ids
             stream_delta_token_ids = token_ids
-            if _is_forced_tool_choice(request):
-                self._prepare_tool_prefix(tool_parser, kwargs.get("prompt_tokens"))
-                prefix = tool_parser._tool_prefix
-                prefix_ids = tool_parser._tool_prefix_token_ids
-                # Splice the injected prefix back into both text and token-id
-                # streaming args so parsers that gate on either form (e.g.
-                # ``Ernie45VLThinkingToolParser`` checks
-                # ``tool_call_start_token_id in current_token_ids``) work
-                # unchanged. ``delta_*`` only spliced on the first call.
-                if prefix:
-                    stream_previous = prefix + stream_previous
-                    stream_current = prefix + stream_current
+            self._prepare_tool_prefix(tool_parser, kwargs.get("prompt_tokens"))
+            prefix = tool_parser._tool_prefix
+            prefix_ids = tool_parser._tool_prefix_token_ids
+            # Splice the injected prefix back into both text and token-id
+            # streaming args so parsers that gate on either form (e.g.
+            # ``Ernie45VLThinkingToolParser`` checks
+            # ``tool_call_start_token_id in current_token_ids``) work
+            # unchanged. ``delta_*`` only spliced on the first call.
+            if prefix:
+                stream_previous = prefix + stream_previous
+                stream_current = prefix + stream_current
+                if prefix_ids:
+                    stream_previous_token_ids = list(prefix_ids) + list(stream_previous_token_ids)
+                    stream_current_token_ids = list(prefix_ids) + list(stream_current_token_ids)
+                if not tool_parser._tool_prefix_injected_to_delta:
+                    stream_delta = prefix + stream_delta
                     if prefix_ids:
-                        stream_previous_token_ids = list(prefix_ids) + list(stream_previous_token_ids)
-                        stream_current_token_ids = list(prefix_ids) + list(stream_current_token_ids)
-                    if not tool_parser._tool_prefix_injected_to_delta:
-                        stream_delta = prefix + stream_delta
-                        if prefix_ids:
-                            stream_delta_token_ids = list(prefix_ids) + list(stream_delta_token_ids)
-                        tool_parser._tool_prefix_injected_to_delta = True
+                        stream_delta_token_ids = list(prefix_ids) + list(stream_delta_token_ids)
+                    tool_parser._tool_prefix_injected_to_delta = True
             tool_call_delta_message = tool_parser.extract_tool_calls_streaming(
                 stream_previous,
                 stream_current,
diff --git a/tests/input/test_text_processor.py b/tests/input/test_text_processor.py
index 20661d8b0ab..0efa4e9f7fc 100644
--- a/tests/input/test_text_processor.py
+++ b/tests/input/test_text_processor.py
@@ -573,7 +573,7 @@ def test_process_response_with_reasoning_and_tools(self):
         }
 
         processed = processor.process_response_dict(
-            response, stream=False, request=SimpleNamespace(tool_choice="none")
+            response, stream=False, request=SimpleNamespace(chat_template_kwargs=None)
         )
         self.assertEqual(processed["outputs"]["reasoning_content"], "think")
         self.assertEqual(processed["outputs"]["tool_calls"], ["tool"])
@@ -602,7 +602,7 @@ def test_process_response_streaming_with_reasoning_and_tools(self):
         }
 
         result = processor.process_response_dict_streaming(
-            response, enable_thinking=True, request=SimpleNamespace(tool_choice="none")
+            response, enable_thinking=True, request=SimpleNamespace(chat_template_kwargs=None)
         )
         self.assertEqual(result["outputs"]["completion_tokens"], "7")
         self.assertEqual(result["outputs"]["text"], "tool-text")
@@ -622,7 +622,7 @@ def test_process_response_dict_normal_with_reasoning(self):
         }
 
         result = processor.process_response_dict_normal(
-            response, enable_thinking=True, request=SimpleNamespace(tool_choice="none")
+            response, enable_thinking=True, request=SimpleNamespace(chat_template_kwargs=None)
         )
         self.assertEqual(result["outputs"]["completion_tokens"], "7")
         self.assertEqual(result["outputs"]["reasoning_content"], "because")
@@ -761,72 +761,6 @@ def custom_convert(tokens):
         self.assertEqual(processor.update_bad_words(["combo", "oversize"], []), [])
 
 
-class IsForcedToolChoiceTest(unittest.TestCase):
-    """Tests for the module-level ``_is_forced_tool_choice`` helper.
-
-    The helper takes a request-like object (something with ``tool_choice``
-    and ``chat_template_kwargs`` attributes) and returns whether the chat
-    template will inject a tool-call prefix.
-    """
-
-    def setUp(self):
-        from fastdeploy.input import base_processor
-
-        self._is_forced = base_processor._is_forced_tool_choice
-
-    def _req(self, *, tool_choice=None, chat_template_kwargs=None):
-        return SimpleNamespace(
-            tool_choice=tool_choice,
-            chat_template_kwargs=chat_template_kwargs,
-        )
-
-    def test_string_tool_choice_never_forces(self):
-        # Plain string tool_choice values do NOT cause the chat template to
-        # inject a tool-call prefix, even when the value is ``"required"``.
-        self.assertFalse(self._is_forced(self._req(tool_choice="required")))
-        self.assertFalse(self._is_forced(self._req(tool_choice="auto")))
-        self.assertFalse(self._is_forced(self._req(tool_choice="none")))
-        self.assertFalse(self._is_forced(self._req(tool_choice="")))
-
-    def test_pydantic_named_tool_choice(self):
-        named = SimpleNamespace(type="function", function=SimpleNamespace(name="f"))
-        self.assertTrue(self._is_forced(self._req(tool_choice=named)))
-
-    def test_pydantic_other_type(self):
-        self.assertFalse(self._is_forced(self._req(tool_choice=SimpleNamespace(type="other"))))
-        self.assertFalse(self._is_forced(self._req(tool_choice=SimpleNamespace())))
-
-    def test_no_tool_choice_no_options(self):
-        self.assertFalse(self._is_forced(self._req()))
-
-    def test_chat_template_options_force_mode(self):
-        kwargs = {
-            "options": {
-                "tool_choice": {"mode": "force", "name": "get_current_weather"},
-            }
-        }
-        self.assertTrue(self._is_forced(self._req(chat_template_kwargs=kwargs)))
-
-    def test_chat_template_options_non_force_mode(self):
-        kwargs = {"options": {"tool_choice": {"mode": "auto"}}}
-        self.assertFalse(self._is_forced(self._req(chat_template_kwargs=kwargs)))
-
-    def test_chat_template_options_missing_tool_choice(self):
-        self.assertFalse(self._is_forced(self._req(chat_template_kwargs={"options": {}})))
-        self.assertFalse(self._is_forced(self._req(chat_template_kwargs={})))
-
-    def test_chat_template_options_malformed(self):
-        # Non-dict options/inner must be tolerated (no crash, returns False).
-        self.assertFalse(self._is_forced(self._req(chat_template_kwargs={"options": "x"})))
-        self.assertFalse(self._is_forced(self._req(chat_template_kwargs={"options": {"tool_choice": "x"}})))
-
-    def test_tool_choice_takes_priority_over_options(self):
-        kwargs = {"options": {"tool_choice": {"mode": "force"}}}
-        # Named-tool pydantic choice combined with options.force still forces.
-        named = SimpleNamespace(type="function", function=SimpleNamespace(name="f"))
-        self.assertTrue(self._is_forced(self._req(tool_choice=named, chat_template_kwargs=kwargs)))
-
-
 class _RecordingToolParser:
     """Minimal tool parser that records inputs and exposes the prefix-state
     fields the serving layer reads/writes."""
@@ -885,8 +819,9 @@ def extract_tool_calls_streaming(
 
 class ToolPrefixCompensationTest(unittest.TestCase):
     """Tests for the forced-tool-call prefix compensation logic in
-    ``BaseTextProcessor`` (named-tool ``tool_choice`` and
-    ``chat_template_kwargs.options.tool_choice.mode == "force"``)."""
+    ``BaseTextProcessor``. Splicing is driven entirely by whether the
+    rendered prompt ends with an unclosed tool-call start token, not by
+    request parameter introspection."""
 
     def setUp(self):
         module, cleanup = _import_text_processor()
@@ -928,7 +863,9 @@ def test_prepare_tool_prefix_handles_exception(self):
         self.assertTrue(parser._tool_prefix_computed)
         self.assertEqual(parser._tool_prefix, "")
 
-    def test_normal_path_splices_prefix_when_required(self):
+    def test_normal_path_splices_prefix_when_prompt_has_prefix(self):
+        """Prompt ending with an unclosed tool-call start triggers splicing,
+        regardless of how the user requested it."""
         processor = self.processor
         parser = _RecordingToolParser(processor.tokenizer, tool_prefix="<tool_call>")
         processor.tool_parser_obj = self._make_parser_factory(parser)
@@ -938,14 +875,9 @@ def test_normal_path_splices_prefix_when_required(self):
             "finished": True,
             "outputs": {"token_ids": [7, processor.tokenizer.eos_token_id]},
         }
-        # Named-tool pydantic choice triggers prefix injection.
-        request = SimpleNamespace(
-            tool_choice=SimpleNamespace(type="function", function=SimpleNamespace(name="f")),
-        )
-
         processor.process_response_dict_normal(
             response,
-            request=request,
+            request=SimpleNamespace(chat_template_kwargs=None),
             prompt_tokens="user msg\n<tool_call>",
         )
         self.assertEqual(len(parser.extract_calls), 1)
@@ -953,7 +885,8 @@ def test_normal_path_splices_prefix_when_required(self):
         self.assertTrue(parser.extract_calls[0].startswith("<tool_call>"))
         self.assertEqual(response["outputs"]["tool_calls"], ["tc"])
 
-    def test_normal_path_no_splice_when_not_required(self):
+    def test_normal_path_no_splice_when_prompt_lacks_prefix(self):
+        """No prefix in prompt tail => detect returns "" => no splice."""
         processor = self.processor
         parser = _RecordingToolParser(processor.tokenizer, tool_prefix="<tool_call>")
         processor.tool_parser_obj = self._make_parser_factory(parser)
@@ -963,70 +896,20 @@ def test_normal_path_no_splice_when_not_required(self):
             "finished": True,
             "outputs": {"token_ids": [7, processor.tokenizer.eos_token_id]},
         }
-        request = SimpleNamespace(tool_choice="auto")
-
         processor.process_response_dict_normal(
             response,
-            request=request,
-            prompt_tokens="user msg\n<tool_call>",
+            request=SimpleNamespace(chat_template_kwargs=None),
+            prompt_tokens="user msg without sentinel",
         )
-        # detect_tool_prefix must NOT be called for non-forced choices.
-        self.assertEqual(parser.detect_calls, [])
+        # detect_tool_prefix is called, but returns "" => no prefix prepended.
+        self.assertEqual(len(parser.detect_calls), 1)
         self.assertFalse(parser.extract_calls[0].startswith("<tool_call>"))
 
-    def test_normal_path_named_tool_choice_pydantic(self):
-        """A pydantic ``ChatCompletionNamedToolChoiceParam`` (duck-typed via
-        ``type='function'``) must also trigger prefix splicing."""
-        processor = self.processor
-        parser = _RecordingToolParser(processor.tokenizer, tool_prefix="<tool_call>")
-        processor.tool_parser_obj = self._make_parser_factory(parser)
-
-        response = {
-            "request_id": "req-named",
-            "finished": True,
-            "outputs": {"token_ids": [7, processor.tokenizer.eos_token_id]},
-        }
-        request = SimpleNamespace(tool_choice=SimpleNamespace(type="function", function=SimpleNamespace(name="f")))
-
-        processor.process_response_dict_normal(
-            response,
-            request=request,
-            prompt_tokens="user msg\n<tool_call>",
-        )
-        self.assertTrue(parser.extract_calls[0].startswith("<tool_call>"))
-
-    def test_normal_path_chat_template_force_mode(self):
-        """Forcing through ``chat_template_kwargs.options.tool_choice.mode``
-        must also trigger prefix splicing even when ``tool_choice`` is unset
-        (the default ``"none"``)."""
-        processor = self.processor
-        parser = _RecordingToolParser(processor.tokenizer, tool_prefix="<tool_call>")
-        processor.tool_parser_obj = self._make_parser_factory(parser)
-
-        response = {
-            "request_id": "req-cti",
-            "finished": True,
-            "outputs": {"token_ids": [7, processor.tokenizer.eos_token_id]},
-        }
-        request = SimpleNamespace(
-            tool_choice="none",
-            chat_template_kwargs={"options": {"tool_choice": {"mode": "force", "name": "get_current_weather"}}},
-        )
-
-        processor.process_response_dict_normal(
-            response,
-            request=request,
-            prompt_tokens="user msg\n<tool_call>",
-        )
-        self.assertTrue(parser.extract_calls[0].startswith("<tool_call>"))
-
     def test_streaming_path_splices_prefix_only_on_first_delta(self):
         processor = self.processor
         parser = _RecordingToolParser(processor.tokenizer, tool_prefix="<tool_call>")
         processor.tool_parser_obj = self._make_parser_factory(parser)
-        request = SimpleNamespace(
-            tool_choice=SimpleNamespace(type="function", function=SimpleNamespace(name="f")),
-        )
+        request = SimpleNamespace(chat_template_kwargs=None)
         prompt_tokens = "user msg\n<tool_call>"
 
         # First chunk
@@ -1070,12 +953,11 @@ def test_streaming_path_splices_prefix_only_on_first_delta(self):
 
     def test_streaming_path_no_splice_when_no_prefix_detected(self):
         processor = self.processor
-        # Empty configured prefix => detect returns "" even when forced.
+        # Empty configured prefix => detect returns "" even when the prompt
+        # looks like a forced rendering.
         parser = _RecordingToolParser(processor.tokenizer, tool_prefix="")
         processor.tool_parser_obj = self._make_parser_factory(parser)
-        request = SimpleNamespace(
-            tool_choice=SimpleNamespace(type="function", function=SimpleNamespace(name="f")),
-        )
+        request = SimpleNamespace(chat_template_kwargs=None)
 
         first = {
             "finished": False,

From 2b12c9a3fbe19a015c160b9cfa2c639d7a75949e Mon Sep 17 00:00:00 2001
From: luukunn <981429396@qq.com>
Date: Fri, 22 May 2026 15:30:55 +0800
Subject: [PATCH 09/10] fix review

---
 fastdeploy/engine/common_engine.py |  6 +++++-
 fastdeploy/input/base_processor.py | 19 +++++++++++++++----
 tests/engine/test_common_engine.py | 13 +++++++++----
 3 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/fastdeploy/engine/common_engine.py b/fastdeploy/engine/common_engine.py
index d3f27122dba..494d2248380 100644
--- a/fastdeploy/engine/common_engine.py
+++ b/fastdeploy/engine/common_engine.py
@@ -1909,7 +1909,11 @@ def _send_error_response(self, request_id, error_msg, error_code: int = 500, wor
     def _decode_token(self, token_ids, req_id, is_end):
         delta_text = ""
         if envs.FD_ENABLE_RETURN_TEXT:
-            delta_text, cum_tokens, _ = self.data_processor.ids2tokens(token_ids, req_id)
+            delta_text, previous_token_ids, _ = self.data_processor.ids2tokens(token_ids, req_id)
+            # Reconstruct the post-extend cumulative list from the pre-delta
+            # snapshot + this call's input — ``ids2tokens`` only returns the
+            # snapshot to keep its return values aliasing-free.
+            cum_tokens = previous_token_ids + list(token_ids)
             if delta_text != "":
                 prefix_offset = self.data_processor.decode_status[req_id][0]
                 read_offset = self.data_processor.decode_status[req_id][1]
diff --git a/fastdeploy/input/base_processor.py b/fastdeploy/input/base_processor.py
index 4baad8ee22e..21c30fb19d7 100644
--- a/fastdeploy/input/base_processor.py
+++ b/fastdeploy/input/base_processor.py
@@ -214,9 +214,16 @@ def ids2tokens(self, token_id, task_id):
         Returns:
             (delta_text, previous_token_ids, previous_texts)
 
-        Both the HF and the PaddleFormers/ERNIE tokeniser paths return the
-        same tuple shape.  The HF path sets ``previous_token_ids`` to ``[]``
-        since it does not expose per-token ids during batch-decode.
+        ``previous_token_ids`` and ``previous_texts`` are **snapshots of the
+        accumulated state BEFORE this call's tokens were appended** —
+        symmetric pre-delta views of what the caller had decoded so far.
+        Both are owned by the caller (no aliasing of internal state).
+
+        Callers that need the post-extend cumulative list should reconstruct
+        it locally via ``previous_token_ids + list(token_id)``.
+
+        The HF path returns ``[]`` for ``previous_token_ids`` (no per-token ids
+        from batch-decode) and the post-delta full string for ``previous_texts``.
         """
         if envs.FD_USE_HF_TOKENIZER:
             if task_id not in self.decode_status:
@@ -235,7 +242,9 @@ def ids2tokens(self, token_id, task_id):
                 status[2] = decode_str[0]
             else:
                 new_str = ""
-            # Return consistent three-tuple; previous_token_ids not available.
+            # NOTE: HF path historically returns the post-delta full string
+            # here, inconsistent with the non-HF branch (which returns the
+            # pre-delta snapshot). Preserved as-is to avoid behavior change.
             return new_str, [], status[2]
         else:
             if task_id not in self.decode_status:
@@ -243,6 +252,8 @@ def ids2tokens(self, token_id, task_id):
                 self.decode_status[task_id] = [0, 0, [], ""]
             status = self.decode_status[task_id]
             previous_texts = status[3]
+            # Snapshot BEFORE extend so the returned list is owned by the
+            # caller and symmetric with ``previous_texts``.
             previous_token_ids = list(status[2])
             status[2].extend(token_id)
             decode_str, prefix_offset, read_offset = self.tokenizer.decode_token(status[2], status[0], status[1])
diff --git a/tests/engine/test_common_engine.py b/tests/engine/test_common_engine.py
index a3487133bcb..3f05797a37a 100644
--- a/tests/engine/test_common_engine.py
+++ b/tests/engine/test_common_engine.py
@@ -752,7 +752,9 @@ def __init__(self):
                 self.decode_status = {"rid": (0, 2)}
 
             def ids2tokens(self, token_ids, req_id):
-                return "hi", [101, 102], None
+                # previous_token_ids snapshot is empty (first call); engine
+                # reconstructs cum = previous + input = [101, 102].
+                return "hi", [], None
 
         eng.data_processor = DummyProcessor()
 
@@ -782,7 +784,8 @@ def __init__(self):
                 self.decode_status = {"rid": (0, 1)}
 
             def ids2tokens(self, token_ids, req_id):
-                return "", [7], None
+                # previous snapshot is empty; cum becomes [7].
+                return "", [], None
 
         eng.data_processor = DummyProcessor()
 
@@ -1975,7 +1978,8 @@ def __init__(self):
                 self.decode_status = {"rid": (0, 2)}
 
             def ids2tokens(self, token_ids, req_id):
-                return "hi", [1, 2], None
+                # previous snapshot empty; cum = [] + [1, 2] = [1, 2].
+                return "hi", [], None
 
         eng.data_processor = DummyProcessor()
 
@@ -3453,7 +3457,8 @@ def __init__(self):
                 self.decode_status = {"tok-req": (1, 3)}
 
             def ids2tokens(self, token_ids, req_id):
-                return "hello", [10, 20, 30], None
+                # previous snapshot empty; cum = [] + [10, 20, 30].
+                return "hello", [], None
 
         eng.data_processor = DummyProcessor()
 

From 0cafd3e8d1a26433608b23e6093f06ef0925c659 Mon Sep 17 00:00:00 2001
From: luukunn <981429396@qq.com>
Date: Fri, 22 May 2026 17:38:37 +0800
Subject: [PATCH 10/10] fix unit test

---
 tests/input/test_text_processor.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tests/input/test_text_processor.py b/tests/input/test_text_processor.py
index 0efa4e9f7fc..940fe51ec46 100644
--- a/tests/input/test_text_processor.py
+++ b/tests/input/test_text_processor.py
@@ -332,6 +332,13 @@ def create_dummy_tool_parser(tokenizer, content="tool-text"):
         class DummyToolParser:
             def __init__(self, tokenizer):
                 self.tokenizer = tokenizer
+                self._tool_prefix = ""
+                self._tool_prefix_token_ids = []
+                self._tool_prefix_computed = False
+                self._tool_prefix_injected_to_delta = False
+
+            def detect_tool_prefix(self, prompt):
+                return ""
 
             def extract_tool_calls(self, full_text, response_dict):
                 # 模拟工具调用解析，返回固定的工具调用数据用于测试