Skip to content

Commit a2c7d81

Browse files
committed
Broader model compatibility, tool_choice support, bug fixes and cleanup
1 parent 87bbe0f commit a2c7d81

6 files changed

Lines changed: 254 additions & 82 deletions

File tree

backends/exllamav3/model.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1014,8 +1014,21 @@ async def generate_gen(
10141014
if chunk:
10151015
chunk_tokens = result.get("token_ids", self.tokenizer.encode(chunk))
10161016
full_response += chunk
1017+
1018+
# Extract token IDs as a plain list for downstream consumers
10171019
if isinstance(chunk_tokens, torch.Tensor):
1020+
token_id_list = chunk_tokens.flatten().tolist()
10181021
generated_tokens += chunk_tokens.size(dim=0)
1022+
elif isinstance(chunk_tokens, tuple):
1023+
first = chunk_tokens[0]
1024+
if isinstance(first, torch.Tensor):
1025+
token_id_list = first.flatten().tolist()
1026+
else:
1027+
token_id_list = list(first)
1028+
generated_tokens += len(token_id_list)
1029+
else:
1030+
token_id_list = list(chunk_tokens)
1031+
generated_tokens += len(token_id_list)
10191032

10201033
# Increase penalty range to generated token amount
10211034
# TODO:
@@ -1025,6 +1038,7 @@ async def generate_gen(
10251038
generation = {
10261039
"request_id": request_id,
10271040
"text": chunk,
1041+
"token_ids": token_id_list,
10281042
"prompt_tokens": context_len,
10291043
"generated_tokens": generated_tokens,
10301044
"offset": len(full_response),

common/templating.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from jinja2.ext import loopcontrols
1313
from jinja2.sandbox import ImmutableSandboxedEnvironment
1414
from loguru import logger
15+
from markupsafe import Markup
1516
from packaging import version
1617

1718

@@ -33,6 +34,7 @@ class TemplateMetadata:
3334

3435
stop_strings: List[str] = field(default_factory=list)
3536
tool_start: Optional[str] = None
37+
tool_end: Optional[str] = None
3638
tool_call_format: str = "json"
3739

3840

@@ -50,6 +52,22 @@ class PromptTemplate:
5052
)
5153
metadata: Optional[TemplateMetadata] = None
5254

55+
@staticmethod
56+
def _tojson_compat(value, indent=None, ensure_ascii=True):
57+
"""Compatibility JSON filter for chat templates.
58+
59+
Some model templates call ``tojson(ensure_ascii=False)`` while the
60+
bundled Jinja filter may not accept that keyword in sandboxed mode.
61+
"""
62+
return Markup(
63+
json.dumps(
64+
value,
65+
indent=indent,
66+
ensure_ascii=ensure_ascii,
67+
separators=(",", ": "),
68+
)
69+
)
70+
5371
async def extract_metadata(self, template_vars: dict):
5472
"""
5573
Returns deserialized template metadata from a chat template.
@@ -80,6 +98,10 @@ async def extract_metadata(self, template_vars: dict):
8098
if isinstance(template_module.tool_start, str):
8199
template_metadata.tool_start = template_module.tool_start
82100

101+
if hasattr(template_module, "tool_end"):
102+
if isinstance(template_module.tool_end, str):
103+
template_metadata.tool_end = template_module.tool_end
104+
83105
if hasattr(template_module, "tool_call_format"):
84106
fmt = template_module.tool_call_format
85107
if isinstance(fmt, str) and fmt in VALID_TOOL_CALL_FORMATS:
@@ -123,6 +145,7 @@ def raise_exception(message):
123145

124146
self.environment.globals["strftime_now"] = strftime_now
125147
self.environment.globals["raise_exception"] = raise_exception
148+
self.environment.filters["tojson"] = self._tojson_compat
126149

127150
return self.environment.from_string(template_str)
128151

endpoints/OAI/types/chat_completion.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from uuid import uuid4
55

66
from endpoints.OAI.types.common import UsageStats, CommonCompletionRequest
7-
from endpoints.OAI.types.tools import ToolSpec, ToolCall
7+
from endpoints.OAI.types.tools import NamedToolChoice, ToolSpec, ToolCall
88

99

1010
class ChatCompletionLogprob(BaseModel):
@@ -71,6 +71,10 @@ class ChatCompletionRequest(CommonCompletionRequest):
7171

7272
tools: Optional[List[ToolSpec]] = None
7373
functions: Optional[List[Dict]] = None
74+
tool_choice: Optional[
75+
Union[Literal["none", "auto", "required"], NamedToolChoice]
76+
] = None
77+
parallel_tool_calls: Optional[bool] = True
7478

7579
# Chat completions requests do not have a BOS token preference. Backend
7680
# respects the tokenization config for the individual model.

endpoints/OAI/types/tools.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,3 +40,16 @@ class ToolCall(BaseModel):
4040
function: Tool
4141
type: Literal["function"] = "function"
4242
index: Optional[int] = None
43+
44+
45+
class NamedToolFunction(BaseModel):
46+
"""Represents a named function reference for tool_choice."""
47+
48+
name: str
49+
50+
51+
class NamedToolChoice(BaseModel):
52+
"""Represents a named tool choice (forces a specific function call)."""
53+
54+
function: NamedToolFunction
55+
type: Literal["function"] = "function"

endpoints/OAI/utils/chat_completion.py

Lines changed: 36 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
ChatCompletionStreamChoice,
3030
)
3131
from endpoints.OAI.types.common import UsageStats
32-
from endpoints.OAI.types.tools import ToolCall
32+
from endpoints.OAI.types.tools import NamedToolChoice, ToolCall
3333
from endpoints.OAI.utils.completion import _parse_gen_request_id, _stream_collector
3434
from endpoints.OAI.utils.tools import ToolCallProcessor, TOOL_CALL_SCHEMA
3535

@@ -54,6 +54,7 @@ def _create_response(
5454
generations: List[dict],
5555
model_name: Optional[str],
5656
tool_call_format: str = "json",
57+
tool_choice=None,
5758
):
5859
"""Create a chat completion response from the provided text."""
5960

@@ -66,6 +67,10 @@ def _create_response(
6667
tool_calls_raw = generation.get("tool_calls")
6768
if tool_calls_raw:
6869
parsed = ToolCallProcessor.parse(tool_calls_raw, format=tool_call_format)
70+
if parsed and isinstance(tool_choice, NamedToolChoice):
71+
parsed = ToolCallProcessor.filter_by_name(
72+
parsed, tool_choice.function.name
73+
)
6974
if parsed:
7075
message.tool_calls = parsed
7176
else:
@@ -488,7 +493,7 @@ async def stream_generate_chat_completion(
488493
raise CancelledError()
489494

490495
# Handle options if a tool model is present
491-
if tool_start:
496+
if tool_start and data.tool_choice != "none":
492497
if "stop_str" in generation:
493498
generations = await generate_tool_calls(
494499
prompt,
@@ -507,6 +512,10 @@ async def stream_generate_chat_completion(
507512
parsed = ToolCallProcessor.parse(
508513
tool_calls_raw, format=tool_call_format
509514
)
515+
if parsed and isinstance(data.tool_choice, NamedToolChoice):
516+
parsed = ToolCallProcessor.filter_by_name(
517+
parsed, data.tool_choice.function.name
518+
)
510519
if parsed:
511520
for tc_chunk in _build_tool_call_chunks(
512521
parsed,
@@ -616,7 +625,10 @@ async def generate_chat_completion(
616625
generations = await asyncio.gather(*gen_tasks)
617626

618627
# Check all the generations and see if a tool call is required
619-
if tool_start:
628+
force_tool_pass = data.tool_choice == "required" or isinstance(
629+
data.tool_choice, NamedToolChoice
630+
)
631+
if tool_start or force_tool_pass:
620632
generations = await generate_tool_calls(
621633
prompt, embeddings, data, generations, request
622634
)
@@ -626,6 +638,7 @@ async def generate_chat_completion(
626638
generations,
627639
model_path.name,
628640
tool_call_format=tool_call_format,
641+
tool_choice=data.tool_choice,
629642
)
630643

631644
logger.info(f"Finished chat completion request {request.state.id}")
@@ -652,6 +665,10 @@ async def generate_tool_calls(
652665
gen_tasks: List[asyncio.Task] = []
653666
tool_start = model.container.prompt_template.metadata.tool_start
654667
tool_call_format = model.container.prompt_template.metadata.tool_call_format
668+
tool_choice = data.tool_choice
669+
670+
if tool_choice == "none":
671+
return generations
655672

656673
# Tracks which generations asked for a tool call
657674
tool_idx: List[int] = []
@@ -684,29 +701,35 @@ async def generate_tool_calls(
684701
tool_data.json_schema = TOOL_CALL_SCHEMA
685702

686703
for idx, gen in enumerate(generations):
687-
if gen["stop_str"] != tool_start:
704+
stop_str = gen.get("stop_str")
705+
should_generate = stop_str == tool_start
706+
707+
# Force tool generation if tool_choice requires it
708+
if not should_generate and (
709+
tool_choice == "required" or isinstance(tool_choice, NamedToolChoice)
710+
):
711+
should_generate = True
712+
713+
if not should_generate:
688714
continue
689715

690716
logger.info(
691717
f"Detected tool call in chat completion request "
692718
f"{request.state.id} (format={tool_call_format})"
693719
)
694720

695-
# Append the existing generation text if present
721+
# Build per-generation prompt (avoid mutating shared prompt)
722+
tool_prompt = prompt
696723
precursor_text = gen.get("full_text")
697724
if precursor_text:
698-
prompt = prompt + precursor_text
725+
tool_prompt = tool_prompt + precursor_text
699726

700727
# For XML/auto mode: append tool_start back to prompt.
701728
# The stop string was consumed by the first pass and not included
702729
# in full_text, but the model expects to continue after <tool_call>.
703730
# Include a trailing newline to match the canonical template format.
704-
if tool_call_format in ("xml", "auto"):
705-
prompt = prompt + tool_start + "\n"
706-
logger.debug(
707-
f"generate_tool_calls: Appended '{tool_start}\\n' "
708-
f"to prompt for XML continuation"
709-
)
731+
if tool_call_format in ("xml", "auto") and tool_start:
732+
tool_prompt = tool_prompt + tool_start + "\n"
710733

711734
gen_request_id = gen.get("request_id")
712735
tool_request_id = f"{gen_request_id}-tool"
@@ -715,7 +738,7 @@ async def generate_tool_calls(
715738
asyncio.create_task(
716739
model.container.generate(
717740
tool_request_id,
718-
prompt,
741+
tool_prompt,
719742
tool_data,
720743
mm_embeddings=embeddings,
721744
)
@@ -734,10 +757,6 @@ async def generate_tool_calls(
734757
if tool_call_format in ("xml", "auto"):
735758
# Prepend tool_start to reconstruct complete XML for parser
736759
raw_text = tool_start + "\n" + raw_text
737-
logger.debug(
738-
f"generate_tool_calls: Raw XML tool call output "
739-
f"({len(raw_text)} chars): {raw_text[:500]}..."
740-
)
741760

742761
generations[gen_idx]["tool_calls"] = raw_text
743762

0 commit comments

Comments (0)