openvinotoolkit · dkalinowski · Jun 17, 2026
diff --git a/demos/continuous_batching/accuracy/gorilla.patch b/demos/continuous_batching/accuracy/gorilla.patch
@@ -1,8 +1,68 @@
+diff --git a/berkeley-function-call-leaderboard/bfcl_eval/__main__.py b/berkeley-function-call-leaderboard/bfcl_eval/__main__.py
+index 3832e7e..61901b2 100644
+--- a/berkeley-function-call-leaderboard/bfcl_eval/__main__.py
++++ b/berkeley-function-call-leaderboard/bfcl_eval/__main__.py
+@@ -118,6 +118,11 @@ def generate(
+         "--exclude-state-log",
+         help="Exclude info about the state of each API system after each turn in the inference log; only relevant for multi-turn categories.",
+     ),
++    include_verbose_log: bool = typer.Option(
++        False,
++        "--include-verbose-log",
++        help="Include the __verbose field from model server responses (e.g. OVMS) in the result output; useful for debugging generation settings, prompts, and timings.",
++    ),
+     num_gpus: int = typer.Option(1, help="The number of GPUs to use."),
+     num_threads: Optional[int] = typer.Option(None, help="The number of threads to use."),
+     gpu_memory_utilization: float = typer.Option(0.9, help="The GPU memory utilization."),
+@@ -159,6 +164,7 @@ def generate(
+         temperature=temperature,
+         include_input_log=include_input_log,
+         exclude_state_log=exclude_state_log,
++        include_verbose_log=include_verbose_log,
+         num_gpus=num_gpus,
+         num_threads=num_threads,
+         gpu_memory_utilization=gpu_memory_utilization,
+diff --git a/berkeley-function-call-leaderboard/bfcl_eval/_llm_response_generation.py b/berkeley-function-call-leaderboard/bfcl_eval/_llm_response_generation.py
+index c9cbe09..6504eb1 100644
+--- a/berkeley-function-call-leaderboard/bfcl_eval/_llm_response_generation.py
++++ b/berkeley-function-call-leaderboard/bfcl_eval/_llm_response_generation.py
+@@ -165,13 +165,13 @@ def collect_test_cases(args, model_name, all_test_categories, all_test_entries_i
+     return sorted(test_cases_to_generate, key=sort_key)
+
+
+-def multi_threaded_inference(handler, test_case, include_input_log, exclude_state_log):
++def multi_threaded_inference(handler, test_case, include_input_log, exclude_state_log, include_verbose_log=False):
+
+     assert type(test_case["function"]) is list
+
+     try:
+         result, metadata = handler.inference(
+-            test_case, include_input_log, exclude_state_log
++            test_case, include_input_log, exclude_state_log, include_verbose_log
+         )
+     except Exception as e:
+         # This is usually the case when the model getting stuck on one particular test case.
+@@ -284,6 +284,7 @@ def generate_results(args, model_name, test_cases_total):
+                     test_case,
+                     args.include_input_log,
+                     args.exclude_state_log,
++                    getattr(args, "include_verbose_log", False),
+                 )
+                 in_flight[future] = test_case_id
+
+@@ -320,6 +321,7 @@ def generate_results(args, model_name, test_cases_total):
+                         test_case,
+                         args.include_input_log,
+                         args.exclude_state_log,
++                        getattr(args, "include_verbose_log", False),
+                     )
+                     in_flight[future] = test_case_id
+
 diff --git a/berkeley-function-call-leaderboard/bfcl_eval/constants/model_config.py b/berkeley-function-call-leaderboard/bfcl_eval/constants/model_config.py
-index bb625d2..3ab2856 100644
+index bb625d2..7204adb 100644
 --- a/berkeley-function-call-leaderboard/bfcl_eval/constants/model_config.py
 +++ b/berkeley-function-call-leaderboard/bfcl_eval/constants/model_config.py
-@@ -2153,6 +2153,42 @@ third_party_inference_model_map = {
+@@ -2153,6 +2153,30 @@ third_party_inference_model_map = {
          is_fc_model=True,
          underscore_to_dot=True,
      ),
@@ -29,18 +89,6 @@ index bb625d2..3ab2856 100644
 +        output_price=None,
 +        is_fc_model=True,
 +        underscore_to_dot=True,
-+    ),
-+    "ovms-model-responses": ModelConfig(
-+        model_name="ovms-model-responses",
-+        display_name="ovms-model-responses",
-+        url="http://localhost:8000/v3",
-+        org="ovms",
-+        license="apache-2.0",
-+        model_handler=OpenAIResponsesHandler,
-+        input_price=None,
-+        output_price=None,
-+        is_fc_model=True,
-+        underscore_to_dot=True,
 +    ),
  }
 
@@ -72,50 +120,18 @@ index 357584f..e45e12c 100644
              "store": False,
          }
 
-diff --git a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/openai_response.py b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/openai_response.py
-index 0953fdd..fffcc6c 100644
---- a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/openai_response.py
-+++ b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/openai_response.py
-@@ -38,10 +38,10 @@ class OpenAIResponsesHandler(BaseHandler):
-
-         kwargs = {}
-
--        if api_key := os.getenv("OPENAI_API_KEY"):
-+        if api_key := os.getenv("OPENAI_API_KEY","unused"):
-             kwargs["api_key"] = api_key
-
--        if base_url := os.getenv("OPENAI_BASE_URL"):
-+        if base_url := os.getenv("OPENAI_BASE_URL","http://localhost:8000/v3"):
-             kwargs["base_url"] = base_url
-
-         if headers_env := os.getenv("OPENAI_DEFAULT_HEADERS"):
-@@ -103,6 +103,9 @@ class OpenAIResponsesHandler(BaseHandler):
-             "include": ["reasoning.encrypted_content"],
-             "reasoning": {"summary": "auto"},
-             "temperature": self.temperature,
-+            "max_output_tokens": 2048,
-+            "tool_choice": os.getenv("TOOL_CHOICE", "auto"),
-+            "extra_body": {"chat_template_kwargs": json.loads(os.getenv("CHAT_TEMPLATE_KWARGS", "{}"))},
-         }
-
-         # OpenAI reasoning models don't support temperature parameter
-@@ -222,6 +225,7 @@ class OpenAIResponsesHandler(BaseHandler):
-             "include": ["reasoning.encrypted_content"],
-             "reasoning": {"summary": "auto"},
-             "temperature": self.temperature,
-+            "extra_body": {"chat_template_kwargs": json.loads(os.getenv("CHAT_TEMPLATE_KWARGS", "{}"))},
-         }
-
-         # OpenAI reasoning models don't support temperature parameter
 diff --git a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/qwen.py b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/qwen.py
-index 10f1a08..b67d39c 100644
+index 10f1a08..50890c7 100644
 --- a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/qwen.py
 +++ b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/qwen.py
-@@ -1,3 +1,4 @@
+@@ -7,6 +7,7 @@ from openai import OpenAI
+ from overrides import override
+ from qwen_agent.llm import get_chat_model
+ import time
 +import json
- import os
- from typing import Any
 
+ class QwenAPIHandler(OpenAICompletionsHandler):
+     """
 @@ -28,8 +29,8 @@ class QwenAPIHandler(OpenAICompletionsHandler):
          super().__init__(model_name, temperature, registry_name, is_fc_model, **kwargs)
          self.model_style = ModelStyle.OPENAI_COMPLETIONS
@@ -127,7 +143,7 @@ index 10f1a08..b67d39c 100644
          )
 
      #### FC methods ####
-@@ -45,9 +46,10 @@ class QwenAPIHandler(OpenAICompletionsHandler):
+@@ -45,9 +46,9 @@ class QwenAPIHandler(OpenAICompletionsHandler):
              model=self.model_name.replace("-FC", ""),
              tools=tools,
              parallel_tool_calls=True,
@@ -137,14 +153,214 @@ index 10f1a08..b67d39c 100644
 +            max_completion_tokens=2048,
 +            tool_choice=os.getenv("TOOL_CHOICE", "auto"),
 +            extra_body={"chat_template_kwargs": json.loads(os.getenv("CHAT_TEMPLATE_KWARGS", "{}"))},
-+            temperature=self.temperature,
              stream=True,
              stream_options={
                  "include_usage": True
-@@ -352,4 +354,4 @@ class QwenAgentNoThinkHandler(QwenAgentThinkHandler):
+@@ -352,4 +353,4 @@ class QwenAgentNoThinkHandler(QwenAgentThinkHandler):
              'timeout': 1000,
              'max_tokens': 16384
          }
 -    })
 \ No newline at end of file
 +    })
+diff --git a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/base_handler.py b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/base_handler.py
+index a1025e9..fed8c99 100644
+--- a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/base_handler.py
++++ b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/base_handler.py
+@@ -70,6 +70,7 @@ class BaseHandler:
+         test_entry: dict,
+         include_input_log: bool,
+         exclude_state_log: bool,
++        include_verbose_log: bool = False,
+     ):
+         # This method is used to retrive model response for each model.
+
+@@ -78,18 +79,18 @@ class BaseHandler:
+         if "FC" in self.registry_name or self.is_fc_model:
+             if contain_multi_turn_interaction(test_entry["id"]):
+                 return self.inference_multi_turn_FC(
+-                    test_entry, include_input_log, exclude_state_log
++                    test_entry, include_input_log, exclude_state_log, include_verbose_log
+                 )
+             else:
+-                return self.inference_single_turn_FC(test_entry, include_input_log)
++                return self.inference_single_turn_FC(test_entry, include_input_log, include_verbose_log)
+         # Prompting model
+         else:
+             if contain_multi_turn_interaction(test_entry["id"]):
+                 return self.inference_multi_turn_prompting(
+-                    test_entry, include_input_log, exclude_state_log
++                    test_entry, include_input_log, exclude_state_log, include_verbose_log
+                 )
+             else:
+-                return self.inference_single_turn_prompting(test_entry, include_input_log)
++                return self.inference_single_turn_prompting(test_entry, include_input_log, include_verbose_log)
+
+     @final
+     def inference_multi_turn_FC(
+@@ -97,6 +98,7 @@ class BaseHandler:
+         test_entry: dict,
+         include_input_log: bool,
+         exclude_state_log: bool,
++        include_verbose_log: bool = False,
+     ) -> tuple[list[list], dict]:
+         initial_config: dict = test_entry.get("initial_config", {})
+         involved_classes: list = test_entry["involved_classes"]
+@@ -119,6 +121,7 @@ class BaseHandler:
+         force_quit = False  # Whether the model has been forced to quit. If True, this whole entry will be failed.
+
+         all_reasoning_content: list[list] = []
++        all_verbose_log: list[list[dict]] = []
+
+         # Execute no function call, but just to get a reference to all the instances to get the initial state for logging purpose
+         _, involved_instances = execute_multi_turn_func_call(
+@@ -206,6 +209,7 @@ class BaseHandler:
+             current_turn_output_token_count: list[float] = []
+             current_turn_latency: list[float] = []
+             current_turn_reasoning_content = []
++            current_turn_verbose_log: list[dict] = []
+
+             count = 0
+             while True:
+@@ -219,6 +223,11 @@ class BaseHandler:
+
+                 api_response, query_latency = self._query_FC(inference_data)
+
++                if include_verbose_log and hasattr(api_response, "model_extra") and api_response.model_extra:
++                    verbose_data = api_response.model_extra.get("__verbose")
++                    if verbose_data:
++                        current_turn_verbose_log.append({k: verbose_data[k] for k in ("prompt", "content") if k in verbose_data})
++
+                 # This part of logging is disabled by default because it is too verbose and will make the result file extremely large
+                 # It is only useful to see if the inference pipeline is working as expected (eg, does it convert all the inputs correctly)
+                 if include_input_log:
+@@ -335,6 +344,7 @@ class BaseHandler:
+             all_model_response.append(current_turn_response)
+             all_inference_log.append(current_turn_inference_log)
+             all_reasoning_content.append(current_turn_reasoning_content)
++            all_verbose_log.append(current_turn_verbose_log)
+             total_input_token_count.append(current_turn_input_token_count)
+             total_output_token_count.append(current_turn_output_token_count)
+             total_latency.append(current_turn_latency)
+@@ -388,6 +398,9 @@ class BaseHandler:
+         ):
+             metadata["reasoning_content"] = all_reasoning_content
+
++        if include_verbose_log and any(turn_log for turn_log in all_verbose_log):
++            metadata["__verbose"] = all_verbose_log
++
+         return all_model_response, metadata
+
+     @final
+@@ -396,6 +409,7 @@ class BaseHandler:
+         test_entry: dict,
+         include_input_log: bool,
+         exclude_state_log: bool,
++        include_verbose_log: bool = False,
+     ) -> tuple[list[list], dict]:
+         initial_config: dict = test_entry.get("initial_config", {})
+         involved_classes: list = test_entry["involved_classes"]
+@@ -415,6 +429,7 @@ class BaseHandler:
+         all_reasoning_content: list[list] = []
+         # The debugging log for human to understand
+         all_inference_log: list[list[dict]] = []
++        all_verbose_log: list[list[dict]] = []
+         force_quit = False  # Whether the model has been forced to quit. If True, this whole entry will be failed.
+
+         # Execute no function call, but just to get a reference to all the instances to get the initial state for logging purpose
+@@ -498,6 +513,7 @@ class BaseHandler:
+             current_turn_input_token_count: list[float] = []
+             current_turn_output_token_count: list[float] = []
+             current_turn_latency: list[float] = []
++            current_turn_verbose_log: list[dict] = []
+
+             count = 0
+             while True:
+@@ -511,6 +527,11 @@ class BaseHandler:
+
+                 api_response, query_latency = self._query_prompting(inference_data)
+
++                if include_verbose_log and hasattr(api_response, "model_extra") and api_response.model_extra:
++                    verbose_data = api_response.model_extra.get("__verbose")
++                    if verbose_data:
++                        current_turn_verbose_log.append({k: verbose_data[k] for k in ("prompt", "content") if k in verbose_data})
++
+                 # This part of logging is disabled by default because it is too verbose and will make the result file extremely large
+                 # It is only useful to see if the inference pipeline is working as expected (eg, does it convert all the inputs correctly)
+                 if include_input_log:
+@@ -626,6 +647,7 @@ class BaseHandler:
+             all_model_response.append(current_turn_response)
+             all_reasoning_content.append(current_turn_reasoning_content)
+             all_inference_log.append(current_turn_inference_log)
++            all_verbose_log.append(current_turn_verbose_log)
+             total_input_token_count.append(current_turn_input_token_count)
+             total_output_token_count.append(current_turn_output_token_count)
+             total_latency.append(current_turn_latency)
+@@ -679,11 +701,14 @@ class BaseHandler:
+         ):
+             metadata["reasoning_content"] = all_reasoning_content
+
++        if include_verbose_log and any(turn_log for turn_log in all_verbose_log):
++            metadata["__verbose"] = all_verbose_log
++
+         return all_model_response, metadata
+
+     @final
+     def inference_single_turn_FC(
+-        self, test_entry: dict, include_input_log: bool
++        self, test_entry: dict, include_input_log: bool, include_verbose_log: bool = False
+     ) -> tuple[any, dict]:
+         inference_data: dict = {}
+         inference_data = self._pre_query_processing_FC(inference_data, test_entry)
+@@ -716,11 +741,16 @@ class BaseHandler:
+         ):
+             metadata["reasoning_content"] = model_response_data["reasoning_content"]
+
++        if include_verbose_log and hasattr(api_response, "model_extra") and api_response.model_extra:
++            verbose_data = api_response.model_extra.get("__verbose")
++            if verbose_data:
++                metadata["__verbose"] = {k: verbose_data[k] for k in ("prompt", "content") if k in verbose_data}
++
+         return model_response_data["model_responses"], metadata
+
+     @final
+     def inference_single_turn_prompting(
+-        self, test_entry: dict, include_input_log: bool
++        self, test_entry: dict, include_input_log: bool, include_verbose_log: bool = False
+     ) -> tuple[any, dict]:
+         inference_data: dict = self._pre_query_processing_prompting(test_entry)
+         inference_data = self.add_first_turn_message_prompting(
+@@ -751,6 +781,11 @@ class BaseHandler:
+         ):
+             metadata["reasoning_content"] = model_response_data["reasoning_content"]
+
++        if include_verbose_log and hasattr(api_response, "model_extra") and api_response.model_extra:
++            verbose_data = api_response.model_extra.get("__verbose")
++            if verbose_data:
++                metadata["__verbose"] = {k: verbose_data[k] for k in ("prompt", "content") if k in verbose_data}
++
+         return model_response_data["model_responses"], metadata
+
+     def decode_ast(self, result, language: ReturnFormat, has_tool_call_tag: bool):
+diff --git a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/local_inference/base_oss_handler.py b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/local_inference/base_oss_handler.py
+index 961d9bf..6b6504c 100644
+--- a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/local_inference/base_oss_handler.py
++++ b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/local_inference/base_oss_handler.py
+@@ -51,14 +51,15 @@ class OSSHandler(BaseHandler, EnforceOverrides):
+         test_entry: dict,
+         include_input_log: bool,
+         exclude_state_log: bool,
++        include_verbose_log: bool = False,
+     ):
+         # TODO: Let oss model support FC methods as well, depends on their model type
+         if contain_multi_turn_interaction(test_entry["id"]):
+             return self.inference_multi_turn_prompting(
+-                test_entry, include_input_log, exclude_state_log
++                test_entry, include_input_log, exclude_state_log, include_verbose_log
+             )
+         else:
+-            return self.inference_single_turn_prompting(test_entry, include_input_log)
++            return self.inference_single_turn_prompting(test_entry, include_input_log, include_verbose_log)
+
+     @override
+     def decode_ast(self, result, language, has_tool_call_tag):