Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
328 changes: 272 additions & 56 deletions demos/continuous_batching/accuracy/gorilla.patch
Original file line number Diff line number Diff line change
@@ -1,8 +1,68 @@
diff --git a/berkeley-function-call-leaderboard/bfcl_eval/__main__.py b/berkeley-function-call-leaderboard/bfcl_eval/__main__.py
index 3832e7e..61901b2 100644
--- a/berkeley-function-call-leaderboard/bfcl_eval/__main__.py
+++ b/berkeley-function-call-leaderboard/bfcl_eval/__main__.py
@@ -118,6 +118,11 @@ def generate(
"--exclude-state-log",
help="Exclude info about the state of each API system after each turn in the inference log; only relevant for multi-turn categories.",
),
+ include_verbose_log: bool = typer.Option(
+ False,
+ "--include-verbose-log",
+ help="Include the __verbose field from model server responses (e.g. OVMS) in the result output; useful for debugging generation settings, prompts, and timings.",
+ ),
num_gpus: int = typer.Option(1, help="The number of GPUs to use."),
num_threads: Optional[int] = typer.Option(None, help="The number of threads to use."),
gpu_memory_utilization: float = typer.Option(0.9, help="The GPU memory utilization."),
@@ -159,6 +164,7 @@ def generate(
temperature=temperature,
include_input_log=include_input_log,
exclude_state_log=exclude_state_log,
+ include_verbose_log=include_verbose_log,
num_gpus=num_gpus,
num_threads=num_threads,
gpu_memory_utilization=gpu_memory_utilization,
diff --git a/berkeley-function-call-leaderboard/bfcl_eval/_llm_response_generation.py b/berkeley-function-call-leaderboard/bfcl_eval/_llm_response_generation.py
index c9cbe09..6504eb1 100644
--- a/berkeley-function-call-leaderboard/bfcl_eval/_llm_response_generation.py
+++ b/berkeley-function-call-leaderboard/bfcl_eval/_llm_response_generation.py
@@ -165,13 +165,13 @@ def collect_test_cases(args, model_name, all_test_categories, all_test_entries_i
return sorted(test_cases_to_generate, key=sort_key)


-def multi_threaded_inference(handler, test_case, include_input_log, exclude_state_log):
+def multi_threaded_inference(handler, test_case, include_input_log, exclude_state_log, include_verbose_log=False):

assert type(test_case["function"]) is list

try:
result, metadata = handler.inference(
- test_case, include_input_log, exclude_state_log
+ test_case, include_input_log, exclude_state_log, include_verbose_log
)
except Exception as e:
# This is usually the case when the model getting stuck on one particular test case.
@@ -284,6 +284,7 @@ def generate_results(args, model_name, test_cases_total):
test_case,
args.include_input_log,
args.exclude_state_log,
+ getattr(args, "include_verbose_log", False),
)
in_flight[future] = test_case_id

@@ -320,6 +321,7 @@ def generate_results(args, model_name, test_cases_total):
test_case,
args.include_input_log,
args.exclude_state_log,
+ getattr(args, "include_verbose_log", False),
)
in_flight[future] = test_case_id

diff --git a/berkeley-function-call-leaderboard/bfcl_eval/constants/model_config.py b/berkeley-function-call-leaderboard/bfcl_eval/constants/model_config.py
index bb625d2..3ab2856 100644
index bb625d2..7204adb 100644
--- a/berkeley-function-call-leaderboard/bfcl_eval/constants/model_config.py
+++ b/berkeley-function-call-leaderboard/bfcl_eval/constants/model_config.py
@@ -2153,6 +2153,42 @@ third_party_inference_model_map = {
@@ -2153,6 +2153,30 @@ third_party_inference_model_map = {
is_fc_model=True,
underscore_to_dot=True,
),
Expand All @@ -29,18 +89,6 @@ index bb625d2..3ab2856 100644
+ output_price=None,
+ is_fc_model=True,
+ underscore_to_dot=True,
+ ),
+ "ovms-model-responses": ModelConfig(
+ model_name="ovms-model-responses",
+ display_name="ovms-model-responses",
+ url="http://localhost:8000/v3",
+ org="ovms",
+ license="apache-2.0",
+ model_handler=OpenAIResponsesHandler,
+ input_price=None,
+ output_price=None,
+ is_fc_model=True,
+ underscore_to_dot=True,
+ ),
}

Expand Down Expand Up @@ -72,50 +120,18 @@ index 357584f..e45e12c 100644
"store": False,
}

diff --git a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/openai_response.py b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/openai_response.py
index 0953fdd..fffcc6c 100644
--- a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/openai_response.py
+++ b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/openai_response.py
@@ -38,10 +38,10 @@ class OpenAIResponsesHandler(BaseHandler):

kwargs = {}

- if api_key := os.getenv("OPENAI_API_KEY"):
+ if api_key := os.getenv("OPENAI_API_KEY","unused"):
kwargs["api_key"] = api_key

- if base_url := os.getenv("OPENAI_BASE_URL"):
+ if base_url := os.getenv("OPENAI_BASE_URL","http://localhost:8000/v3"):
kwargs["base_url"] = base_url

if headers_env := os.getenv("OPENAI_DEFAULT_HEADERS"):
@@ -103,6 +103,9 @@ class OpenAIResponsesHandler(BaseHandler):
"include": ["reasoning.encrypted_content"],
"reasoning": {"summary": "auto"},
"temperature": self.temperature,
+ "max_output_tokens": 2048,
+ "tool_choice": os.getenv("TOOL_CHOICE", "auto"),
+ "extra_body": {"chat_template_kwargs": json.loads(os.getenv("CHAT_TEMPLATE_KWARGS", "{}"))},
}

# OpenAI reasoning models don't support temperature parameter
@@ -222,6 +225,7 @@ class OpenAIResponsesHandler(BaseHandler):
"include": ["reasoning.encrypted_content"],
"reasoning": {"summary": "auto"},
"temperature": self.temperature,
+ "extra_body": {"chat_template_kwargs": json.loads(os.getenv("CHAT_TEMPLATE_KWARGS", "{}"))},
}

# OpenAI reasoning models don't support temperature parameter
diff --git a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/qwen.py b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/qwen.py
index 10f1a08..b67d39c 100644
index 10f1a08..50890c7 100644
--- a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/qwen.py
+++ b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/qwen.py
@@ -1,3 +1,4 @@
@@ -7,6 +7,7 @@ from openai import OpenAI
from overrides import override
from qwen_agent.llm import get_chat_model
import time
+import json
import os
from typing import Any

class QwenAPIHandler(OpenAICompletionsHandler):
"""
@@ -28,8 +29,8 @@ class QwenAPIHandler(OpenAICompletionsHandler):
super().__init__(model_name, temperature, registry_name, is_fc_model, **kwargs)
self.model_style = ModelStyle.OPENAI_COMPLETIONS
Expand All @@ -127,7 +143,7 @@ index 10f1a08..b67d39c 100644
)

#### FC methods ####
@@ -45,9 +46,10 @@ class QwenAPIHandler(OpenAICompletionsHandler):
@@ -45,9 +46,9 @@ class QwenAPIHandler(OpenAICompletionsHandler):
model=self.model_name.replace("-FC", ""),
tools=tools,
parallel_tool_calls=True,
Expand All @@ -137,14 +153,214 @@ index 10f1a08..b67d39c 100644
+ max_completion_tokens=2048,
+ tool_choice=os.getenv("TOOL_CHOICE", "auto"),
+ extra_body={"chat_template_kwargs": json.loads(os.getenv("CHAT_TEMPLATE_KWARGS", "{}"))},
+ temperature=self.temperature,
stream=True,
stream_options={
"include_usage": True
@@ -352,4 +354,4 @@ class QwenAgentNoThinkHandler(QwenAgentThinkHandler):
@@ -352,4 +353,4 @@ class QwenAgentNoThinkHandler(QwenAgentThinkHandler):
'timeout': 1000,
'max_tokens': 16384
}
- })
\ No newline at end of file
+ })
diff --git a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/base_handler.py b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/base_handler.py
index a1025e9..fed8c99 100644
--- a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/base_handler.py
+++ b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/base_handler.py
@@ -70,6 +70,7 @@ class BaseHandler:
test_entry: dict,
include_input_log: bool,
exclude_state_log: bool,
+ include_verbose_log: bool = False,
):
# This method is used to retrive model response for each model.

@@ -78,18 +79,18 @@ class BaseHandler:
if "FC" in self.registry_name or self.is_fc_model:
if contain_multi_turn_interaction(test_entry["id"]):
return self.inference_multi_turn_FC(
- test_entry, include_input_log, exclude_state_log
+ test_entry, include_input_log, exclude_state_log, include_verbose_log
)
else:
- return self.inference_single_turn_FC(test_entry, include_input_log)
+ return self.inference_single_turn_FC(test_entry, include_input_log, include_verbose_log)
# Prompting model
else:
if contain_multi_turn_interaction(test_entry["id"]):
return self.inference_multi_turn_prompting(
- test_entry, include_input_log, exclude_state_log
+ test_entry, include_input_log, exclude_state_log, include_verbose_log
)
else:
- return self.inference_single_turn_prompting(test_entry, include_input_log)
+ return self.inference_single_turn_prompting(test_entry, include_input_log, include_verbose_log)

@final
def inference_multi_turn_FC(
@@ -97,6 +98,7 @@ class BaseHandler:
test_entry: dict,
include_input_log: bool,
exclude_state_log: bool,
+ include_verbose_log: bool = False,
) -> tuple[list[list], dict]:
initial_config: dict = test_entry.get("initial_config", {})
involved_classes: list = test_entry["involved_classes"]
@@ -119,6 +121,7 @@ class BaseHandler:
force_quit = False # Whether the model has been forced to quit. If True, this whole entry will be failed.

all_reasoning_content: list[list] = []
+ all_verbose_log: list[list[dict]] = []

# Execute no function call, but just to get a reference to all the instances to get the initial state for logging purpose
_, involved_instances = execute_multi_turn_func_call(
@@ -206,6 +209,7 @@ class BaseHandler:
current_turn_output_token_count: list[float] = []
current_turn_latency: list[float] = []
current_turn_reasoning_content = []
+ current_turn_verbose_log: list[dict] = []

count = 0
while True:
@@ -219,6 +223,11 @@ class BaseHandler:

api_response, query_latency = self._query_FC(inference_data)

+ if include_verbose_log and hasattr(api_response, "model_extra") and api_response.model_extra:
+ verbose_data = api_response.model_extra.get("__verbose")
+ if verbose_data:
+ current_turn_verbose_log.append({k: verbose_data[k] for k in ("prompt", "content") if k in verbose_data})
+
Comment on lines +229 to +233
# This part of logging is disabled by default because it is too verbose and will make the result file extremely large
# It is only useful to see if the inference pipeline is working as expected (eg, does it convert all the inputs correctly)
if include_input_log:
@@ -335,6 +344,7 @@ class BaseHandler:
all_model_response.append(current_turn_response)
all_inference_log.append(current_turn_inference_log)
all_reasoning_content.append(current_turn_reasoning_content)
+ all_verbose_log.append(current_turn_verbose_log)
total_input_token_count.append(current_turn_input_token_count)
total_output_token_count.append(current_turn_output_token_count)
total_latency.append(current_turn_latency)
@@ -388,6 +398,9 @@ class BaseHandler:
):
metadata["reasoning_content"] = all_reasoning_content

+ if include_verbose_log and any(turn_log for turn_log in all_verbose_log):
+ metadata["__verbose"] = all_verbose_log
+
return all_model_response, metadata

@final
@@ -396,6 +409,7 @@ class BaseHandler:
test_entry: dict,
include_input_log: bool,
exclude_state_log: bool,
+ include_verbose_log: bool = False,
) -> tuple[list[list], dict]:
initial_config: dict = test_entry.get("initial_config", {})
involved_classes: list = test_entry["involved_classes"]
@@ -415,6 +429,7 @@ class BaseHandler:
all_reasoning_content: list[list] = []
# The debugging log for human to understand
all_inference_log: list[list[dict]] = []
+ all_verbose_log: list[list[dict]] = []
force_quit = False # Whether the model has been forced to quit. If True, this whole entry will be failed.

# Execute no function call, but just to get a reference to all the instances to get the initial state for logging purpose
@@ -498,6 +513,7 @@ class BaseHandler:
current_turn_input_token_count: list[float] = []
current_turn_output_token_count: list[float] = []
current_turn_latency: list[float] = []
+ current_turn_verbose_log: list[dict] = []

count = 0
while True:
@@ -511,6 +527,11 @@ class BaseHandler:

api_response, query_latency = self._query_prompting(inference_data)

+ if include_verbose_log and hasattr(api_response, "model_extra") and api_response.model_extra:
+ verbose_data = api_response.model_extra.get("__verbose")
+ if verbose_data:
+ current_turn_verbose_log.append({k: verbose_data[k] for k in ("prompt", "content") if k in verbose_data})
+
Comment on lines +283 to +287
# This part of logging is disabled by default because it is too verbose and will make the result file extremely large
# It is only useful to see if the inference pipeline is working as expected (eg, does it convert all the inputs correctly)
if include_input_log:
@@ -626,6 +647,7 @@ class BaseHandler:
all_model_response.append(current_turn_response)
all_reasoning_content.append(current_turn_reasoning_content)
all_inference_log.append(current_turn_inference_log)
+ all_verbose_log.append(current_turn_verbose_log)
total_input_token_count.append(current_turn_input_token_count)
total_output_token_count.append(current_turn_output_token_count)
total_latency.append(current_turn_latency)
@@ -679,11 +701,14 @@ class BaseHandler:
):
metadata["reasoning_content"] = all_reasoning_content

+ if include_verbose_log and any(turn_log for turn_log in all_verbose_log):
+ metadata["__verbose"] = all_verbose_log
+
return all_model_response, metadata

@final
def inference_single_turn_FC(
- self, test_entry: dict, include_input_log: bool
+ self, test_entry: dict, include_input_log: bool, include_verbose_log: bool = False
) -> tuple[any, dict]:
inference_data: dict = {}
inference_data = self._pre_query_processing_FC(inference_data, test_entry)
@@ -716,11 +741,16 @@ class BaseHandler:
):
metadata["reasoning_content"] = model_response_data["reasoning_content"]

+ if include_verbose_log and hasattr(api_response, "model_extra") and api_response.model_extra:
+ verbose_data = api_response.model_extra.get("__verbose")
+ if verbose_data:
+ metadata["__verbose"] = {k: verbose_data[k] for k in ("prompt", "content") if k in verbose_data}
Comment on lines +319 to +322
+
return model_response_data["model_responses"], metadata

@final
def inference_single_turn_prompting(
- self, test_entry: dict, include_input_log: bool
+ self, test_entry: dict, include_input_log: bool, include_verbose_log: bool = False
) -> tuple[any, dict]:
inference_data: dict = self._pre_query_processing_prompting(test_entry)
inference_data = self.add_first_turn_message_prompting(
@@ -751,6 +781,11 @@ class BaseHandler:
):
metadata["reasoning_content"] = model_response_data["reasoning_content"]

+ if include_verbose_log and hasattr(api_response, "model_extra") and api_response.model_extra:
+ verbose_data = api_response.model_extra.get("__verbose")
+ if verbose_data:
+ metadata["__verbose"] = {k: verbose_data[k] for k in ("prompt", "content") if k in verbose_data}
Comment on lines +337 to +340
+
return model_response_data["model_responses"], metadata

def decode_ast(self, result, language: ReturnFormat, has_tool_call_tag: bool):
diff --git a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/local_inference/base_oss_handler.py b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/local_inference/base_oss_handler.py
index 961d9bf..6b6504c 100644
--- a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/local_inference/base_oss_handler.py
+++ b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/local_inference/base_oss_handler.py
@@ -51,14 +51,15 @@ class OSSHandler(BaseHandler, EnforceOverrides):
test_entry: dict,
include_input_log: bool,
exclude_state_log: bool,
+ include_verbose_log: bool = False,
):
# TODO: Let oss model support FC methods as well, depends on their model type
if contain_multi_turn_interaction(test_entry["id"]):
return self.inference_multi_turn_prompting(
- test_entry, include_input_log, exclude_state_log
+ test_entry, include_input_log, exclude_state_log, include_verbose_log
)
else:
- return self.inference_single_turn_prompting(test_entry, include_input_log)
+ return self.inference_single_turn_prompting(test_entry, include_input_log, include_verbose_log)

@override
def decode_ast(self, result, language, has_tool_call_tag):