167 changes: 167 additions & 0 deletions tests/unit/vertexai/genai/test_evals.py
@@ -5856,3 +5856,170 @@ async def test_async_generate_user_scenarios(self):
assert len(eval_dataset.eval_dataset_df) == 2

self.mock_api_client.async_request.assert_called_once()


class TestCreateEvaluationSetFromDataFrame:
"""Unit tests for the _create_evaluation_set_from_dataframe function."""

def setup_method(self):
self.mock_api_client = mock.Mock(spec=client.Client)
self.mock_api_client.project = "test-project"
self.mock_api_client.location = "us-central1"

@mock.patch.object(_evals_common, "evals")
@mock.patch.object(_evals_common, "_gcs_utils")
def test_create_evaluation_set_with_intermediate_events(
self, mock_gcs_utils, mock_evals_module
):
intermediate_events = [
{
"content": {"parts": [{"text": "thought 1"}]},
"timestamp": "2024-01-01T00:00:00Z",
},
{
"content": {"parts": [{"functionCall": {"name": "foo"}}]},
"timestamp": "2024-01-01T00:00:01Z",
},
]

eval_df = pd.DataFrame(
[
{
"prompt": "test prompt",
"response": "test response",
"intermediate_events": intermediate_events,
}
]
)

mock_gcs_instance = mock_gcs_utils.GcsUtils.return_value
mock_gcs_instance.upload_json_to_prefix.return_value = (
"gs://bucket/path/request.json"
)

mock_evals_instance = mock_evals_module.Evals.return_value
mock_eval_item = mock.Mock()
mock_eval_item.name = "eval_item_1"
mock_evals_instance.create_evaluation_item.return_value = mock_eval_item

mock_eval_set = mock.Mock()
mock_evals_instance.create_evaluation_set.return_value = mock_eval_set

result = _evals_common._create_evaluation_set_from_dataframe(
api_client=self.mock_api_client,
gcs_dest_prefix="gs://bucket/prefix",
eval_df=eval_df,
candidate_name="test-candidate",
)

assert result == mock_eval_set

mock_gcs_instance.upload_json_to_prefix.assert_called_once()
call_args = mock_gcs_instance.upload_json_to_prefix.call_args
uploaded_data = call_args.kwargs["data"]

candidate_responses = uploaded_data["candidate_responses"]
assert len(candidate_responses) == 1
candidate_response = candidate_responses[0]
assert candidate_response["candidate"] == "test-candidate"
assert candidate_response["text"] == "test response"

expected_events = [
{"parts": [{"text": "thought 1"}]},
{"parts": [{"function_call": {"name": "foo"}}]},
]
assert candidate_response["events"] == expected_events

@mock.patch.object(_evals_common, "evals")
@mock.patch.object(_evals_common, "_gcs_utils")
def test_create_evaluation_set_with_user_scenario(
self, mock_gcs_utils, mock_evals_module
):
eval_df = pd.DataFrame(
[
{
"starting_prompt": "test starting prompt",
"conversation_plan": "test conversation plan",
}
]
)

mock_gcs_instance = mock_gcs_utils.GcsUtils.return_value
mock_gcs_instance.upload_json_to_prefix.return_value = (
"gs://bucket/path/request.json"
)

mock_evals_instance = mock_evals_module.Evals.return_value
mock_eval_item = mock.Mock()
mock_eval_item.name = "eval_item_1"
mock_evals_instance.create_evaluation_item.return_value = mock_eval_item

mock_eval_set = mock.Mock()
mock_evals_instance.create_evaluation_set.return_value = mock_eval_set

result = _evals_common._create_evaluation_set_from_dataframe(
api_client=self.mock_api_client,
gcs_dest_prefix="gs://bucket/prefix",
eval_df=eval_df,
candidate_name="test-candidate",
)

assert result == mock_eval_set

mock_gcs_instance.upload_json_to_prefix.assert_called_once()
call_args = mock_gcs_instance.upload_json_to_prefix.call_args
uploaded_data = call_args.kwargs["data"]

assert uploaded_data.get("candidate_responses") is None
assert uploaded_data["prompt"]["user_scenario"] == {
"starting_prompt": "test starting prompt",
"conversation_plan": "test conversation plan",
}

@mock.patch.object(_evals_common, "evals")
@mock.patch.object(_evals_common, "_gcs_utils")
def test_create_evaluation_set_with_agent_data(
self, mock_gcs_utils, mock_evals_module
):
agent_data = {"turns": [{"turn_id": "turn1", "events": []}]}
eval_df = pd.DataFrame(
[
{
"prompt": "test prompt",
"agent_data": agent_data,
}
]
)

mock_gcs_instance = mock_gcs_utils.GcsUtils.return_value
mock_gcs_instance.upload_json_to_prefix.return_value = (
"gs://bucket/path/request.json"
)

mock_evals_instance = mock_evals_module.Evals.return_value
mock_eval_item = mock.Mock()
mock_eval_item.name = "eval_item_1"
mock_evals_instance.create_evaluation_item.return_value = mock_eval_item

mock_eval_set = mock.Mock()
mock_evals_instance.create_evaluation_set.return_value = mock_eval_set

result = _evals_common._create_evaluation_set_from_dataframe(
api_client=self.mock_api_client,
gcs_dest_prefix="gs://bucket/prefix",
eval_df=eval_df,
candidate_name="test-candidate",
)

assert result == mock_eval_set

mock_gcs_instance.upload_json_to_prefix.assert_called_once()
call_args = mock_gcs_instance.upload_json_to_prefix.call_args
uploaded_data = call_args.kwargs["data"]

assert uploaded_data["prompt"]["text"] == "test prompt"
candidate_responses = uploaded_data["candidate_responses"]
assert len(candidate_responses) == 1
candidate_response = candidate_responses[0]
assert candidate_response["candidate"] == "test-candidate"
assert candidate_response["agent_data"] == agent_data
58 changes: 47 additions & 11 deletions vertexai/_genai/_evals_common.py
@@ -35,6 +35,7 @@
from google.genai.models import Models
import pandas as pd
from tqdm import tqdm
from pydantic import ValidationError

from . import _evals_constant
from . import _evals_data_converters
@@ -2254,7 +2255,48 @@ def _create_evaluation_set_from_dataframe(
for event in row[_evals_constant.INTERMEDIATE_EVENTS]:
if CONTENT in event:
intermediate_events.append(event[CONTENT])
if _evals_constant.CONTEXT in row or _evals_constant.HISTORY in row:

agent_data_obj = None
if _evals_constant.AGENT_DATA in row:
agent_data_val = row[_evals_constant.AGENT_DATA]
if isinstance(agent_data_val, str):
try:
agent_data_val = json.loads(agent_data_val)
except json.JSONDecodeError:
pass
if isinstance(agent_data_val, dict):
try:
agent_data_obj = types.evals.AgentData.model_validate(
agent_data_val
)
except ValidationError:
pass
elif isinstance(agent_data_val, types.evals.AgentData):
agent_data_obj = agent_data_val

candidate_responses = []
if _evals_constant.RESPONSE in row or agent_data_obj or intermediate_events:
candidate_responses.append(
types.CandidateResponse(
candidate=candidate_name or "Candidate 1",
text=row.get(_evals_constant.RESPONSE) or None,
events=intermediate_events or None,
agent_data=agent_data_obj,
)
)

prompt = None
if (
_evals_constant.STARTING_PROMPT in row
and _evals_constant.CONVERSATION_PLAN in row
):
prompt = types.EvaluationPrompt(
user_scenario=types.evals.UserScenario(
starting_prompt=row[_evals_constant.STARTING_PROMPT],
conversation_plan=row[_evals_constant.CONVERSATION_PLAN],
)
)
elif _evals_constant.CONTEXT in row or _evals_constant.HISTORY in row:
values = {}
if _evals_constant.CONTEXT in row:
values[_evals_constant.CONTEXT] = _get_content(
@@ -2273,15 +2315,7 @@ def _create_evaluation_set_from_dataframe(
)
elif _evals_constant.PROMPT in row:
prompt = types.EvaluationPrompt(text=row[_evals_constant.PROMPT])
candidate_responses = []
if _evals_constant.RESPONSE in row:
candidate_responses.append(
types.CandidateResponse(
candidate=candidate_name or "Candidate 1",
text=row[_evals_constant.RESPONSE],
events=intermediate_events or None,
)
)

eval_item_requests.append(
types.EvaluationItemRequest(
prompt=prompt or None,
@@ -2290,7 +2324,9 @@ def _create_evaluation_set_from_dataframe(
if _evals_constant.REFERENCE in row
else None
),
candidate_responses=candidate_responses,
candidate_responses=(
candidate_responses if candidate_responses else None
),
)
)
logger.info("Writing evaluation item requests to GCS.")
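
To illustrate the branches added above, a minimal sketch of the three single-row DataFrame shapes that _create_evaluation_set_from_dataframe now distinguishes. Column values, the GCS prefix, and the candidate name are placeholders, not values taken from this change.

import pandas as pd

# 1. Plain prompt/response, optionally with intermediate agent events; these map
#    to EvaluationPrompt.text plus a CandidateResponse with text and events.
prompt_response_df = pd.DataFrame(
    [
        {
            "prompt": "What is the weather?",
            "response": "It is sunny.",
            "intermediate_events": [
                {"content": {"parts": [{"text": "checking the weather tool"}]}}
            ],
        }
    ]
)

# 2. User scenario: starting_prompt + conversation_plan map to
#    EvaluationPrompt.user_scenario, and no candidate_responses are emitted.
user_scenario_df = pd.DataFrame(
    [
        {
            "starting_prompt": "Book me a flight.",
            "conversation_plan": "The user negotiates dates, then confirms.",
        }
    ]
)

# 3. Agent trace: agent_data (a dict, a JSON string, or a types.evals.AgentData)
#    is attached to the CandidateResponse for direct evaluation.
agent_trace_df = pd.DataFrame(
    [
        {
            "prompt": "Plan a trip.",
            "agent_data": {"turns": [{"turn_id": "turn1", "events": []}]},
        }
    ]
)

# Illustrative call (api_client is assumed to be an initialized client.Client):
# evaluation_set = _evals_common._create_evaluation_set_from_dataframe(
#     api_client=api_client,
#     gcs_dest_prefix="gs://my-bucket/evals",
#     eval_df=agent_trace_df,
#     candidate_name="my-candidate",
# )
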
2 changes: 2 additions & 0 deletions vertexai/_genai/_evals_constant.py
@@ -57,6 +57,8 @@
PARTS = "parts"
USER_AUTHOR = "user"
AGENT_DATA = "agent_data"
STARTING_PROMPT = "starting_prompt"
CONVERSATION_PLAN = "conversation_plan"
HISTORY = "conversation_history"

COMMON_DATASET_COLUMNS = frozenset(
6 changes: 6 additions & 0 deletions vertexai/_genai/types/__init__.py
@@ -130,6 +130,9 @@
from .common import AgentEngineSessionOperation
from .common import AgentEngineSessionOperationDict
from .common import AgentEngineSessionOperationOrDict
from .common import AgentRunConfig
from .common import AgentRunConfigDict
from .common import AgentRunConfigOrDict
from .common import AgentServerMode
from .common import AggregatedMetricResult
from .common import AggregatedMetricResultDict
@@ -1391,6 +1394,9 @@
"EvaluationRunAgentConfig",
"EvaluationRunAgentConfigDict",
"EvaluationRunAgentConfigOrDict",
"AgentRunConfig",
"AgentRunConfigDict",
"AgentRunConfigOrDict",
"EvaluationRunInferenceConfig",
"EvaluationRunInferenceConfigDict",
"EvaluationRunInferenceConfigOrDict",
60 changes: 52 additions & 8 deletions vertexai/_genai/types/common.py
@@ -1914,6 +1914,13 @@ class CandidateResponse(_common.BaseModel):
default=None,
description="""Intermediate events (such as tool calls and responses) that led to the final response.""",
)
agent_data: Optional[evals_types.AgentData] = Field(
default=None,
description="""Represents the complete execution trace of an anget conversation,
which can involve single or multiple agents. This field is used to
provide the full output of an agent's run, including all turns and
events, for direct evaluation.""",
)


class CandidateResponseDict(TypedDict, total=False):
@@ -1931,6 +1938,12 @@ class CandidateResponseDict(TypedDict, total=False):
events: Optional[list[genai_types.ContentDict]]
"""Intermediate events (such as tool calls and responses) that led to the final response."""

agent_data: Optional[evals_types.AgentData]
"""Represents the complete execution trace of an anget conversation,
which can involve single or multiple agents. This field is used to
provide the full output of an agent's run, including all turns and
events, for direct evaluation."""


CandidateResponseOrDict = Union[CandidateResponse, CandidateResponseDict]

@@ -3271,6 +3284,41 @@ class EvaluationRunAgentConfigDict(TypedDict, total=False):
]


class AgentRunConfig(_common.BaseModel):
"""Configuration for an Agent Run."""

session_input: Optional[evals_types.SessionInput] = Field(
default=None, description="""The session input to get agent running results."""
)
agent_engine: Optional[str] = Field(
default=None, description="""The resource name of the Agent Engine."""
)
user_simulator_config: Optional[evals_types.UserSimulatorConfig] = Field(
default=None,
description="""Used for multi-turn agent run.
Contains configuration for a user simulator that
uses an LLM to generate messages on behalf of the user.""",
)


class AgentRunConfigDict(TypedDict, total=False):
"""Configuration for an Agent Run."""

session_input: Optional[evals_types.SessionInput]
"""The session input to get agent running results."""

agent_engine: Optional[str]
"""The resource name of the Agent Engine."""

user_simulator_config: Optional[evals_types.UserSimulatorConfig]
"""Used for multi-turn agent run.
Contains configuration for a user simulator that
uses an LLM to generate messages on behalf of the user."""


AgentRunConfigOrDict = Union[AgentRunConfig, AgentRunConfigDict]


class EvaluationRunInferenceConfig(_common.BaseModel):
"""This field is experimental and may change in future versions.

@@ -3287,11 +3335,9 @@ class EvaluationRunInferenceConfig(_common.BaseModel):
prompt_template: Optional[EvaluationRunPromptTemplate] = Field(
default=None, description="""The prompt template used for inference."""
)
user_simulator_config: Optional[evals_types.UserSimulatorConfig] = Field(
agent_run_config: Optional[AgentRunConfig] = Field(
default=None,
description="""Used for multi-turn agent run.
Contains configuration for a user simulator that
uses an LLM to generate messages on behalf of the user.""",
description="""Configuration for Agent Run in evaluation management service.""",
)


@@ -3310,10 +3356,8 @@ class EvaluationRunInferenceConfigDict(TypedDict, total=False):
prompt_template: Optional[EvaluationRunPromptTemplateDict]
"""The prompt template used for inference."""

user_simulator_config: Optional[evals_types.UserSimulatorConfig]
"""Used for multi-turn agent run.
Contains configuration for a user simulator that
uses an LLM to generate messages on behalf of the user."""
agent_run_config: Optional[AgentRunConfigDict]
"""Configuration for Agent Run in evaluation management service."""


EvaluationRunInferenceConfigOrDict = Union[
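
Assuming the vertexai._genai.types exports added in __init__.py above, a minimal sketch of how the new inference configuration fits together; the agent engine resource name is a placeholder, and user_simulator_config now lives on AgentRunConfig rather than directly on EvaluationRunInferenceConfig.

from vertexai._genai import types

# Placeholder resource name and settings; not values defined by this change.
agent_run_config = types.AgentRunConfig(
    agent_engine="projects/my-project/locations/us-central1/reasoningEngines/123",
    # user_simulator_config=types.evals.UserSimulatorConfig(...),  # for multi-turn runs
)

inference_config = types.EvaluationRunInferenceConfig(
    agent_run_config=agent_run_config,
)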