diff --git a/tests/unit/vertexai/genai/replays/test_create_evaluation_run.py b/tests/unit/vertexai/genai/replays/test_create_evaluation_run.py index 76b7f36f74..ec9fb5865b 100644 --- a/tests/unit/vertexai/genai/replays/test_create_evaluation_run.py +++ b/tests/unit/vertexai/genai/replays/test_create_evaluation_run.py @@ -79,11 +79,13 @@ ) ] ) -AGENT_INFO = types.evals.AgentInfo( - agent_resource_name="projects/123/locations/us-central1/reasoningEngines/456", - name="agent-1", +AGENT_CONFIG = types.evals.AgentConfig( + agent_id="agent-1", + agent_resource_name=( + "projects/123/locations/us-central1/reasoningEngines/456" + ), instruction="agent-1 instruction", - tool_declarations=[TOOL], + tools=[TOOL], ) DEFAULT_PROMPT_TEMPLATE = "{prompt}" INPUT_DF_WITH_CONTEXT_AND_HISTORY = pd.DataFrame( @@ -103,55 +105,55 @@ def test_create_eval_run_data_source_evaluation_set(client): - """Tests that create_evaluation_run() creates a correctly structured EvaluationRun.""" - client._api_client._http_options.api_version = "v1beta1" - evaluation_run = client.evals.create_evaluation_run( - name="test4", - display_name="test4", - dataset=types.EvaluationRunDataSource(evaluation_set=EVAL_SET_NAME), - dest=GCS_DEST, - metrics=[ - GENERAL_QUALITY_METRIC, - types.RubricMetric.FINAL_RESPONSE_QUALITY, - LLM_METRIC, - EXACT_MATCH_COMPUTATION_BASED_METRIC, - BLEU_COMPUTATION_BASED_METRIC, - ], - agent_info=AGENT_INFO, - labels={"label1": "value1"}, - ) - assert isinstance(evaluation_run, types.EvaluationRun) - assert evaluation_run.display_name == "test4" - assert evaluation_run.state == types.EvaluationRunState.PENDING - assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource) - assert evaluation_run.data_source.evaluation_set == EVAL_SET_NAME - assert evaluation_run.evaluation_config == types.EvaluationRunConfig( - output_config=genai_types.OutputConfig( - gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST) - ), - metrics=[ - GENERAL_QUALITY_METRIC, - FINAL_RESPONSE_QUALITY_METRIC, - LLM_METRIC, - EXACT_MATCH_COMPUTATION_BASED_METRIC, - BLEU_COMPUTATION_BASED_METRIC, - ], - ) - assert evaluation_run.inference_configs[ - AGENT_INFO.name - ] == types.EvaluationRunInferenceConfig( - agent_config=types.EvaluationRunAgentConfig( - developer_instruction=genai_types.Content( - parts=[genai_types.Part(text="agent-1 instruction")] - ), - tools=[TOOL], - ) - ) - assert evaluation_run.labels == { - "vertex-ai-evaluation-agent-engine-id": "456", - "label1": "value1", - } - assert evaluation_run.error is None + """Tests that create_evaluation_run() creates a correctly structured EvaluationRun.""" + client._api_client._http_options.api_version = "v1beta1" + evaluation_run = client.evals.create_evaluation_run( + name="test4", + display_name="test4", + dataset=types.EvaluationRunDataSource(evaluation_set=EVAL_SET_NAME), + dest=GCS_DEST, + metrics=[ + GENERAL_QUALITY_METRIC, + types.RubricMetric.FINAL_RESPONSE_QUALITY, + LLM_METRIC, + EXACT_MATCH_COMPUTATION_BASED_METRIC, + BLEU_COMPUTATION_BASED_METRIC, + ], + agent_definitions={"agent-1": AGENT_CONFIG}, + labels={"label1": "value1"}, + ) + assert isinstance(evaluation_run, types.EvaluationRun) + assert evaluation_run.display_name == "test4" + assert evaluation_run.state == types.EvaluationRunState.PENDING + assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource) + assert evaluation_run.data_source.evaluation_set == EVAL_SET_NAME + assert evaluation_run.evaluation_config == types.EvaluationRunConfig( + output_config=genai_types.OutputConfig( + gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST) + ), + metrics=[ + GENERAL_QUALITY_METRIC, + FINAL_RESPONSE_QUALITY_METRIC, + LLM_METRIC, + EXACT_MATCH_COMPUTATION_BASED_METRIC, + BLEU_COMPUTATION_BASED_METRIC, + ], + ) + assert evaluation_run.inference_configs[ + "agent-1" + ] == types.EvaluationRunInferenceConfig( + agent_config=types.EvaluationRunAgentConfig( + developer_instruction=genai_types.Content( + parts=[genai_types.Part(text="agent-1 instruction")] + ), + tools=[TOOL], + ) + ) + assert evaluation_run.labels == { + "vertex-ai-evaluation-agent-engine-id": "456", + "label1": "value1", + } + assert evaluation_run.error is None def test_create_eval_run_data_source_bigquery_request_set(client): @@ -203,15 +205,15 @@ def test_create_eval_run_data_source_bigquery_request_set(client): def test_create_eval_run_with_inference_configs(client): - """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with inference_configs.""" - client._api_client._http_options.api_version = "v1beta1" - inference_config = types.EvaluationRunInferenceConfig( + """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with inference_configs.""" + client._api_client._http_options.api_version = "v1beta1" + inference_config = types.EvaluationRunInferenceConfig( model=MODEL_NAME, prompt_template=types.EvaluationRunPromptTemplate( prompt_template="test prompt template" ), ) - evaluation_run = client.evals.create_evaluation_run( + evaluation_run = client.evals.create_evaluation_run( name="test_inference_config", display_name="test_inference_config", dataset=types.EvaluationRunDataSource(evaluation_set=EVAL_SET_NAME), @@ -220,22 +222,22 @@ def test_create_eval_run_with_inference_configs(client): inference_configs={"model_1": inference_config}, labels={"label1": "value1"}, ) - assert isinstance(evaluation_run, types.EvaluationRun) - assert evaluation_run.display_name == "test_inference_config" - assert evaluation_run.state == types.EvaluationRunState.PENDING - assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource) - assert evaluation_run.data_source.evaluation_set == EVAL_SET_NAME - assert evaluation_run.evaluation_config == types.EvaluationRunConfig( + assert isinstance(evaluation_run, types.EvaluationRun) + assert evaluation_run.display_name == "test_inference_config" + assert evaluation_run.state == types.EvaluationRunState.PENDING + assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource) + assert evaluation_run.data_source.evaluation_set == EVAL_SET_NAME + assert evaluation_run.evaluation_config == types.EvaluationRunConfig( output_config=genai_types.OutputConfig( gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST) ), metrics=[GENERAL_QUALITY_METRIC], ) - assert evaluation_run.inference_configs["model_1"] == inference_config - assert evaluation_run.labels == { + assert evaluation_run.inference_configs["model_1"] == inference_config + assert evaluation_run.labels == { "label1": "value1", } - assert evaluation_run.error is None + assert evaluation_run.error is None # Dataframe tests fail in replay mode because of UUID generation mismatch. @@ -533,7 +535,7 @@ def test_create_eval_run_with_inference_configs(client): # ] == types.EvaluationRunInferenceConfig( # agent_config=types.EvaluationRunAgentConfig( # developer_instruction=genai_types.Content( -# parts=[genai_types.Part(text=AGENT_INFO.instruction)] +# parts=[genai_types.Part(text=AGENT_CONFIG.instruction)] # ), # tools=[TOOL], # ), diff --git a/vertexai/_genai/_evals_common.py b/vertexai/_genai/_evals_common.py index 97e56c9e19..2fed4e8230 100644 --- a/vertexai/_genai/_evals_common.py +++ b/vertexai/_genai/_evals_common.py @@ -282,11 +282,11 @@ def _resolve_dataset( api_client: BaseApiClient, dataset: Union[types.EvaluationRunDataSource, types.EvaluationDataset], dest: str, - agent_info_pydantic: Optional[types.evals.AgentInfo] = None, + agent_configs: Optional[dict[str, types.evals.AgentConfig]] = None, ) -> types.EvaluationRunDataSource: """Resolves dataset for the evaluation run.""" if isinstance(dataset, types.EvaluationDataset): - candidate_name = _get_candidate_name(dataset, agent_info_pydantic) + candidate_name = _get_candidate_name(dataset, agent_configs) eval_set = _create_evaluation_set_from_dataframe( api_client, dest, @@ -338,22 +338,9 @@ def _resolve_inference_configs( inference_configs: Optional[ dict[str, types.EvaluationRunInferenceConfigOrDict] ] = None, - agent_info_pydantic: Optional[types.evals.AgentInfo] = None, + agent_configs: Optional[dict[str, types.evals.AgentConfig]] = None, ) -> Optional[dict[str, types.EvaluationRunInferenceConfigOrDict]]: """Resolves inference configs for the evaluation run.""" - # Resolve agent config - if agent_info_pydantic and agent_info_pydantic.name: - inference_configs = {} - inference_configs[agent_info_pydantic.name] = ( - types.EvaluationRunInferenceConfig( - agent_config=types.EvaluationRunAgentConfig( - developer_instruction=genai_types.Content( - parts=[genai_types.Part(text=agent_info_pydantic.instruction)] - ), - tools=agent_info_pydantic.tool_declarations, - ) - ) - ) # Resolve prompt template data if inference_configs: for inference_config in inference_configs.values(): @@ -387,33 +374,32 @@ def _resolve_inference_configs( def _add_evaluation_run_labels( labels: Optional[dict[str, str]] = None, - agent_info_pydantic: Optional[types.evals.AgentInfo] = None, + agent_configs: Optional[dict[str, types.evals.AgentConfig]] = None, ) -> Optional[dict[str, str]]: """Adds labels to the evaluation run.""" - if agent_info_pydantic and agent_info_pydantic.agent_resource_name: - labels = labels or {} - labels["vertex-ai-evaluation-agent-engine-id"] = ( - agent_info_pydantic.agent_resource_name.split("reasoningEngines/")[-1] - ) + if agent_configs: + for config in agent_configs.values(): + if config.agent_resource_name: + labels = labels or {} + labels["vertex-ai-evaluation-agent-engine-id"] = ( + config.agent_resource_name.split("reasoningEngines/")[-1] + ) + break return labels def _get_candidate_name( dataset: types.EvaluationDataset, - agent_info_pydantic: Optional[types.evals.AgentInfo] = None, + agent_configs: Optional[dict[str, types.evals.AgentConfig]] = None, ) -> Optional[str]: """Internal helper to get candidate name.""" - if agent_info_pydantic is not None and ( - dataset.candidate_name - and agent_info_pydantic - and agent_info_pydantic.name - and dataset.candidate_name != agent_info_pydantic.name - ): - logger.warning( - "Evaluation dataset candidate_name and agent_info.name are different. Please make sure this is intended." - ) - elif dataset.candidate_name is None and agent_info_pydantic: - return agent_info_pydantic.name + if agent_configs and dataset.candidate_name: + if dataset.candidate_name not in agent_configs: + logger.warning( + "Evaluation dataset candidate_name is not in the provided agent definitions. Please make sure this is intended." + ) + elif not dataset.candidate_name and agent_configs: + return list(agent_configs.keys())[0] return dataset.candidate_name or None @@ -1249,7 +1235,6 @@ def _resolve_dataset_inputs( dataset: list[types.EvaluationDataset], dataset_schema: Optional[Literal["GEMINI", "FLATTEN", "OPENAI"]], loader: "_evals_utils.EvalDatasetLoader", - agent_info: Optional[types.evals.AgentInfo] = None, ) -> tuple[types.EvaluationDataset, int]: """Loads and processes single or multiple datasets for evaluation. @@ -1259,7 +1244,6 @@ def _resolve_dataset_inputs( dataset_schema: The schema to use for the dataset(s). If None, it will be auto-detected. loader: An instance of EvalDatasetLoader to load data. - agent_info: The agent info of the agent under evaluation. Returns: A tuple containing: @@ -1319,7 +1303,6 @@ def _resolve_dataset_inputs( processed_eval_dataset = _evals_data_converters.merge_evaluation_datasets( datasets=parsed_evaluation_datasets, - agent_info=agent_info, ) if not processed_eval_dataset.eval_cases: @@ -1504,24 +1487,10 @@ def _execute_evaluation( # type: ignore[no-untyped-def] loader = _evals_utils.EvalDatasetLoader(api_client=api_client) - agent_info = kwargs.get("agent_info", None) - validated_agent_info = None - if agent_info: - if isinstance(agent_info, dict): - validated_agent_info = types.evals.AgentInfo.model_validate(agent_info) - elif isinstance(agent_info, types.evals.AgentInfo): - validated_agent_info = agent_info - else: - raise TypeError( - "agent_info values must be of type types.evals.AgentInfo or dict," - f" but got {type(agent_info)}'" - ) - processed_eval_dataset, num_response_candidates = _resolve_dataset_inputs( dataset=dataset_list, dataset_schema=dataset_schema, loader=loader, - agent_info=validated_agent_info, ) resolved_metrics = _resolve_metrics(metrics, api_client) @@ -1542,7 +1511,6 @@ def _execute_evaluation( # type: ignore[no-untyped-def] logger.info("Evaluation took: %f seconds", t2 - t1) evaluation_result.evaluation_dataset = dataset_list - evaluation_result.agent_info = validated_agent_info if not evaluation_result.metadata: evaluation_result.metadata = types.EvaluationRunMetadata() @@ -1636,7 +1604,7 @@ def _run_agent_internal( # TODO: Migrate single turn agent run result to AgentData. agent_data_row = types.evals.AgentData( turns=resp_item, - agents=agent_data_agents, + agent_definitions=agent_data_agents, ).model_dump() else: @@ -2094,39 +2062,6 @@ def _get_eval_cases_eval_dfs_from_eval_items( return eval_case_results, eval_dfs -def _get_agent_info_from_inference_configs( - candidate_names: list[str], - inference_configs: Optional[dict[str, types.EvaluationRunInferenceConfig]] = None, -) -> Optional[types.evals.AgentInfo]: - """Retrieves an AgentInfo from the inference configs.""" - # TODO(lakeyk): Support multiple agents. - if not ( - inference_configs - and candidate_names - and candidate_names[0] in inference_configs - and inference_configs[candidate_names[0]].agent_config - ): - return None - if len(inference_configs.keys()) > 1: - logger.warning( - "Multiple agents are not supported yet. Displaying the first agent." - ) - agent_config = inference_configs[candidate_names[0]].agent_config - di = ( - agent_config.developer_instruction - if agent_config and agent_config.developer_instruction - else None - ) - instruction = di.parts[0].text if di and di.parts and di.parts[0].text else None - return types.evals.AgentInfo( - name=candidate_names[0], - instruction=instruction, - tool_declarations=( - agent_config.tools if agent_config and agent_config.tools else None - ), - ) - - def _get_eval_result_from_eval_items( results: types.EvaluationRunResults, eval_items: list[types.EvaluationItem], @@ -2148,6 +2083,14 @@ def _get_eval_result_from_eval_items( aggregated_metrics = _get_aggregated_metrics(results) eval_case_results, eval_dfs = _get_eval_cases_eval_dfs_from_eval_items(eval_items) candidate_names = [eval_df.candidate_name for eval_df in eval_dfs] + + agent_configs = None + if inference_configs: + for config in inference_configs.values(): + if config.agent_definitions: + agent_configs = config.agent_definitions + break + eval_result = types.EvaluationResult( summary_metrics=aggregated_metrics, eval_case_results=eval_case_results, @@ -2155,9 +2098,7 @@ def _get_eval_result_from_eval_items( metadata=types.EvaluationRunMetadata( candidate_names=candidate_names, ), - agent_info=_get_agent_info_from_inference_configs( - candidate_names, inference_configs - ), + agent_definitions=agent_configs, ) return eval_result diff --git a/vertexai/_genai/_evals_data_converters.py b/vertexai/_genai/_evals_data_converters.py index 876f9a7341..54513c2edf 100644 --- a/vertexai/_genai/_evals_data_converters.py +++ b/vertexai/_genai/_evals_data_converters.py @@ -743,7 +743,7 @@ def _validate_case_consistency( def merge_evaluation_datasets( datasets: list[types.EvaluationDataset], - agent_info: Optional[types.evals.AgentInfo] = None, + agent_info: Optional[types.evals.AgentConfig] = None, ) -> types.EvaluationDataset: """Merges multiple EvaluationDatasets into a single EvaluationDataset. @@ -858,7 +858,6 @@ def merge_evaluation_datasets( reference=base_eval_case.reference, system_instruction=base_eval_case.system_instruction, conversation_history=base_eval_case.conversation_history, - agent_info=agent_info, intermediate_events=base_eval_case.intermediate_events, **eval_case_custom_columns, ) @@ -870,7 +869,6 @@ def merge_evaluation_datasets( def merge_response_datasets_into_canonical_format( raw_datasets: list[list[dict[str, Any]]], schemas: list[str], - agent_info: Optional[types.evals.AgentInfo] = None, ) -> types.EvaluationDataset: """Merges multiple raw response datasets into a single EvaluationDataset. @@ -897,4 +895,4 @@ def merge_response_datasets_into_canonical_format( converter = get_dataset_converter(schema) parsed_evaluation_datasets.append(converter.convert(raw_ds_entry)) - return merge_evaluation_datasets(parsed_evaluation_datasets, agent_info) + return merge_evaluation_datasets(parsed_evaluation_datasets) diff --git a/vertexai/_genai/evals.py b/vertexai/_genai/evals.py index 1088df664a..ee7f7d99af 100644 --- a/vertexai/_genai/evals.py +++ b/vertexai/_genai/evals.py @@ -78,6 +78,9 @@ def _CreateEvaluationRunParameters_to_vertex( if getv(from_object, ["display_name"]) is not None: setv(to_object, ["displayName"], getv(from_object, ["display_name"])) + if getv(from_object, ["agent"]) is not None: + setv(to_object, ["agent"], getv(from_object, ["agent"])) + if getv(from_object, ["data_source"]) is not None: setv(to_object, ["dataSource"], getv(from_object, ["data_source"])) @@ -356,7 +359,7 @@ def _EvaluationRunInferenceConfig_from_vertex( setv(to_object, ["agent_run_config"], getv(from_object, ["agentRunConfig"])) if getv(from_object, ["agents"]) is not None: - setv(to_object, ["agent_configs"], getv(from_object, ["agents"])) + setv(to_object, ["agent_definitions"], getv(from_object, ["agents"])) return to_object @@ -378,8 +381,8 @@ def _EvaluationRunInferenceConfig_to_vertex( if getv(from_object, ["agent_run_config"]) is not None: setv(to_object, ["agentRunConfig"], getv(from_object, ["agent_run_config"])) - if getv(from_object, ["agent_configs"]) is not None: - setv(to_object, ["agents"], getv(from_object, ["agent_configs"])) + if getv(from_object, ["agent_definitions"]) is not None: + setv(to_object, ["agents"], getv(from_object, ["agent_definitions"])) return to_object @@ -526,8 +529,8 @@ def _GenerateUserScenariosParameters_to_vertex( if getv(from_object, ["location"]) is not None: setv(to_object, ["location"], getv(from_object, ["location"])) - if getv(from_object, ["agents"]) is not None: - setv(to_object, ["agents"], getv(from_object, ["agents"])) + if getv(from_object, ["agent_definitions"]) is not None: + setv(to_object, ["agents"], getv(from_object, ["agent_definitions"])) if getv(from_object, ["root_agent_id"]) is not None: setv(to_object, ["rootAgentId"], getv(from_object, ["root_agent_id"])) @@ -840,6 +843,7 @@ def _create_evaluation_run( *, name: Optional[str] = None, display_name: Optional[str] = None, + agent: Optional[str] = None, data_source: types.EvaluationRunDataSourceOrDict, evaluation_config: types.EvaluationRunConfigOrDict, labels: Optional[dict[str, str]] = None, @@ -855,6 +859,7 @@ def _create_evaluation_run( parameter_model = types._CreateEvaluationRunParameters( name=name, display_name=display_name, + agent=agent, data_source=data_source, evaluation_config=evaluation_config, labels=labels, @@ -1104,7 +1109,7 @@ def _generate_user_scenarios( self, *, location: Optional[str] = None, - agents: Optional[dict[str, evals_types.AgentConfigOrDict]] = None, + agent_definitions: Optional[dict[str, evals_types.AgentConfigOrDict]] = None, root_agent_id: Optional[str] = None, user_scenario_generation_config: Optional[ evals_types.UserScenarioGenerationConfigOrDict @@ -1117,7 +1122,7 @@ def _generate_user_scenarios( parameter_model = types._GenerateUserScenariosParameters( location=location, - agents=agents, + agent_definitions=agent_definitions, root_agent_id=root_agent_id, user_scenario_generation_config=user_scenario_generation_config, config=config, @@ -1794,7 +1799,11 @@ def create_evaluation_run( metrics: list[types.EvaluationRunMetricOrDict], name: Optional[str] = None, display_name: Optional[str] = None, - agent_info: Optional[evals_types.AgentInfoOrDict] = None, + agent: Optional[str] = None, + agent_definitions: Optional[dict[str, evals_types.AgentConfigOrDict]] = None, + user_simulator_config: Optional[ + Union[evals_types.UserSimulatorConfigOrDict, dict[str, Any]] + ] = None, inference_configs: Optional[ dict[str, types.EvaluationRunInferenceConfigOrDict] ] = None, @@ -1809,10 +1818,12 @@ def create_evaluation_run( metrics: The list of metrics to evaluate. name: The name of the evaluation run. display_name: The display name of the evaluation run. - agent_info: The agent info to evaluate. + agent: The agent engine used to run agent. + agent_definitions: The agent configurations to evaluate, as a dictionary mapping agent IDs to agent configurations. + user_simulator_config: The configuration for the user simulator in multi-turn agent evaluation. inference_configs: The candidate to inference config map for the evaluation run. The key is the candidate name, and the value is the inference config. - If provided, agent_info must be None. + If provided, agent and agent_definitions must be None. Example: {"candidate-1": types.EvaluationRunInferenceConfig(model="gemini-2.5-flash")} labels: The labels to apply to the evaluation run. @@ -1821,19 +1832,53 @@ def create_evaluation_run( Returns: The created evaluation run. """ - if agent_info and inference_configs: + if (agent or agent_definitions) and inference_configs: raise ValueError( - "At most one of agent_info or inference_configs can be provided." + "At most one of (agent, agent_definitions) or inference_configs can be provided." ) - agent_info_pydantic = ( - evals_types.AgentInfo.model_validate(agent_info) - if isinstance(agent_info, dict) - else (agent_info or evals_types.AgentInfo()) - ) + + agent_configs_map = None + + if agent_definitions: + agent_configs_map = { + k: ( + evals_types.AgentConfig.model_validate(v) + if isinstance(v, dict) + else v + ) + for k, v in agent_definitions.items() + } + + candidate_name = None + if agent_configs_map: + candidate_name = list(agent_configs_map.keys())[0] + + candidate_name = candidate_name or "candidate-1" + if isinstance(dataset, types.EvaluationDataset) and dataset.candidate_name: + candidate_name = dataset.candidate_name + + if agent or agent_configs_map: + inference_configs = inference_configs or {} + if candidate_name not in inference_configs: + if user_simulator_config is None: + user_simulator_config = evals_types.UserSimulatorConfig(max_turn=5) + elif isinstance(user_simulator_config, dict): + user_simulator_config = ( + evals_types.UserSimulatorConfig.model_validate( + user_simulator_config + ) + ) + + inference_configs[candidate_name] = types.EvaluationRunInferenceConfig( + agent_run_config=types.AgentRunConfig( + agent_engine=agent, user_simulator_config=user_simulator_config + ), + agent_definitions=agent_configs_map, + ) if isinstance(dataset, types.EvaluationDataset): _evals_utils._validate_dataset_agent_data(dataset, inference_configs) resolved_dataset = _evals_common._resolve_dataset( - self._api_client, dataset, dest, agent_info_pydantic + self._api_client, dataset, dest, agent_configs_map ) output_config = genai_types.OutputConfig( gcs_destination=genai_types.GcsDestination(output_uri_prefix=dest) @@ -1845,10 +1890,10 @@ def create_evaluation_run( output_config=output_config, metrics=resolved_metrics ) resolved_inference_configs = _evals_common._resolve_inference_configs( - self._api_client, resolved_dataset, inference_configs, agent_info_pydantic + self._api_client, resolved_dataset, inference_configs, agent_configs_map ) resolved_labels = _evals_common._add_evaluation_run_labels( - labels, agent_info_pydantic + labels, agent_configs_map ) resolved_name = name or f"evaluation_run_{uuid.uuid4()}" return self._create_evaluation_run( @@ -2002,7 +2047,7 @@ def create_evaluation_set( def generate_user_scenarios( self, *, - agents: dict[str, evals_types.AgentConfigOrDict], + agent_definitions: dict[str, evals_types.AgentConfigOrDict], user_scenario_generation_config: evals_types.UserScenarioGenerationConfigOrDict, root_agent_id: str, ) -> types.EvaluationDataset: @@ -2011,7 +2056,7 @@ def generate_user_scenarios( and the agent under test. Args: - agents: A map of agent ID to AgentConfig. + agent_definitions: A map of agent ID to AgentConfig. user_scenario_generation_config: Configuration for generating user scenarios. root_agent_id: The ID of the root agent. @@ -2019,7 +2064,7 @@ def generate_user_scenarios( An EvaluationDataset containing the generated user scenarios. """ response = self._generate_user_scenarios( - agents=agents, + agent_definitions=agent_definitions, user_scenario_generation_config=user_scenario_generation_config, root_agent_id=root_agent_id, ) @@ -2092,6 +2137,7 @@ async def _create_evaluation_run( *, name: Optional[str] = None, display_name: Optional[str] = None, + agent: Optional[str] = None, data_source: types.EvaluationRunDataSourceOrDict, evaluation_config: types.EvaluationRunConfigOrDict, labels: Optional[dict[str, str]] = None, @@ -2107,6 +2153,7 @@ async def _create_evaluation_run( parameter_model = types._CreateEvaluationRunParameters( name=name, display_name=display_name, + agent=agent, data_source=data_source, evaluation_config=evaluation_config, labels=labels, @@ -2364,7 +2411,7 @@ async def _generate_user_scenarios( self, *, location: Optional[str] = None, - agents: Optional[dict[str, evals_types.AgentConfigOrDict]] = None, + agent_definitions: Optional[dict[str, evals_types.AgentConfigOrDict]] = None, root_agent_id: Optional[str] = None, user_scenario_generation_config: Optional[ evals_types.UserScenarioGenerationConfigOrDict @@ -2377,7 +2424,7 @@ async def _generate_user_scenarios( parameter_model = types._GenerateUserScenariosParameters( location=location, - agents=agents, + agent_definitions=agent_definitions, root_agent_id=root_agent_id, user_scenario_generation_config=user_scenario_generation_config, config=config, @@ -2724,7 +2771,11 @@ async def create_evaluation_run( metrics: list[types.EvaluationRunMetricOrDict], name: Optional[str] = None, display_name: Optional[str] = None, - agent_info: Optional[evals_types.AgentInfo] = None, + agent: Optional[str] = None, + agent_definitions: Optional[dict[str, evals_types.AgentConfigOrDict]] = None, + user_simulator_config: Optional[ + Union[evals_types.UserSimulatorConfigOrDict, dict[str, Any]] + ] = None, inference_configs: Optional[ dict[str, types.EvaluationRunInferenceConfigOrDict] ] = None, @@ -2739,10 +2790,12 @@ async def create_evaluation_run( metrics: The list of metrics to evaluate. name: The name of the evaluation run. display_name: The display name of the evaluation run. - agent_info: The agent info to evaluate. + agent: The agent engine used to run agent. + agent_definitions: The agent configurations to evaluate, as a dictionary mapping agent IDs to agent configurations. + user_simulator_config: The configuration for the user simulator in multi-turn agent evaluation. inference_configs: The candidate to inference config map for the evaluation run. The key is the candidate name, and the value is the inference config. - If provided, agent_info must be None. + If provided, agent and agent_definitions must be None. Example: {"candidate-1": types.EvaluationRunInferenceConfig(model="gemini-2.5-flash")} labels: The labels to apply to the evaluation run. @@ -2751,19 +2804,54 @@ async def create_evaluation_run( Returns: The created evaluation run. """ - if agent_info and inference_configs: + if (agent or agent_definitions) and inference_configs: raise ValueError( - "At most one of agent_info or inference_configs can be provided." + "At most one of (agent, agent_definitions) or inference_configs can be provided." ) - agent_info_pydantic = ( - evals_types.AgentInfo.model_validate(agent_info) - if isinstance(agent_info, dict) - else (agent_info or evals_types.AgentInfo()) - ) + + agent_configs_map = None + + if agent_definitions: + agent_configs_map = { + k: ( + evals_types.AgentConfig.model_validate(v) + if isinstance(v, dict) + else v + ) + for k, v in agent_definitions.items() + } + + candidate_name = None + if agent_configs_map: + candidate_name = list(agent_configs_map.keys())[0] + + candidate_name = candidate_name or "candidate-1" + + if isinstance(dataset, types.EvaluationDataset) and dataset.candidate_name: + candidate_name = dataset.candidate_name + + if agent or agent_configs_map: + inference_configs = inference_configs or {} + if candidate_name not in inference_configs: + if user_simulator_config is None: + user_simulator_config = evals_types.UserSimulatorConfig(max_turn=5) + elif isinstance(user_simulator_config, dict): + user_simulator_config = ( + evals_types.UserSimulatorConfig.model_validate( + user_simulator_config + ) + ) + + inference_configs[candidate_name] = types.EvaluationRunInferenceConfig( + agent_run_config=types.AgentRunConfig( + agent_engine=agent, user_simulator_config=user_simulator_config + ), + agent_definitions=agent_configs_map, + ) if isinstance(dataset, types.EvaluationDataset): _evals_utils._validate_dataset_agent_data(dataset, inference_configs) resolved_dataset = _evals_common._resolve_dataset( - self._api_client, dataset, dest, agent_info_pydantic + self._api_client, dataset, dest, agent_configs_map ) output_config = genai_types.OutputConfig( gcs_destination=genai_types.GcsDestination(output_uri_prefix=dest) @@ -2775,10 +2863,10 @@ async def create_evaluation_run( output_config=output_config, metrics=resolved_metrics ) resolved_inference_configs = _evals_common._resolve_inference_configs( - self._api_client, resolved_dataset, inference_configs, agent_info_pydantic + self._api_client, resolved_dataset, inference_configs, agent_configs_map ) resolved_labels = _evals_common._add_evaluation_run_labels( - labels, agent_info_pydantic + labels, agent_configs_map ) resolved_name = name or f"evaluation_run_{uuid.uuid4()}" @@ -2939,7 +3027,7 @@ async def create_evaluation_set( async def generate_user_scenarios( self, *, - agents: dict[str, evals_types.AgentConfigOrDict], + agent_definitions: dict[str, evals_types.AgentConfigOrDict], user_scenario_generation_config: evals_types.UserScenarioGenerationConfigOrDict, root_agent_id: str, ) -> types.EvaluationDataset: @@ -2948,7 +3036,7 @@ async def generate_user_scenarios( and the agent under test. Args: - agents: A map of agent ID to AgentConfig. + agent_definitions: A map of agent ID to AgentConfig. user_scenario_generation_config: Configuration for generating user scenarios. root_agent_id: The ID of the root agent. @@ -2956,7 +3044,7 @@ async def generate_user_scenarios( An EvaluationDataset containing the generated user scenarios. """ response = await self._generate_user_scenarios( - agents=agents, + agent_definitions=agent_definitions, user_scenario_generation_config=user_scenario_generation_config, root_agent_id=root_agent_id, ) diff --git a/vertexai/_genai/types/common.py b/vertexai/_genai/types/common.py index 4a7512d0fe..fab561c303 100644 --- a/vertexai/_genai/types/common.py +++ b/vertexai/_genai/types/common.py @@ -2663,7 +2663,7 @@ class EvaluationRunInferenceConfig(_common.BaseModel): default=None, description="""Configuration for Agent Run in evaluation management service.""", ) - agent_configs: Optional[dict[str, evals_types.AgentConfig]] = Field( + agent_definitions: Optional[dict[str, evals_types.AgentConfig]] = Field( default=None, description="""A map of agent IDs to their respective agent config.""", ) @@ -2687,7 +2687,7 @@ class EvaluationRunInferenceConfigDict(TypedDict, total=False): agent_run_config: Optional[AgentRunConfigDict] """Configuration for Agent Run in evaluation management service.""" - agent_configs: Optional[dict[str, evals_types.AgentConfig]] + agent_definitions: Optional[dict[str, evals_types.AgentConfig]] """A map of agent IDs to their respective agent config.""" @@ -2721,6 +2721,7 @@ class _CreateEvaluationRunParameters(_common.BaseModel): name: Optional[str] = Field(default=None, description="""""") display_name: Optional[str] = Field(default=None, description="""""") + agent: Optional[str] = Field(default=None, description="""""") data_source: Optional[EvaluationRunDataSource] = Field( default=None, description="""""" ) @@ -2745,6 +2746,9 @@ class _CreateEvaluationRunParametersDict(TypedDict, total=False): display_name: Optional[str] """""" + agent: Optional[str] + """""" + data_source: Optional[EvaluationRunDataSourceDict] """""" @@ -3055,10 +3059,6 @@ class EvalCase(_common.BaseModel): default=None, description="""This field is experimental and may change in future versions. Intermediate events of a single turn in an agent run or intermediate events of the last turn for multi-turn an agent run.""", ) - agent_info: Optional[evals_types.AgentInfo] = Field( - default=None, - description="""This field is experimental and may change in future versions. The agent info of the agent under evaluation. This can be extended for multi-agent evaluation.""", - ) agent_data: Optional[evals_types.AgentData] = Field( default=None, description="""This field is experimental and may change in future versions. The agent data of the agent under evaluation.""", @@ -3098,9 +3098,6 @@ class EvalCaseDict(TypedDict, total=False): intermediate_events: Optional[list[evals_types.Event]] """This field is experimental and may change in future versions. Intermediate events of a single turn in an agent run or intermediate events of the last turn for multi-turn an agent run.""" - agent_info: Optional[evals_types.AgentInfo] - """This field is experimental and may change in future versions. The agent info of the agent under evaluation. This can be extended for multi-agent evaluation.""" - agent_data: Optional[evals_types.AgentData] """This field is experimental and may change in future versions. The agent data of the agent under evaluation.""" @@ -3330,9 +3327,9 @@ class EvaluationResult(_common.BaseModel): metadata: Optional[EvaluationRunMetadata] = Field( default=None, description="""Metadata for the evaluation run.""" ) - agent_info: Optional[evals_types.AgentInfo] = Field( + agent_definitions: Optional[dict[str, evals_types.AgentConfig]] = Field( default=None, - description="""This field is experimental and may change in future versions. The agent info of the agent under evaluation. This can be extended for multi-agent evaluation.""", + description="""This field is experimental and may change in future versions. The agent configs of the agents under evaluation.""", ) def show(self, candidate_names: Optional[List[str]] = None) -> None: @@ -3365,8 +3362,8 @@ class EvaluationResultDict(TypedDict, total=False): metadata: Optional[EvaluationRunMetadataDict] """Metadata for the evaluation run.""" - agent_info: Optional[evals_types.AgentInfo] - """This field is experimental and may change in future versions. The agent info of the agent under evaluation. This can be extended for multi-agent evaluation.""" + agent_definitions: Optional[dict[str, evals_types.AgentConfig]] + """This field is experimental and may change in future versions. The agent configs of the agents under evaluation.""" EvaluationResultOrDict = Union[EvaluationResult, EvaluationResultDict] @@ -5425,7 +5422,7 @@ class _GenerateUserScenariosParameters(_common.BaseModel): """Parameters for GenerateUserScenarios.""" location: Optional[str] = Field(default=None, description="""""") - agents: Optional[dict[str, evals_types.AgentConfig]] = Field( + agent_definitions: Optional[dict[str, evals_types.AgentConfig]] = Field( default=None, description="""""" ) root_agent_id: Optional[str] = Field(default=None, description="""""") @@ -5443,7 +5440,7 @@ class _GenerateUserScenariosParametersDict(TypedDict, total=False): location: Optional[str] """""" - agents: Optional[dict[str, evals_types.AgentConfig]] + agent_definitions: Optional[dict[str, evals_types.AgentConfig]] """""" root_agent_id: Optional[str] diff --git a/vertexai/_genai/types/evals.py b/vertexai/_genai/types/evals.py index a262d9d9f2..dc220def0d 100644 --- a/vertexai/_genai/types/evals.py +++ b/vertexai/_genai/types/evals.py @@ -207,6 +207,10 @@ def from_agent( description=getattr(agent, "description", None), instruction=getattr(agent, "instruction", None), tools=AgentConfig._get_tool_declarations_from_agent(agent), + sub_agents=[ + getattr(sa, "name", "agent_0") + for sa in getattr(agent, "sub_agents", []) + ], ) @@ -366,7 +370,7 @@ class EventsDict(TypedDict, total=False): class AgentData(_common.BaseModel): """Represents data specific to multi-turn agent evaluations.""" - agents: Optional[dict[str, AgentConfig]] = Field( + agent_definitions: Optional[dict[str, AgentConfig]] = Field( default=None, description="""A map containing the static configurations for each agent in the system. Key: agent_id (matches the `author` field in events). @@ -387,8 +391,8 @@ class AgentData(_common.BaseModel): events: Optional[Events] = Field(default=None, description="""A list of events.""") @classmethod - def _get_agents_map(cls, agent: Any) -> dict[str, AgentConfig]: - """Recursively gets all agent configs from an agent and its sub-agents. + def get_agent_definitions(cls, agent: Any) -> dict[str, AgentConfig]: + """Recursively gets all agent definitions from an agent and its sub-agents. Args: agent: The agent to get the agent info from. @@ -401,7 +405,7 @@ def _get_agents_map(cls, agent: Any) -> dict[str, AgentConfig]: agents_map = {agent_id: agent_config} for sub_agent in getattr(agent, "sub_agents", []): - agents_map.update(cls._get_agents_map(sub_agent)) + agents_map.update(cls.get_agent_definitions(sub_agent)) return agents_map @@ -419,7 +423,7 @@ def from_session(cls, agent: Any, session_history: list[Any]) -> "AgentData": Returns: An AgentData object containing the segmented history and agent config. """ - agents_map = cls._get_agents_map(agent) + agents_map = cls.get_agent_definitions(agent) agent_id = getattr(agent, "name", "agent_0") or "agent_0" turns: list[ConversationTurn] = [] @@ -494,13 +498,15 @@ def from_session(cls, agent: Any, session_history: list[Any]) -> "AgentData": ) ) - return cls(agents=agents_map, turns=turns) # pytype: disable=missing-parameter + return cls( + agent_definitions=agents_map, turns=turns + ) # pytype: disable=missing-parameter class AgentDataDict(TypedDict, total=False): """Represents data specific to multi-turn agent evaluations.""" - agents: Optional[dict[str, AgentConfigDict]] + agent_definitions: Optional[dict[str, AgentConfigDict]] """A map containing the static configurations for each agent in the system. Key: agent_id (matches the `author` field in events). Value: The static configuration of the agent.""" @@ -523,107 +529,6 @@ class AgentDataDict(TypedDict, total=False): AgentDataOrDict = Union[AgentData, AgentDataDict] -class AgentInfo(_common.BaseModel): - """The agent info of an agent, used for agent eval.""" - - agent_resource_name: Optional[str] = Field( - default=None, - description="""The agent engine used to run agent. Agent engine resource name in str type, with format - `projects/{project}/locations/{location}/reasoningEngines/{reasoning_engine_id}`.""", - ) - name: Optional[str] = Field( - default=None, description="""Agent name, used as an identifier.""" - ) - instruction: Optional[str] = Field( - default=None, description="""Agent developer instruction.""" - ) - description: Optional[str] = Field( - default=None, description="""Agent description.""" - ) - tool_declarations: Optional[genai_types.ToolListUnion] = Field( - default=None, description="""List of tools used by the Agent.""" - ) - - @staticmethod - def _get_tool_declarations_from_agent(agent: Any) -> genai_types.ToolListUnion: - """Gets tool declarations from an agent. - - Args: - agent: The agent to get the tool declarations from. Data type is google.adk.agents.LLMAgent type, use Any to avoid dependency on ADK. - - Returns: - The tool declarations of the agent. - """ - tool_declarations: genai_types.ToolListUnion = [] - for tool in agent.tools: - tool_declarations.append( - { - "function_declarations": [ - genai_types.FunctionDeclaration.from_callable_with_api_option( - callable=tool - ) - ] - } - ) - return tool_declarations - - @classmethod - def load_from_agent( - cls, agent: Any, agent_resource_name: Optional[str] = None - ) -> "AgentInfo": - """Loads agent info from an agent. - - Args: - agent: The agent to get the agent info from, data type is google.adk.agents.LLMAgent type, use Any to avoid dependency on ADK. - agent_resource_name: Optional. The agent engine resource name. - - Returns: - The agent info of the agent. - - Example: - ``` - from vertexai._genai import types - - # Assuming 'my_agent' is an instance of google.adk.agents.LLMAgent - - agent_info = types.evals.AgentInfo.load_from_agent( - agent=my_agent, - agent_resource_name="projects/123/locations/us-central1/reasoningEngines/456" - ) - ``` - """ - return cls( # pytype: disable=missing-parameter - name=agent.name, - agent_resource_name=agent_resource_name, - instruction=agent.instruction, - description=agent.description, - tool_declarations=AgentInfo._get_tool_declarations_from_agent(agent), - ) - - -class AgentInfoDict(TypedDict, total=False): - """The agent info of an agent, used for agent eval.""" - - agent_resource_name: Optional[str] - """The agent engine used to run agent. Agent engine resource name in str type, with format - `projects/{project}/locations/{location}/reasoningEngines/{reasoning_engine_id}`.""" - - name: Optional[str] - """Agent name, used as an identifier.""" - - instruction: Optional[str] - """Agent developer instruction.""" - - description: Optional[str] - """Agent description.""" - - tool_declarations: Optional[genai_types.ToolListUnionDict] - """List of tools used by the Agent.""" - - -AgentInfoOrDict = Union[AgentInfo, AgentInfoDict] - - class RubricContentProperty(_common.BaseModel): """Defines criteria based on a specific property."""