Skip to content

Commit bf1cefd

Browse files
vertex-sdk-bot authored and copybara-github committed
feat: GenAI Client(evals) - support loss analysis in EvaluationRun SDK
PiperOrigin-RevId: 898353195
1 parent 9ed3759 commit bf1cefd

8 files changed

Lines changed: 1966 additions & 266 deletions

File tree

tests/unit/vertexai/genai/test_evals.py

Lines changed: 861 additions & 10 deletions
Large diffs are not rendered by default.

vertexai/_genai/_evals_common.py

Lines changed: 46 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -2396,14 +2396,48 @@ def _get_eval_result_from_eval_items(
23962396
return eval_result
23972397

23982398

2399+
def _build_eval_item_map(
2400+
eval_items: list[types.EvaluationItem],
2401+
) -> dict[str, dict[str, Any]]:
2402+
"""Builds a mapping from EvaluationItem resource name to serialized data.
2403+
2404+
This is used by the loss analysis visualization to enrich examples with
2405+
scenario and rubric data from the original evaluation items.
2406+
2407+
Args:
2408+
eval_items: The list of EvaluationItem objects.
2409+
2410+
Returns:
2411+
A dict mapping evaluation item resource name to the serialized
2412+
evaluation_response dict (which the JS visualization reads as
2413+
``evaluation_result``).
2414+
"""
2415+
item_map: dict[str, dict[str, Any]] = {}
2416+
for item in eval_items:
2417+
if item.name and item.evaluation_response:
2418+
try:
2419+
item_map[item.name] = item.evaluation_response.model_dump(
2420+
mode="json", exclude_none=True
2421+
)
2422+
except Exception:
2423+
pass
2424+
return item_map
2425+
2426+
23992427
def _convert_evaluation_run_results(
24002428
api_client: BaseApiClient,
24012429
evaluation_run_results: types.EvaluationRunResults,
24022430
inference_configs: Optional[dict[str, types.EvaluationRunInferenceConfig]] = None,
2403-
) -> Optional[types.EvaluationResult]:
2404-
"""Retrieves an EvaluationItem from the EvaluationRunResults."""
2431+
) -> tuple[Optional[types.EvaluationResult], dict[str, dict[str, Any]]]:
2432+
"""Retrieves an EvaluationResult and item map from EvaluationRunResults.
2433+
2434+
Returns:
2435+
A tuple of (EvaluationResult, eval_item_map). The eval_item_map maps
2436+
evaluation item resource names to their serialized evaluation response
2437+
data, used for enriching loss analysis visualization.
2438+
"""
24052439
if not evaluation_run_results or not evaluation_run_results.evaluation_set:
2406-
return None
2440+
return None, {}
24072441

24082442
evals_module = evals.Evals(api_client_=api_client)
24092443
eval_set = evals_module.get_evaluation_set(
@@ -2416,19 +2450,21 @@ def _convert_evaluation_run_results(
24162450
evals_module.get_evaluation_item(name=item_name)
24172451
for item_name in eval_set.evaluation_items
24182452
]
2419-
return _get_eval_result_from_eval_items(
2453+
eval_result = _get_eval_result_from_eval_items(
24202454
evaluation_run_results, eval_items, inference_configs
24212455
)
2456+
eval_item_map = _build_eval_item_map(eval_items)
2457+
return eval_result, eval_item_map
24222458

24232459

24242460
async def _convert_evaluation_run_results_async(
24252461
api_client: BaseApiClient,
24262462
evaluation_run_results: types.EvaluationRunResults,
24272463
inference_configs: Optional[dict[str, types.EvaluationRunInferenceConfig]] = None,
2428-
) -> Optional[types.EvaluationResult]:
2429-
"""Retrieves an EvaluationItem from the EvaluationRunResults."""
2464+
) -> tuple[Optional[types.EvaluationResult], dict[str, dict[str, Any]]]:
2465+
"""Retrieves an EvaluationResult and item map from EvaluationRunResults."""
24302466
if not evaluation_run_results or not evaluation_run_results.evaluation_set:
2431-
return None
2467+
return None, {}
24322468

24332469
evals_module = evals.AsyncEvals(api_client_=api_client)
24342470
eval_set = await evals_module.get_evaluation_set(
@@ -2442,9 +2478,11 @@ async def _convert_evaluation_run_results_async(
24422478
for eval_item in eval_set.evaluation_items
24432479
]
24442480
eval_items = await asyncio.gather(*tasks)
2445-
return _get_eval_result_from_eval_items(
2481+
eval_result = _get_eval_result_from_eval_items(
24462482
evaluation_run_results, eval_items, inference_configs
24472483
)
2484+
eval_item_map = _build_eval_item_map(eval_items)
2485+
return eval_result, eval_item_map
24482486

24492487

24502488
def _object_to_dict(obj: Any) -> Union[dict[str, Any], Any]:

0 commit comments

Comments
 (0)