@@ -2396,14 +2396,48 @@ def _get_eval_result_from_eval_items(
23962396 return eval_result
23972397
23982398
2399+ def _build_eval_item_map (
2400+ eval_items : list [types .EvaluationItem ],
2401+ ) -> dict [str , dict [str , Any ]]:
2402+ """Builds a mapping from EvaluationItem resource name to serialized data.
2403+
2404+ This is used by the loss analysis visualization to enrich examples with
2405+ scenario and rubric data from the original evaluation items.
2406+
2407+ Args:
2408+ eval_items: The list of EvaluationItem objects.
2409+
2410+ Returns:
2411+ A dict mapping evaluation item resource name to the serialized
2412+ evaluation_response dict (which the JS visualization reads as
2413+ ``evaluation_result``).
2414+ """
2415+ item_map : dict [str , dict [str , Any ]] = {}
2416+ for item in eval_items :
2417+ if item .name and item .evaluation_response :
2418+ try :
2419+ item_map [item .name ] = item .evaluation_response .model_dump (
2420+ mode = "json" , exclude_none = True
2421+ )
2422+ except Exception :
2423+ pass
2424+ return item_map
2425+
2426+
23992427def _convert_evaluation_run_results (
24002428 api_client : BaseApiClient ,
24012429 evaluation_run_results : types .EvaluationRunResults ,
24022430 inference_configs : Optional [dict [str , types .EvaluationRunInferenceConfig ]] = None ,
2403- ) -> Optional [types .EvaluationResult ]:
2404- """Retrieves an EvaluationItem from the EvaluationRunResults."""
2431+ ) -> tuple [Optional [types .EvaluationResult ], dict [str , dict [str , Any ]]]:
2432+ """Retrieves an EvaluationResult and item map from EvaluationRunResults.
2433+
2434+ Returns:
2435+ A tuple of (EvaluationResult, eval_item_map). The eval_item_map maps
2436+ evaluation item resource names to their serialized evaluation response
2437+ data, used for enriching loss analysis visualization.
2438+ """
24052439 if not evaluation_run_results or not evaluation_run_results .evaluation_set :
2406- return None
2440+ return None , {}
24072441
24082442 evals_module = evals .Evals (api_client_ = api_client )
24092443 eval_set = evals_module .get_evaluation_set (
@@ -2416,19 +2450,21 @@ def _convert_evaluation_run_results(
24162450 evals_module .get_evaluation_item (name = item_name )
24172451 for item_name in eval_set .evaluation_items
24182452 ]
2419- return _get_eval_result_from_eval_items (
2453+ eval_result = _get_eval_result_from_eval_items (
24202454 evaluation_run_results , eval_items , inference_configs
24212455 )
2456+ eval_item_map = _build_eval_item_map (eval_items )
2457+ return eval_result , eval_item_map
24222458
24232459
24242460async def _convert_evaluation_run_results_async (
24252461 api_client : BaseApiClient ,
24262462 evaluation_run_results : types .EvaluationRunResults ,
24272463 inference_configs : Optional [dict [str , types .EvaluationRunInferenceConfig ]] = None ,
2428- ) -> Optional [types .EvaluationResult ]:
2429- """Retrieves an EvaluationItem from the EvaluationRunResults."""
2464+ ) -> tuple [ Optional [types .EvaluationResult ], dict [ str , dict [ str , Any ]] ]:
2465+ """Retrieves an EvaluationResult and item map from EvaluationRunResults."""
24302466 if not evaluation_run_results or not evaluation_run_results .evaluation_set :
2431- return None
2467+ return None , {}
24322468
24332469 evals_module = evals .AsyncEvals (api_client_ = api_client )
24342470 eval_set = await evals_module .get_evaluation_set (
@@ -2442,9 +2478,11 @@ async def _convert_evaluation_run_results_async(
24422478 for eval_item in eval_set .evaluation_items
24432479 ]
24442480 eval_items = await asyncio .gather (* tasks )
2445- return _get_eval_result_from_eval_items (
2481+ eval_result = _get_eval_result_from_eval_items (
24462482 evaluation_run_results , eval_items , inference_configs
24472483 )
2484+ eval_item_map = _build_eval_item_map (eval_items )
2485+ return eval_result , eval_item_map
24482486
24492487
24502488def _object_to_dict (obj : Any ) -> Union [dict [str , Any ], Any ]:
0 commit comments