Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,13 @@

### Bugs Fixed

- Fixed `_TaskNavigationEfficiencyEvaluator` failing with `'response' must be a list of messages.`
when invoked through the cloud Foundry / ACA evaluation runtime. The runtime serializes
list/object dataMapping fields to JSON-encoded strings before calling the Python evaluator;
the evaluator now transparently JSON-decodes such string inputs before validation.
- Fixed the `_TaskNavigationEfficiencyEvaluator` input validator rejecting the JSON round-tripped
form of the `ground_truth` tuple. JSON has no tuple type, so a `(list, dict)` tuple round-trips
to a `[list, dict]` 2-element list; both forms are now accepted equivalently.
- Fixed row classification double-counting in `_calculate_aoai_evaluation_summary` where errored rows were counted separately and could also be counted as passed/failed. Rows are now classified into mutually exclusive buckets with priority: passed > failed > errored > skipped.
- Fixed row classification where rows with empty or missing results lists were incorrectly counted as "passed" (the condition `passed_count == len(results) - error_count` evaluated `0 == 0` as True).
- Fixed `_get_metric_result` prefix matching where shorter metric names (e.g., `xpia`) could match before longer, more-specific ones (e.g., `xpia_manipulated_content`). Now sorts by length descending for correct longest-prefix matching.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -115,76 +115,92 @@ def _validate_response(self, response: Any) -> Optional[EvaluationException]:

return None

def _validate_ground_truth(self, ground_truth: Any) -> Optional[EvaluationException]:
"""Validate the ground_truth parameter."""
if not ground_truth:
def _validate_tool_names_and_params(
self, tool_names: Any, parameters: Any
) -> Optional[EvaluationException]:
"""Validate the (tool_names_list, parameters_dict) pair used in the tuple form of ground_truth.

:param tool_names: The first element of the tuple/2-element list.
:type tool_names: Any
:param parameters: The second element of the tuple/2-element list.
:type parameters: Any
:return: An :class:`EvaluationException` if validation fails, or ``None`` on success.
:rtype: Optional[EvaluationException]
"""
# Validate tool names list
if not isinstance(tool_names, list):
return EvaluationException(
message="'ground_truth' parameter is required and cannot be None or empty.",
message="First element of 'ground_truth' tuple must be a list of tool names.",
blame=ErrorBlame.USER_ERROR,
category=ErrorCategory.MISSING_FIELD,
category=ErrorCategory.INVALID_VALUE,
target=self.error_target,
)

# ground_truth can be either:
# 1. A list of tool names (strings)
# 2. A tuple of (list of tool names, dict of parameters)
if len(tool_names) == 0:
return EvaluationException(
message="Tool names list in 'ground_truth' cannot be empty.",
blame=ErrorBlame.USER_ERROR,
category=ErrorCategory.INVALID_VALUE,
target=self.error_target,
)

if isinstance(ground_truth, tuple):
# Validate tuple format: (list, dict)
if len(ground_truth) != 2:
for idx, name in enumerate(tool_names):
if not isinstance(name, str):
return EvaluationException(
message="When 'ground_truth' is a tuple, it must contain exactly 2 elements: (tool_names_list, parameters_dict).",
message=f"Tool name at index {idx} in 'ground_truth' must be a string, got {type(name).__name__}.",
blame=ErrorBlame.USER_ERROR,
category=ErrorCategory.INVALID_VALUE,
target=self.error_target,
)

tool_names, parameters = ground_truth
# Validate parameters dict
if not isinstance(parameters, dict):
return EvaluationException(
message="Second element of 'ground_truth' tuple must be a dictionary of parameters.",
blame=ErrorBlame.USER_ERROR,
category=ErrorCategory.INVALID_VALUE,
target=self.error_target,
)

# Validate tool names list
if not isinstance(tool_names, list):
# Validate parameter values are dicts
for tool_name, params in parameters.items():
if not isinstance(params, dict):
return EvaluationException(
message="First element of 'ground_truth' tuple must be a list of tool names.",
message=f"Parameters for tool '{tool_name}' in 'ground_truth' must be a dictionary, got {type(params).__name__}.",
blame=ErrorBlame.USER_ERROR,
category=ErrorCategory.INVALID_VALUE,
target=self.error_target,
)

if len(tool_names) == 0:
return EvaluationException(
message="Tool names list in 'ground_truth' cannot be empty.",
blame=ErrorBlame.USER_ERROR,
category=ErrorCategory.INVALID_VALUE,
target=self.error_target,
)
return None

for idx, name in enumerate(tool_names):
if not isinstance(name, str):
return EvaluationException(
message=f"Tool name at index {idx} in 'ground_truth' must be a string, got {type(name).__name__}.",
blame=ErrorBlame.USER_ERROR,
category=ErrorCategory.INVALID_VALUE,
target=self.error_target,
)
def _validate_ground_truth(self, ground_truth: Any) -> Optional[EvaluationException]:
"""Validate the ground_truth parameter."""
if not ground_truth:
return EvaluationException(
message="'ground_truth' parameter is required and cannot be None or empty.",
blame=ErrorBlame.USER_ERROR,
category=ErrorCategory.MISSING_FIELD,
target=self.error_target,
)

# Validate parameters dict
if not isinstance(parameters, dict):
# ground_truth can be either:
# 1. A list of tool names (strings)
# 2. A tuple of (list of tool names, dict of parameters)
# 3. A 2-element list [list, dict] — the JSON round-tripped form of (2)

if isinstance(ground_truth, tuple):
# Validate tuple format: (list, dict)
if len(ground_truth) != 2:
return EvaluationException(
message="Second element of 'ground_truth' tuple must be a dictionary of parameters.",
message="When 'ground_truth' is a tuple, it must contain exactly 2 elements: (tool_names_list, parameters_dict).",
blame=ErrorBlame.USER_ERROR,
category=ErrorCategory.INVALID_VALUE,
target=self.error_target,
)

# Validate parameter values are dicts
for tool_name, params in parameters.items():
if not isinstance(params, dict):
return EvaluationException(
message=f"Parameters for tool '{tool_name}' in 'ground_truth' must be a dictionary, got {type(params).__name__}.",
blame=ErrorBlame.USER_ERROR,
category=ErrorCategory.INVALID_VALUE,
target=self.error_target,
)
tool_names, parameters = ground_truth
return self._validate_tool_names_and_params(tool_names, parameters)

elif isinstance(ground_truth, list):
# Validate list of tool names
Expand All @@ -196,6 +212,20 @@ def _validate_ground_truth(self, ground_truth: Any) -> Optional[EvaluationExcept
target=self.error_target,
)

if all(isinstance(name, str) for name in ground_truth):
# Plain list of tool name strings — nothing further to validate.
return None

if (
len(ground_truth) == 2
and isinstance(ground_truth[0], list)
and isinstance(ground_truth[1], dict)
):
# 2-element list [list, dict] — the JSON round-tripped form of a
# (tool_names_list, parameters_dict) tuple. Validate it the same way.
return self._validate_tool_names_and_params(ground_truth[0], ground_truth[1])

# Identify the first non-string element to give a helpful error.
for idx, name in enumerate(ground_truth):
if not isinstance(name, str):
return EvaluationException(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,36 @@ def __init__(

super().__init__(threshold=1.0)

@staticmethod
def _maybe_json_decode(value: Any, field_name: str) -> Any:
    """Decode a JSON-encoded string into a Python object.

    The cloud Foundry / ACA evaluation runtime serializes list/object fields
    to JSON-encoded strings via ``dataMapping`` template substitution
    (e.g. ``${data.response}``) before invoking the Python evaluator entry-point.
    This method accepts that shape transparently so callers using either the
    in-process Python SDK or the cloud runtime get consistent behaviour.

    Only ``str`` inputs are decoded; every other value (lists, tuples, dicts,
    ``None``, ...) is returned as the very same object. The decoded result can be
    any JSON type -- this helper does not check its shape.

    :param value: The value to potentially decode.
    :type value: Any
    :param field_name: The field name used in error messages.
    :type field_name: str
    :return: The decoded Python object, or the original value if not a string.
    :rtype: Any
    :raises EvaluationException: If ``value`` is a string but not valid JSON. The
        underlying ``json.JSONDecodeError`` is attached as ``__cause__``.
    """
    if not isinstance(value, str):
        # Already a native Python object; nothing to decode.
        return value

    try:
        return json.loads(value)
    except json.JSONDecodeError as exc:
        # Chain explicitly so the traceback shows the parse failure as the direct cause
        # rather than as an incidental "during handling of the above exception" context.
        raise EvaluationException(
            message=(f"'{field_name}' arrived as a string but is not valid JSON: {exc}"),
            internal_message=str(exc),
            target=ErrorTarget.TASK_NAVIGATION_EFFICIENCY_EVALUATOR,
            category=ErrorCategory.INVALID_VALUE,
        ) from exc

@override
async def _real_call(self, **kwargs):
"""The asynchronous call where real end-to-end evaluation logic is performed.
Expand All @@ -148,6 +178,10 @@ async def _real_call(self, **kwargs):
:return: The evaluation result.
:rtype: Dict[str, Union[float, str, Dict[str, float]]]
"""
if "response" in kwargs:
kwargs["response"] = self._maybe_json_decode(kwargs["response"], "response")
if "ground_truth" in kwargs:
kwargs["ground_truth"] = self._maybe_json_decode(kwargs["ground_truth"], "ground_truth")
self._validator.validate_eval_input(kwargs)
return await super()._real_call(**kwargs)

Expand Down Expand Up @@ -275,8 +309,14 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str, Dict[s
ground_truth_names = []
ground_truth_params_dict: Dict[str, Dict[str, Any]] = {}

if isinstance(ground_truth, tuple) and len(ground_truth) == 2:
if (isinstance(ground_truth, tuple) and len(ground_truth) == 2) or (
isinstance(ground_truth, list)
and len(ground_truth) == 2
and isinstance(ground_truth[0], list)
and isinstance(ground_truth[1], dict)
):
# Tuple format: (tool_names, parameters_dict)
# Also handles a 2-element list [list, dict] which is the JSON round-tripped form of a tuple.
tool_names_list, params_dict = ground_truth

if not isinstance(tool_names_list, list) or not all(isinstance(name, str) for name in tool_names_list):
Expand Down
Loading
Loading