Azure · Copilot · May 6, 2026 · May 6, 2026 · May 6, 2026
@@ -14,6 +14,13 @@
 
 ### Bugs Fixed
 
+- Fixed `_TaskNavigationEfficiencyEvaluator` failing with `'response' must be a list of messages.`
+  when invoked through the cloud Foundry / ACA evaluation runtime. The runtime serializes
+  list/object dataMapping fields to JSON-encoded strings before calling the Python evaluator;
+  the evaluator now transparently JSON-decodes such string inputs before validation.
+- Fixed the validator rejecting the JSON round-tripped form of the `ground_truth` tuple.
+  JSON has no tuple type, so a `(list, dict)` tuple round-trips to a `[list, dict]` 2-element
+  list; both forms are now accepted equivalently.
 - Fixed row classification double-counting in `_calculate_aoai_evaluation_summary` where errored rows were counted separately and could also be counted as passed/failed. Rows are now classified into mutually exclusive buckets with priority: passed > failed > errored > skipped.
 - Fixed row classification where rows with empty or missing results lists were incorrectly counted as "passed" (the condition `passed_count == len(results) - error_count` evaluated `0 == 0` as True).
 - Fixed `_get_metric_result` prefix matching where shorter metric names (e.g., `xpia`) could match before longer, more-specific ones (e.g., `xpia_manipulated_content`). Now sorts by length descending for correct longest-prefix matching.

@@ -115,76 +115,92 @@ def _validate_response(self, response: Any) -> Optional[EvaluationException]:
 
         return None
 
-    def _validate_ground_truth(self, ground_truth: Any) -> Optional[EvaluationException]:
-        """Validate the ground_truth parameter."""
-        if not ground_truth:
+    def _validate_tool_names_and_params(
+        self, tool_names: Any, parameters: Any
+    ) -> Optional[EvaluationException]:
+        """Validate the (tool_names_list, parameters_dict) pair used in the tuple form of ground_truth.
+
+        :param tool_names: The first element of the tuple/2-element list.
+        :type tool_names: Any
+        :param parameters: The second element of the tuple/2-element list.
+        :type parameters: Any
+        :return: An :class:`EvaluationException` if validation fails, or ``None`` on success.
+        :rtype: Optional[EvaluationException]
+        """
+        # Validate tool names list
+        if not isinstance(tool_names, list):
             return EvaluationException(
-                message="'ground_truth' parameter is required and cannot be None or empty.",
+                message="First element of 'ground_truth' tuple must be a list of tool names.",
                 blame=ErrorBlame.USER_ERROR,
-                category=ErrorCategory.MISSING_FIELD,
+                category=ErrorCategory.INVALID_VALUE,
                 target=self.error_target,
             )
 
-        # ground_truth can be either:
-        # 1. A list of tool names (strings)
-        # 2. A tuple of (list of tool names, dict of parameters)
+        if len(tool_names) == 0:
+            return EvaluationException(
+                message="Tool names list in 'ground_truth' cannot be empty.",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.INVALID_VALUE,
+                target=self.error_target,
+            )
 
-        if isinstance(ground_truth, tuple):
-            # Validate tuple format: (list, dict)
-            if len(ground_truth) != 2:
+        for idx, name in enumerate(tool_names):
+            if not isinstance(name, str):
                 return EvaluationException(
-                    message="When 'ground_truth' is a tuple, it must contain exactly 2 elements: (tool_names_list, parameters_dict).",
+                    message=f"Tool name at index {idx} in 'ground_truth' must be a string, got {type(name).__name__}.",
                     blame=ErrorBlame.USER_ERROR,
                     category=ErrorCategory.INVALID_VALUE,
                     target=self.error_target,
                 )
 
-            tool_names, parameters = ground_truth
+        # Validate parameters dict
+        if not isinstance(parameters, dict):
+            return EvaluationException(
+                message="Second element of 'ground_truth' tuple must be a dictionary of parameters.",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.INVALID_VALUE,
+                target=self.error_target,
+            )
 
-            # Validate tool names list
-            if not isinstance(tool_names, list):
+        # Validate parameter values are dicts
+        for tool_name, params in parameters.items():
+            if not isinstance(params, dict):
                 return EvaluationException(
-                    message="First element of 'ground_truth' tuple must be a list of tool names.",
+                    message=f"Parameters for tool '{tool_name}' in 'ground_truth' must be a dictionary, got {type(params).__name__}.",
                     blame=ErrorBlame.USER_ERROR,
                     category=ErrorCategory.INVALID_VALUE,
                     target=self.error_target,
                 )
 
-            if len(tool_names) == 0:
-                return EvaluationException(
-                    message="Tool names list in 'ground_truth' cannot be empty.",
-                    blame=ErrorBlame.USER_ERROR,
-                    category=ErrorCategory.INVALID_VALUE,
-                    target=self.error_target,
-                )
+        return None
 
-            for idx, name in enumerate(tool_names):
-                if not isinstance(name, str):
-                    return EvaluationException(
-                        message=f"Tool name at index {idx} in 'ground_truth' must be a string, got {type(name).__name__}.",
-                        blame=ErrorBlame.USER_ERROR,
-                        category=ErrorCategory.INVALID_VALUE,
-                        target=self.error_target,
-                    )
+    def _validate_ground_truth(self, ground_truth: Any) -> Optional[EvaluationException]:
+        """Validate the ground_truth parameter."""
+        if not ground_truth:
+            return EvaluationException(
+                message="'ground_truth' parameter is required and cannot be None or empty.",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.MISSING_FIELD,
+                target=self.error_target,
+            )
 
-            # Validate parameters dict
-            if not isinstance(parameters, dict):
+        # ground_truth can be either:
+        # 1. A list of tool names (strings)
+        # 2. A tuple of (list of tool names, dict of parameters)
+        # 3. A 2-element list [list, dict] — the JSON round-tripped form of (2)
+
+        if isinstance(ground_truth, tuple):
+            # Validate tuple format: (list, dict)
+            if len(ground_truth) != 2:
                 return EvaluationException(
-                    message="Second element of 'ground_truth' tuple must be a dictionary of parameters.",
+                    message="When 'ground_truth' is a tuple, it must contain exactly 2 elements: (tool_names_list, parameters_dict).",
                     blame=ErrorBlame.USER_ERROR,
                     category=ErrorCategory.INVALID_VALUE,
                     target=self.error_target,
                 )
 
-            # Validate parameter values are dicts
-            for tool_name, params in parameters.items():
-                if not isinstance(params, dict):
-                    return EvaluationException(
-                        message=f"Parameters for tool '{tool_name}' in 'ground_truth' must be a dictionary, got {type(params).__name__}.",
-                        blame=ErrorBlame.USER_ERROR,
-                        category=ErrorCategory.INVALID_VALUE,
-                        target=self.error_target,
-                    )
+            tool_names, parameters = ground_truth
+            return self._validate_tool_names_and_params(tool_names, parameters)
 
         elif isinstance(ground_truth, list):
             # Validate list of tool names
@@ -196,6 +212,20 @@ def _validate_ground_truth(self, ground_truth: Any) -> Optional[EvaluationExcept
                     target=self.error_target,
                 )
 
+            if all(isinstance(name, str) for name in ground_truth):
+                # Plain list of tool name strings — nothing further to validate.
+                return None
+
+            if (
+                len(ground_truth) == 2
+                and isinstance(ground_truth[0], list)
+                and isinstance(ground_truth[1], dict)
+            ):
+                # 2-element list [list, dict] — the JSON round-tripped form of a
+                # (tool_names_list, parameters_dict) tuple.  Validate it the same way.
+                return self._validate_tool_names_and_params(ground_truth[0], ground_truth[1])
+
+            # Identify the first non-string element to give a helpful error.
             for idx, name in enumerate(ground_truth):
                 if not isinstance(name, str):
                     return EvaluationException(

@@ -139,6 +139,36 @@ def __init__(
 
         super().__init__(threshold=1.0)
 
+    @staticmethod
+    def _maybe_json_decode(value: Any, field_name: str) -> Any:
+        """Decode a JSON-encoded string into a Python object.
+
+        The cloud Foundry / ACA evaluation runtime serializes list/object fields
+        to JSON-encoded strings via ``dataMapping`` template substitution
+        (e.g. ``${data.response}``) before invoking the Python evaluator entry-point.
+        This method accepts that shape transparently so callers using either the
+        in-process Python SDK or the cloud runtime get consistent behaviour.
+
+        :param value: The value to potentially decode.
+        :type value: Any
+        :param field_name: The field name used in error messages.
+        :type field_name: str
+        :return: The decoded Python object, or the original value if not a string.
+        :rtype: Any
+        :raises EvaluationException: If ``value`` is a string but not valid JSON.
+        """
+        if isinstance(value, str):
+            try:
+                return json.loads(value)
+            except json.JSONDecodeError as exc:
+                raise EvaluationException(
+                    message=(f"'{field_name}' arrived as a string but is not valid JSON: {exc}"),
+                    internal_message=str(exc),
+                    target=ErrorTarget.TASK_NAVIGATION_EFFICIENCY_EVALUATOR,
+                    category=ErrorCategory.INVALID_VALUE,
+                ) from exc  # chain explicitly so the JSON error is the __cause__
+        return value
+
     @override
     async def _real_call(self, **kwargs):
         """The asynchronous call where real end-to-end evaluation logic is performed.
@@ -148,6 +178,10 @@ async def _real_call(self, **kwargs):
         :return: The evaluation result.
         :rtype: Dict[str, Union[float, str, Dict[str, float]]]
         """
+        if "response" in kwargs:
+            kwargs["response"] = self._maybe_json_decode(kwargs["response"], "response")
+        if "ground_truth" in kwargs:
+            kwargs["ground_truth"] = self._maybe_json_decode(kwargs["ground_truth"], "ground_truth")
         self._validator.validate_eval_input(kwargs)
         return await super()._real_call(**kwargs)
 
@@ -275,8 +309,14 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str, Dict[s
         ground_truth_names = []
         ground_truth_params_dict: Dict[str, Dict[str, Any]] = {}
 
-        if isinstance(ground_truth, tuple) and len(ground_truth) == 2:
+        if (isinstance(ground_truth, tuple) and len(ground_truth) == 2) or (
+            isinstance(ground_truth, list)
+            and len(ground_truth) == 2
+            and isinstance(ground_truth[0], list)
+            and isinstance(ground_truth[1], dict)
+        ):
             # Tuple format: (tool_names, parameters_dict)
+            # Also handles a 2-element list [list, dict] which is the JSON round-tripped form of a tuple.
             tool_names_list, params_dict = ground_truth
 
             if not isinstance(tool_names_list, list) or not all(isinstance(name, str) for name in tool_names_list):