# test_agent_evaluator.py
# Copyright 2026 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations

import sys
from unittest.mock import MagicMock
from unittest.mock import patch

from google.adk.evaluation.agent_evaluator import _EvalMetricResultWithInvocation
from google.adk.evaluation.agent_evaluator import AgentEvaluator
from google.adk.evaluation.eval_case import Invocation
from google.adk.evaluation.eval_metrics import EvalMetricResult
from google.adk.evaluation.eval_metrics import EvalStatus
from google.genai import types as genai_types


def _make_actual_invocation(
    query: str = "user query", response: str = "agent response"
) -> Invocation:
  """Builds an actual Invocation from a user query and an agent response."""
  return Invocation(
      user_content=genai_types.Content(
          parts=[genai_types.Part(text=query)], role="user"
      ),
      final_response=genai_types.Content(
          parts=[genai_types.Part(text=response)], role="model"
      ),
  )


def _make_eval_metric_result(
    score: float = 0.9, status: EvalStatus = EvalStatus.PASSED
) -> EvalMetricResult:
  """Builds an EvalMetricResult for a test metric with the given score and status."""
  return EvalMetricResult(
      metric_name="test_metric",
      threshold=0.8,
      score=score,
      eval_status=status,
  )


def _call_print_details(
    items: list[_EvalMetricResultWithInvocation],
) -> MagicMock:
  """Calls _print_details with mocked pandas/tabulate, returns the mock DataFrame class."""
  mock_pandas = MagicMock()
  mock_tabulate_module = MagicMock()
  mock_tabulate_module.tabulate = MagicMock(return_value="table")
  with patch.dict(
      sys.modules,
      {"pandas": mock_pandas, "tabulate": mock_tabulate_module},
  ):
    AgentEvaluator._print_details(
        eval_metric_result_with_invocations=items,
        overall_eval_status=EvalStatus.PASSED,
        overall_score=0.9,
        metric_name="test_metric",
        threshold=0.8,
    )
  return mock_pandas.pandas.DataFrame
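

# A note on the mocking technique above (explanatory, not part of the original
# tests): patch.dict on sys.modules temporarily replaces the cached module
# entries, so any `import pandas` or `import tabulate` executed while the
# context is active resolves to the MagicMock instead of the real package.
# For example:
#
#   with patch.dict(sys.modules, {"pandas": MagicMock()}):
#       import pandas  # -> the MagicMock, even if pandas isn't installed
#
# This lets _print_details run without the optional eval dependencies.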


class TestPrintDetailsWithNoExpectedInvocation:
  """Tests for _print_details when expected_invocation is None."""

  def test_does_not_raise(self):
    items = [
        _EvalMetricResultWithInvocation(
            actual_invocation=_make_actual_invocation(),
            expected_invocation=None,
            eval_metric_result=_make_eval_metric_result(),
        )
    ]
    _call_print_details(items)  # should not raise

  def test_multiple_invocations_all_without_expected(self):
    items = [
        _EvalMetricResultWithInvocation(
            actual_invocation=_make_actual_invocation(response=f"response {i}"),
            expected_invocation=None,
            eval_metric_result=_make_eval_metric_result(),
        )
        for i in range(3)
    ]
    mock_df_cls = _call_print_details(items)
    data = mock_df_cls.call_args[0][0]
    assert len(data) == 3
    for row in data:
      assert row["prompt"] == ""
      assert row["expected_response"] == ""
      assert row["expected_tool_calls"] == ""
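

# A minimal sketch for running this module directly (an addition, not part of
# the original file; tests are normally discovered by the project's test
# runner). Assumes pytest is importable in the current environment.
if __name__ == "__main__":
  import pytest

  raise SystemExit(pytest.main([__file__, "-v"]))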