Skip to content
Open
Show file tree
Hide file tree
Changes from 8 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
ede184e
Fix: Use App (with plugins) for eval when available
ishanrajsingh Dec 5, 2025
d0db7fd
Merge branch 'main' into fix-eval-use-app-with-plugins
ishanrajsingh Dec 5, 2025
cf77b85
Fix critical issues from code review
ishanrajsingh Dec 5, 2025
2311528
feat: Support App plugins in evaluation framework
ishanrajsingh Dec 5, 2025
de93f9f
feat: Support App plugins in evaluation framework
ishanrajsingh Dec 5, 2025
884b8df
Update src/google/adk/evaluation/evaluation_generator.py
ishanrajsingh Dec 5, 2025
1380758
feat: Support App plugins in evaluation framework
ishanrajsingh Dec 5, 2025
9bd1433
feat: Support App plugins in evaluation framework
ishanrajsingh Dec 5, 2025
379c9bd
refactor: Address Gemini Code Assist feedback
ishanrajsingh Dec 5, 2025
3c1c7a4
Update src/google/adk/evaluation/evaluation_generator.py
ishanrajsingh Dec 5, 2025
dae85d3
Update src/google/adk/evaluation/evaluation_generator.py
ishanrajsingh Dec 5, 2025
d03e414
refactor: Address Gemini Code Assist feedback
ishanrajsingh Dec 5, 2025
c1babc6
Update src/google/adk/evaluation/evaluation_generator.py
ishanrajsingh Dec 5, 2025
7909bee
Update src/google/adk/cli/cli_tools_click.py
ishanrajsingh Dec 6, 2025
1f41633
Merge branch 'main' into fix-eval-use-app-with-plugins
ishanrajsingh Dec 6, 2025
03919bd
fix: address CI/CD check failures
ishanrajsingh Dec 11, 2025
5f6b1ca
Merge branch 'main' into fix-eval-use-app-with-plugins
ishanrajsingh Dec 11, 2025
40b879e
Merge branch 'main' into fix-eval-use-app-with-plugins
ishanrajsingh Dec 11, 2025
72f6639
Merge branch 'main' into fix-eval-use-app-with-plugins
ishanrajsingh Dec 12, 2025
e71bdc2
fix: Add AppInferenceAdapter to remove cli dependency and enable App-…
ishanrajsingh Dec 12, 2025
c04a97e
Merge branch 'fix-eval-use-app-with-plugins' of https://github.com/is…
ishanrajsingh Dec 12, 2025
81fc2b7
fix: Move app loader to utils to fix CI check
ishanrajsingh Dec 12, 2025
e858f72
Merge branch 'main' into fix-eval-use-app-with-plugins
ishanrajsingh Jan 20, 2026
5315506
Merge branch 'main' into fix-eval-use-app-with-plugins
ishanrajsingh Jan 20, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 42 additions & 2 deletions src/google/adk/cli/cli_tools_click.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,12 @@
import os
from pathlib import Path
import tempfile
import textwrap
from typing import Optional
from typing import Optional, TYPE_CHECKING

if TYPE_CHECKING:
from ..apps.app import App

import textwrap
import click
from click.core import ParameterSource
from fastapi import FastAPI
Expand Down Expand Up @@ -515,6 +518,34 @@ def cli_run(
)
)

def _load_app_from_module(module_path: str) -> Optional['App']:
"""Try to load an App instance from the agent module.

Args:
module_path: Python module path (e.g., 'my_package.my_agent')

Returns:
App instance if found, None otherwise
"""
try:
import importlib
Comment thread
ishanrajsingh marked this conversation as resolved.
Outdated
module = importlib.import_module(module_path)

# Check for 'app' attribute (most common convention)
if hasattr(module, 'app'):
from ..apps.app import App
candidate = getattr(module, 'app')
if isinstance(candidate, App):
logger.info(f"Loaded App instance from {module_path}")
return candidate

logger.debug(f"No App instance found in {module_path}")

except (ImportError, AttributeError) as e:
logger.debug(f"Could not load App from module {module_path}: {e}")

return None
Comment thread
ishanrajsingh marked this conversation as resolved.
Outdated


def eval_options():
"""Decorator to add common eval options to click commands."""
Expand Down Expand Up @@ -733,10 +764,19 @@ def cli_eval(
)

try:
# Try to load App if available (for plugin support like ReflectAndRetryToolPlugin)
app = _load_app_from_module(agent_module_file_path)

if app:
logger.info("Using App instance for evaluation (plugins will be applied)")
else:
logger.info("No App found, using root_agent directly")

Comment thread
ishanrajsingh marked this conversation as resolved.
eval_service = LocalEvalService(
root_agent=root_agent,
eval_sets_manager=eval_sets_manager,
eval_set_results_manager=eval_set_results_manager,
app=app, # NEW: Pass app if available
user_simulator_provider=user_simulator_provider,
)

Expand Down
82 changes: 79 additions & 3 deletions src/google/adk/evaluation/evaluation_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,11 @@

import copy
import importlib
from typing import Any
from typing import AsyncGenerator
from typing import Optional
from typing import Any, AsyncGenerator, Optional, TYPE_CHECKING

if TYPE_CHECKING:
from ..apps.app import App

import uuid

from google.genai.types import Content
Expand All @@ -39,6 +41,7 @@
from .app_details import AgentDetails
from .app_details import AppDetails
from .eval_case import EvalCase
from .eval_case import IntermediateData
from .eval_case import Invocation
from .eval_case import InvocationEvent
from .eval_case import InvocationEvents
Expand Down Expand Up @@ -325,6 +328,78 @@ def convert_events_to_eval_invocations(
)

return invocations

@staticmethod
async def _generate_inferences_from_app(
    app: 'App',
    user_simulator: 'UserSimulator',
    initial_session: Optional['SessionInput'],
    session_id: str,
    session_service: 'BaseSessionService',
    artifact_service: 'BaseArtifactService',
    memory_service: 'BaseMemoryService',
) -> list['Invocation']:
  """Generate inferences by invoking through App (preserving plugins).

  Unlike the root_agent-based path, this runs the agent via a Runner built
  from the App object, so any plugins configured on the App participate in
  inference.

  Args:
    app: The App (agent plus plugins) to evaluate.
    user_simulator: Supplies the next user message for each turn.
    initial_session: Optional seed session; provides user_id, app_name and
      initial state when present.
    session_id: Id for the session created for this eval case.
    session_service: Backing store for sessions.
    artifact_service: Backing store for artifacts.
    memory_service: Backing store for memory.

  Returns:
    The invocations reconstructed from the events produced during the run.
  """

  # Determine user_id consistently; falls back to a fixed test id when no
  # initial session is supplied.
  user_id = initial_session.user_id if initial_session else 'test_user_id'

  # Initialize session (always, not just when initial_session provided).
  app_name = initial_session.app_name if initial_session else app.name
  await session_service.create_session(
      app_name=app_name,
      user_id=user_id,
      session_id=session_id,
      state=initial_session.state if initial_session else {},
  )

  # Eval-only plugins: the intercepter records model requests (needed to
  # build app_details later); the retry plugin ensures retry options exist.
  request_intercepter_plugin = _RequestIntercepterPlugin(
      name="request_intercepter_plugin"
  )
  ensure_retry_options_plugin = EnsureRetryOptionsPlugin(
      name="ensure_retry_options"
  )

  # Deep-copy the app so the caller's object is not mutated, then append
  # the eval-specific plugins after the user-configured ones.
  app_for_runner = app.model_copy(deep=True)
  app_for_runner.plugins.extend([request_intercepter_plugin, ensure_retry_options_plugin])

  # Create Runner from the modified App so its plugins run during inference.
  async with Runner(
      app=app_for_runner,
      session_service=session_service,
      artifact_service=artifact_service,
      memory_service=memory_service,
  ) as runner:
    events = []

    # Pull messages from the simulator until it reports no more turns
    # (handles both static scripts and dynamic simulators). The deep copy
    # of events keeps the simulator from mutating the shared history.
    while True:
      next_user_message = await user_simulator.get_next_user_message(
          copy.deepcopy(events)
      )
      if next_user_message.status == UserSimulatorStatus.SUCCESS:
        async for event in EvaluationGenerator._generate_inferences_for_single_user_invocation(
            runner, user_id, session_id, next_user_message.user_message
        ):
          events.append(event)
      else:  # no more messages
        break

    # Map intercepted model requests back to their invocation ids.
    app_details_by_invocation_id = (
        EvaluationGenerator._get_app_details_by_invocation_id(
            events, request_intercepter_plugin
        )
    )

    # Convert the raw event stream into eval Invocation objects.
    return EvaluationGenerator.convert_events_to_eval_invocations(
        events, app_details_by_invocation_id
    )
Comment thread
ishanrajsingh marked this conversation as resolved.
Outdated



@staticmethod
def _get_app_details_by_invocation_id(
Expand Down Expand Up @@ -413,3 +488,4 @@ def _process_query_with_session(session_data, data):
responses[index]["actual_tool_use"] = actual_tool_uses
responses[index]["response"] = response
return responses

63 changes: 50 additions & 13 deletions src/google/adk/evaluation/local_eval_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@
from typing import AsyncGenerator
from typing import Callable
from typing import Optional
from typing import TYPE_CHECKING

if TYPE_CHECKING:
from ..apps.app import App
import uuid

from typing_extensions import override
Expand All @@ -41,6 +45,7 @@
from .base_eval_service import InferenceResult
from .base_eval_service import InferenceStatus
from .eval_case import Invocation
from .eval_case import SessionInput
from .eval_metrics import EvalMetric
from .eval_metrics import EvalMetricResult
from .eval_metrics import EvalMetricResultDetails
Expand Down Expand Up @@ -79,11 +84,13 @@ def __init__(
artifact_service: Optional[BaseArtifactService] = None,
eval_set_results_manager: Optional[EvalSetResultsManager] = None,
session_id_supplier: Callable[[], str] = _get_session_id,
app: Optional['App'] = None,
user_simulator_provider: UserSimulatorProvider = UserSimulatorProvider(),
memory_service: Optional[BaseMemoryService] = None,
):
self._root_agent = root_agent
self._eval_sets_manager = eval_sets_manager
self._app = app
metric_evaluator_registry = (
metric_evaluator_registry or DEFAULT_METRIC_EVALUATOR_REGISTRY
)
Expand Down Expand Up @@ -186,8 +193,8 @@ async def run_evaluation(inference_result):
yield eval_case_result

async def _evaluate_single_inference_result(
self, inference_result: InferenceResult, evaluate_config: EvaluateConfig
) -> tuple[InferenceResult, EvalCaseResult]:
self, inference_result: InferenceResult, evaluate_config: EvaluateConfig
) -> tuple[InferenceResult, EvalCaseResult]:
"""Returns the inference result and its corresponding EvalCaseResult.

A single inference result can have multiple invocations. For each
Expand All @@ -196,6 +203,24 @@ async def _evaluate_single_inference_result(
The EvalCaseResult contains scores for each metric per invocation and the
overall score.
"""
# Handle failed inferences early - skip evaluation
if (
inference_result.status == InferenceStatus.FAILURE
or inference_result.inferences is None
):
eval_case_result = EvalCaseResult(
eval_set_file=inference_result.eval_set_id,
eval_set_id=inference_result.eval_set_id,
eval_id=inference_result.eval_case_id,
final_eval_status=EvalStatus.NOT_EVALUATED,
overall_eval_metric_results=[],
eval_metric_result_per_invocation=[],
session_id=inference_result.session_id,
session_details=None,
user_id='test_user_id',
)
return (inference_result, eval_case_result)
Comment thread
ishanrajsingh marked this conversation as resolved.

eval_case = self._eval_sets_manager.get_eval_case(
app_name=inference_result.app_name,
eval_set_id=inference_result.eval_set_id,
Expand Down Expand Up @@ -406,25 +431,37 @@ async def _perform_inference_single_eval_item(
)

try:
# Use App if available (so plugins like ReflectAndRetryToolPlugin run)
with client_label_context(EVAL_CLIENT_LABEL):
inferences = (
await EvaluationGenerator._generate_inferences_from_root_agent(
# Extract common arguments to reduce duplication
common_args = {
"user_simulator": self._user_simulator_provider.provide(eval_case),
"initial_session": initial_session,
"session_id": session_id,
"session_service": self._session_service,
"artifact_service": self._artifact_service,
"memory_service": self._memory_service,
}

if self._app is not None:
inferences = await EvaluationGenerator._generate_inferences_from_app(
app=self._app,
**common_args
)
else:
# Fallback to direct root_agent usage (existing behavior)
inferences = await EvaluationGenerator._generate_inferences_from_root_agent(
root_agent=root_agent,
user_simulator=self._user_simulator_provider.provide(eval_case),
initial_session=initial_session,
session_id=session_id,
session_service=self._session_service,
artifact_service=self._artifact_service,
memory_service=self._memory_service,
**common_args
)
)

inference_result.inferences = inferences
inference_result.status = InferenceStatus.SUCCESS

return inference_result

except Exception as e:
# We intentionally catch the Exception as we don't failures to affect
# We intentionally catch the Exception as we don't want failures to affect
# other inferences.
logger.error(
'Inference failed for eval case `%s` with error %s.',
Expand All @@ -434,4 +471,4 @@ async def _perform_inference_single_eval_item(
)
inference_result.status = InferenceStatus.FAILURE
inference_result.error_message = str(e)
return inference_result
return inference_result