From ba808891d90333ad376b421926620c1706e5aae3 Mon Sep 17 00:00:00 2001 From: Haomiao Shi Date: Mon, 15 Jun 2026 10:39:26 -0700 Subject: [PATCH 01/10] Add DeepEvalHandler integration with unit tests Introduces a new integrations/deepeval/ module that adapts AgentCore Lambda evaluation events into DeepEval LLMTestCase objects, runs any BaseMetric, and returns structured score/label/explanation responses. --- .../integrations/deepeval/__init__.py | 5 + .../integrations/deepeval/handler.py | 88 +++++ .../integrations/deepeval/input_mapper.py | 191 ++++++++++ .../integrations/deepeval/__init__.py | 0 .../integrations/deepeval/test_handler.py | 230 ++++++++++++ .../deepeval/test_input_mapper.py | 331 ++++++++++++++++++ 6 files changed, 845 insertions(+) create mode 100644 src/bedrock_agentcore/evaluation/integrations/deepeval/__init__.py create mode 100644 src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py create mode 100644 src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py create mode 100644 tests/bedrock_agentcore/evaluation/integrations/deepeval/__init__.py create mode 100644 tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py create mode 100644 tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py diff --git a/src/bedrock_agentcore/evaluation/integrations/deepeval/__init__.py b/src/bedrock_agentcore/evaluation/integrations/deepeval/__init__.py new file mode 100644 index 00000000..76f6461f --- /dev/null +++ b/src/bedrock_agentcore/evaluation/integrations/deepeval/__init__.py @@ -0,0 +1,5 @@ +"""DeepEval integration for AgentCore Evaluation.""" + +from bedrock_agentcore.evaluation.integrations.deepeval.handler import DeepEvalHandler + +__all__ = ["DeepEvalHandler"] diff --git a/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py b/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py new file mode 100644 index 00000000..b339b883 --- /dev/null +++ b/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py @@ -0,0 +1,88 @@ +"""DeepEval handler that adapts AgentCore Lambda evaluation events to DeepEval metrics.""" + +import logging +from typing import Any, Callable, Dict, Optional + +from deepeval.metrics import BaseMetric + +from bedrock_agentcore.evaluation.integrations.deepeval.input_mapper import ( + ParsedEvaluationEvent, + build_test_case, +) + +logger = logging.getLogger(__name__) + + +class DeepEvalHandler: + """Lambda handler that runs a DeepEval metric against AgentCore evaluation events. + + Never raises unhandled exceptions — always returns a valid response dict. + + Example:: + + from deepeval.metrics import AnswerRelevancyMetric + + metric = AnswerRelevancyMetric(threshold=0.7) + handler = DeepEvalHandler(metric=metric) + + # Use as Lambda handler + def lambda_handler(event, context): + return handler(event, context) + """ + + def __init__( + self, + metric: BaseMetric, + field_mapper: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None, + ): + """Initialize the handler. + + Args: + metric: A DeepEval BaseMetric instance (e.g. AnswerRelevancyMetric). + field_mapper: Optional callable that receives the raw Lambda event and + returns a dict of LLMTestCase field values. Bypasses default span + extraction when provided. + """ + self.metric = metric + self.field_mapper = field_mapper + + def __call__(self, event: Dict[str, Any], context: Any = None) -> Dict[str, Any]: + """Handle a Lambda invocation. + + Args: + event: Raw Lambda event dict from the evaluation service. + context: Lambda context object (unused). + + Returns: + Success: {"value": float, "label": str, "explanation": str} + Error: {"errorCode": str, "errorMessage": str} + """ + try: + parsed = ParsedEvaluationEvent.from_lambda_event(event) + except (KeyError, IndexError, TypeError) as e: + logger.error("Failed to parse evaluation event: %s", e) + return _error_response("INVALID_EVENT", f"Failed to parse evaluation event: {e}") + + try: + test_case = build_test_case(parsed, self.metric, self.field_mapper) + except ValueError as e: + logger.error("Missing required fields: %s", e) + return _error_response("MISSING_REQUIRED_FIELD", str(e)) + + try: + self.metric.measure(test_case) + except Exception as e: + logger.error("Metric measurement failed: %s", e, exc_info=True) + return _error_response("METRIC_ERROR", f"{type(self.metric).__name__} failed: {e}") + + score = self.metric.score + reason = getattr(self.metric, "reason", None) or "" + threshold = getattr(self.metric, "threshold", 0.5) + label = "Pass" if score is not None and score >= threshold else "Fail" + + return {"value": score, "label": label, "explanation": reason} + + +def _error_response(code: str, message: str) -> Dict[str, str]: + """Build a standardized error response dict.""" + return {"errorCode": code, "errorMessage": message} diff --git a/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py b/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py new file mode 100644 index 00000000..50873cf5 --- /dev/null +++ b/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py @@ -0,0 +1,191 @@ +"""Map AgentCore Lambda evaluation events to DeepEval LLMTestCase objects.""" + +import logging +from dataclasses import dataclass, field +from typing import Any, Callable, Dict, List, Optional + +from deepeval.metrics import BaseMetric +from deepeval.test_case import LLMTestCase, LLMTestCaseParams + +logger = logging.getLogger(__name__) + +_PARAM_TO_FIELD: Dict[LLMTestCaseParams, str] = { + LLMTestCaseParams.INPUT: "input", + LLMTestCaseParams.ACTUAL_OUTPUT: "actual_output", + LLMTestCaseParams.EXPECTED_OUTPUT: "expected_output", + LLMTestCaseParams.CONTEXT: "context", + LLMTestCaseParams.RETRIEVAL_CONTEXT: "retrieval_context", +} + +_METRIC_REQUIRED_PARAMS: Dict[str, List[str]] = { + "AnswerRelevancyMetric": ["input", "actual_output"], + "FaithfulnessMetric": ["input", "actual_output", "retrieval_context"], + "ContextualRelevancyMetric": ["input", "actual_output", "retrieval_context"], + "ContextualPrecisionMetric": ["input", "actual_output", "expected_output", "retrieval_context"], + "ContextualRecallMetric": ["input", "actual_output", "expected_output", "retrieval_context"], + "HallucinationMetric": ["input", "actual_output", "context"], + "BiasMetric": ["input", "actual_output"], + "ToxicityMetric": ["input", "actual_output"], + "GEval": ["input", "actual_output"], + "SummarizationMetric": ["input", "actual_output"], +} + + +@dataclass +class ParsedEvaluationEvent: + """Parsed representation of the AgentCore Lambda evaluation event.""" + + evaluation_level: str + session_spans: List[Dict[str, Any]] + target_trace_id: Optional[str] = None + target_span_id: Optional[str] = None + reference_inputs: List[Dict[str, Any]] = field(default_factory=list) + + @classmethod + def from_lambda_event(cls, event: Dict[str, Any]) -> "ParsedEvaluationEvent": + """Parse a raw Lambda event dict into a structured object. + + Args: + event: Raw Lambda event payload from the evaluation service. + + Returns: + ParsedEvaluationEvent with extracted fields. + + Raises: + KeyError: If required top-level fields are missing. + """ + evaluation_input = event["evaluationInput"] + target = event.get("evaluationTarget") or {} + trace_ids = target.get("traceIds") or [] + span_ids = target.get("spanIds") or [] + + return cls( + evaluation_level=event["evaluationLevel"], + session_spans=evaluation_input["sessionSpans"], + target_trace_id=trace_ids[0] if trace_ids else None, + target_span_id=span_ids[0] if span_ids else None, + reference_inputs=event.get("evaluationReferenceInputs") or [], + ) + + +def _get_required_params(metric: BaseMetric) -> List[str]: + """Determine which LLMTestCase fields a metric requires. + + Fallback chain: + 1. metric._required_params (DeepEval internal attribute) + 2. Static registry _METRIC_REQUIRED_PARAMS keyed by class name + 3. metric.evaluation_params (GEval special case) + 4. Default: ["input", "actual_output"] + """ + if hasattr(metric, "_required_params") and metric._required_params: + params = metric._required_params + return [_PARAM_TO_FIELD.get(p, str(p).lower()) for p in params] + + class_name = type(metric).__name__ + if class_name in _METRIC_REQUIRED_PARAMS: + return _METRIC_REQUIRED_PARAMS[class_name] + + if hasattr(metric, "evaluation_params") and metric.evaluation_params: + params = metric.evaluation_params + return [_PARAM_TO_FIELD.get(p, str(p).lower()) for p in params] + + return ["input", "actual_output"] + + +def _extract_fields_from_spans( + parsed: ParsedEvaluationEvent, +) -> Dict[str, Any]: + """Extract LLMTestCase fields from ADOT session spans. + + Bridges Session → LLMTestCase fields: + - input ← user messages (role=="user") + - actual_output ← assistant messages (role=="assistant") + - retrieval_context ← tool messages (role=="tool") + - expected_output ← evaluationReferenceInputs[0].expectedResponse + """ + user_messages: List[str] = [] + assistant_messages: List[str] = [] + tool_messages: List[str] = [] + + for span in parsed.session_spans: + attributes = span.get("attributes", {}) + role = attributes.get("gen_ai.message.role", "") + content = attributes.get("gen_ai.message.content", "") + + if not content: + content = attributes.get("gen_ai.completion", "") + + if role == "user" and content: + user_messages.append(content) + elif role == "assistant" and content: + assistant_messages.append(content) + elif role == "tool" and content: + tool_messages.append(content) + + fields: Dict[str, Any] = {} + + if user_messages: + fields["input"] = "\n".join(user_messages) + if assistant_messages: + fields["actual_output"] = "\n".join(assistant_messages) + if tool_messages: + fields["retrieval_context"] = tool_messages + + if parsed.reference_inputs: + expected = parsed.reference_inputs[0].get("expectedResponse") + if expected: + fields["expected_output"] = expected + + return fields + + +def build_test_case( + parsed: ParsedEvaluationEvent, + metric: BaseMetric, + field_mapper: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None, +) -> LLMTestCase: + """Build a DeepEval LLMTestCase from a parsed evaluation event. + + Args: + parsed: The parsed Lambda event. + metric: The DeepEval metric (used to determine required fields). + field_mapper: Optional callable that receives the raw Lambda event fields + and returns a dict of LLMTestCase field values. Bypasses default + span extraction when provided. + + Returns: + An LLMTestCase ready for metric.measure(). + + Raises: + ValueError: If required fields for the metric cannot be populated. + """ + if field_mapper is not None: + raw_event = { + "evaluationLevel": parsed.evaluation_level, + "evaluationInput": {"sessionSpans": parsed.session_spans}, + "evaluationTarget": { + "traceIds": [parsed.target_trace_id] if parsed.target_trace_id else [], + "spanIds": [parsed.target_span_id] if parsed.target_span_id else [], + }, + "evaluationReferenceInputs": parsed.reference_inputs, + } + fields = field_mapper(raw_event) + else: + fields = _extract_fields_from_spans(parsed) + + required = _get_required_params(metric) + missing = [f for f in required if f not in fields or not fields[f]] + if missing: + metric_name = type(metric).__name__ + raise ValueError( + f"Field(s) {missing} required by {metric_name} but not found in evaluation event. " + f"Provide a field_mapper or ensure spans contain the necessary data." + ) + + return LLMTestCase( + input=fields.get("input", ""), + actual_output=fields.get("actual_output", ""), + expected_output=fields.get("expected_output"), + context=fields.get("context"), + retrieval_context=fields.get("retrieval_context"), + ) diff --git a/tests/bedrock_agentcore/evaluation/integrations/deepeval/__init__.py b/tests/bedrock_agentcore/evaluation/integrations/deepeval/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py new file mode 100644 index 00000000..77988ab7 --- /dev/null +++ b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py @@ -0,0 +1,230 @@ +"""Tests for DeepEvalHandler.""" + +from unittest.mock import MagicMock, patch + +import pytest + +from bedrock_agentcore.evaluation.integrations.deepeval.handler import DeepEvalHandler + + +def _make_event( + level="TRACE", + trace_ids=None, + spans=None, + reference_inputs=None, +): + """Build a raw Lambda event dict for testing.""" + event = { + "schemaVersion": "1.0", + "evaluationLevel": level, + "evaluationInput": { + "sessionSpans": spans + or [ + { + "traceId": "abc123", + "spanId": "span1", + "attributes": { + "gen_ai.message.role": "user", + "gen_ai.message.content": "What is AI?", + }, + }, + { + "traceId": "abc123", + "spanId": "span2", + "attributes": { + "gen_ai.message.role": "assistant", + "gen_ai.message.content": "AI is artificial intelligence.", + }, + }, + ] + }, + "evaluationTarget": {}, + } + if trace_ids is not None: + event["evaluationTarget"]["traceIds"] = trace_ids + if reference_inputs is not None: + event["evaluationReferenceInputs"] = reference_inputs + return event + + +def _mock_metric(score=0.85, reason="Looks good", threshold=0.7, name="MockMetric"): + """Create a mock metric that returns a fixed score on measure().""" + metric = MagicMock() + type(metric).__name__ = name + metric.threshold = threshold + metric.score = score + metric.reason = reason + metric._required_params = None + del metric._required_params + del metric.evaluation_params + + def measure_side_effect(test_case): + metric.score = score + metric.reason = reason + + metric.measure = MagicMock(side_effect=measure_side_effect) + return metric + + +class TestDeepEvalHandlerSuccess: + def test_returns_pass_when_score_above_threshold(self): + metric = _mock_metric(score=0.9, threshold=0.7) + handler = DeepEvalHandler(metric=metric) + + result = handler(_make_event()) + + assert result["value"] == 0.9 + assert result["label"] == "Pass" + assert result["explanation"] == "Looks good" + + def test_returns_fail_when_score_below_threshold(self): + metric = _mock_metric(score=0.3, threshold=0.7) + handler = DeepEvalHandler(metric=metric) + + result = handler(_make_event()) + + assert result["value"] == 0.3 + assert result["label"] == "Fail" + + def test_returns_pass_at_exact_threshold(self): + metric = _mock_metric(score=0.7, threshold=0.7) + handler = DeepEvalHandler(metric=metric) + + result = handler(_make_event()) + + assert result["label"] == "Pass" + + def test_metric_measure_called_with_test_case(self): + metric = _mock_metric() + handler = DeepEvalHandler(metric=metric) + + handler(_make_event()) + + metric.measure.assert_called_once() + test_case = metric.measure.call_args[0][0] + assert test_case.input == "What is AI?" + assert test_case.actual_output == "AI is artificial intelligence." + + def test_context_parameter_ignored(self): + metric = _mock_metric() + handler = DeepEvalHandler(metric=metric) + mock_context = {"function_name": "my-lambda"} + + result = handler(_make_event(), mock_context) + + assert result["value"] == 0.85 + + def test_custom_field_mapper(self): + metric = _mock_metric() + handler = DeepEvalHandler( + metric=metric, + field_mapper=lambda event: { + "input": "mapped input", + "actual_output": "mapped output", + }, + ) + + result = handler(_make_event()) + + assert result["value"] == 0.85 + test_case = metric.measure.call_args[0][0] + assert test_case.input == "mapped input" + assert test_case.actual_output == "mapped output" + + +class TestDeepEvalHandlerErrors: + def test_invalid_event_returns_error(self): + metric = _mock_metric() + handler = DeepEvalHandler(metric=metric) + + result = handler({}) + + assert result["errorCode"] == "INVALID_EVENT" + assert "errorMessage" in result + assert "value" not in result + + def test_missing_evaluation_input_returns_error(self): + metric = _mock_metric() + handler = DeepEvalHandler(metric=metric) + + event = {"evaluationLevel": "TRACE", "evaluationTarget": {}} + result = handler(event) + + assert result["errorCode"] == "INVALID_EVENT" + + def test_missing_required_field_returns_error(self): + spans = [ + { + "traceId": "t1", + "spanId": "s1", + "attributes": {"gen_ai.message.role": "user", "gen_ai.message.content": "q"}, + }, + { + "traceId": "t1", + "spanId": "s2", + "attributes": {"gen_ai.message.role": "assistant", "gen_ai.message.content": "a"}, + }, + ] + metric = _mock_metric(name="FaithfulnessMetric") + handler = DeepEvalHandler(metric=metric) + + event = _make_event(spans=spans) + result = handler(event) + + assert result["errorCode"] == "MISSING_REQUIRED_FIELD" + assert "retrieval_context" in result["errorMessage"] + + def test_metric_measure_exception_returns_error(self): + metric = _mock_metric() + metric.measure = MagicMock(side_effect=RuntimeError("LLM timeout")) + handler = DeepEvalHandler(metric=metric) + + result = handler(_make_event()) + + assert result["errorCode"] == "METRIC_ERROR" + assert "LLM timeout" in result["errorMessage"] + + def test_never_raises_on_any_input(self): + metric = _mock_metric() + handler = DeepEvalHandler(metric=metric) + + for bad_input in [None, [], "string", 42, {"random": "keys"}]: + result = handler(bad_input) + assert "errorCode" in result or "value" in result + + +class TestDeepEvalHandlerEdgeCases: + def test_metric_with_no_reason(self): + metric = _mock_metric(score=0.8, reason=None) + handler = DeepEvalHandler(metric=metric) + + result = handler(_make_event()) + + assert result["explanation"] == "" + + def test_metric_score_zero(self): + metric = _mock_metric(score=0.0, threshold=0.5) + handler = DeepEvalHandler(metric=metric) + + result = handler(_make_event()) + + assert result["value"] == 0.0 + assert result["label"] == "Fail" + + def test_metric_score_one(self): + metric = _mock_metric(score=1.0, threshold=0.5) + handler = DeepEvalHandler(metric=metric) + + result = handler(_make_event()) + + assert result["value"] == 1.0 + assert result["label"] == "Pass" + + def test_default_threshold_when_missing(self): + metric = _mock_metric(score=0.6) + del metric.threshold + handler = DeepEvalHandler(metric=metric) + + result = handler(_make_event()) + + assert result["label"] == "Pass" diff --git a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py new file mode 100644 index 00000000..efab5459 --- /dev/null +++ b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py @@ -0,0 +1,331 @@ +"""Tests for deepeval input_mapper module.""" + +from unittest.mock import MagicMock + +import pytest +from deepeval.test_case import LLMTestCaseParams + +from bedrock_agentcore.evaluation.integrations.deepeval.input_mapper import ( + ParsedEvaluationEvent, + _get_required_params, + build_test_case, +) + + +def _make_event( + level="TRACE", + trace_ids=None, + span_ids=None, + spans=None, + reference_inputs=None, +): + """Build a raw Lambda event dict for testing.""" + event = { + "schemaVersion": "1.0", + "evaluationLevel": level, + "evaluationInput": { + "sessionSpans": spans + or [ + { + "traceId": "abc123", + "spanId": "span1", + "attributes": { + "gen_ai.message.role": "user", + "gen_ai.message.content": "What is the capital of France?", + }, + }, + { + "traceId": "abc123", + "spanId": "span2", + "attributes": { + "gen_ai.message.role": "assistant", + "gen_ai.message.content": "The capital of France is Paris.", + }, + }, + ] + }, + "evaluationTarget": {}, + } + if trace_ids is not None: + event["evaluationTarget"]["traceIds"] = trace_ids + if span_ids is not None: + event["evaluationTarget"]["spanIds"] = span_ids + if reference_inputs is not None: + event["evaluationReferenceInputs"] = reference_inputs + return event + + +def _mock_metric(name="MockMetric", required_params=None, evaluation_params=None, threshold=0.5): + """Create a mock DeepEval metric.""" + metric = MagicMock() + type(metric).__name__ = name + metric.threshold = threshold + + if required_params is not None: + metric._required_params = required_params + else: + del metric._required_params + + if evaluation_params is not None: + metric.evaluation_params = evaluation_params + else: + del metric.evaluation_params + + return metric + + +class TestParsedEvaluationEvent: + def test_from_lambda_event_trace_level(self): + event = _make_event(level="TRACE", trace_ids=["trace-1"]) + parsed = ParsedEvaluationEvent.from_lambda_event(event) + + assert parsed.evaluation_level == "TRACE" + assert parsed.target_trace_id == "trace-1" + assert parsed.target_span_id is None + assert len(parsed.session_spans) == 2 + + def test_from_lambda_event_tool_call_level(self): + event = _make_event(level="TOOL_CALL", span_ids=["span-42"]) + parsed = ParsedEvaluationEvent.from_lambda_event(event) + + assert parsed.evaluation_level == "TOOL_CALL" + assert parsed.target_span_id == "span-42" + assert parsed.target_trace_id is None + + def test_from_lambda_event_session_level(self): + event = _make_event(level="SESSION") + parsed = ParsedEvaluationEvent.from_lambda_event(event) + + assert parsed.evaluation_level == "SESSION" + assert parsed.target_trace_id is None + assert parsed.target_span_id is None + + def test_from_lambda_event_with_reference_inputs(self): + refs = [{"expectedResponse": "Paris is the capital of France."}] + event = _make_event(reference_inputs=refs) + parsed = ParsedEvaluationEvent.from_lambda_event(event) + + assert parsed.reference_inputs == refs + + def test_from_lambda_event_missing_reference_inputs(self): + event = _make_event() + parsed = ParsedEvaluationEvent.from_lambda_event(event) + + assert parsed.reference_inputs == [] + + def test_from_lambda_event_missing_evaluation_level_raises(self): + event = _make_event() + del event["evaluationLevel"] + + with pytest.raises(KeyError): + ParsedEvaluationEvent.from_lambda_event(event) + + def test_from_lambda_event_missing_evaluation_input_raises(self): + event = _make_event() + del event["evaluationInput"] + + with pytest.raises(KeyError): + ParsedEvaluationEvent.from_lambda_event(event) + + def test_from_lambda_event_missing_target_key_defaults(self): + event = _make_event() + del event["evaluationTarget"] + parsed = ParsedEvaluationEvent.from_lambda_event(event) + + assert parsed.target_trace_id is None + assert parsed.target_span_id is None + + +class TestGetRequiredParams: + def test_uses_required_params_attribute(self): + metric = _mock_metric( + required_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT] + ) + result = _get_required_params(metric) + + assert result == ["input", "actual_output"] + + def test_falls_back_to_static_registry(self): + metric = _mock_metric(name="FaithfulnessMetric") + result = _get_required_params(metric) + + assert result == ["input", "actual_output", "retrieval_context"] + + def test_falls_back_to_evaluation_params(self): + metric = _mock_metric( + name="UnknownMetric", + evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.RETRIEVAL_CONTEXT], + ) + result = _get_required_params(metric) + + assert result == ["input", "retrieval_context"] + + def test_defaults_to_input_and_actual_output(self): + metric = _mock_metric(name="UnknownMetric") + result = _get_required_params(metric) + + assert result == ["input", "actual_output"] + + def test_empty_required_params_falls_through(self): + metric = _mock_metric(name="UnknownMetric", required_params=[]) + result = _get_required_params(metric) + + assert result == ["input", "actual_output"] + + +class TestBuildTestCase: + def test_basic_span_extraction(self): + event = _make_event() + parsed = ParsedEvaluationEvent.from_lambda_event(event) + metric = _mock_metric(name="AnswerRelevancyMetric") + + test_case = build_test_case(parsed, metric) + + assert test_case.input == "What is the capital of France?" + assert test_case.actual_output == "The capital of France is Paris." + + def test_retrieval_context_from_tool_spans(self): + spans = [ + { + "traceId": "t1", + "spanId": "s1", + "attributes": {"gen_ai.message.role": "user", "gen_ai.message.content": "query"}, + }, + { + "traceId": "t1", + "spanId": "s2", + "attributes": {"gen_ai.message.role": "tool", "gen_ai.message.content": "doc chunk 1"}, + }, + { + "traceId": "t1", + "spanId": "s3", + "attributes": {"gen_ai.message.role": "tool", "gen_ai.message.content": "doc chunk 2"}, + }, + { + "traceId": "t1", + "spanId": "s4", + "attributes": {"gen_ai.message.role": "assistant", "gen_ai.message.content": "answer"}, + }, + ] + event = _make_event(spans=spans) + parsed = ParsedEvaluationEvent.from_lambda_event(event) + metric = _mock_metric(name="FaithfulnessMetric") + + test_case = build_test_case(parsed, metric) + + assert test_case.input == "query" + assert test_case.actual_output == "answer" + assert test_case.retrieval_context == ["doc chunk 1", "doc chunk 2"] + + def test_expected_output_from_reference_inputs(self): + refs = [{"expectedResponse": "Paris"}] + event = _make_event(reference_inputs=refs) + parsed = ParsedEvaluationEvent.from_lambda_event(event) + metric = _mock_metric(name="AnswerRelevancyMetric") + + test_case = build_test_case(parsed, metric) + + assert test_case.expected_output == "Paris" + + def test_missing_required_field_raises_value_error(self): + spans = [ + { + "traceId": "t1", + "spanId": "s1", + "attributes": {"gen_ai.message.role": "user", "gen_ai.message.content": "query"}, + }, + { + "traceId": "t1", + "spanId": "s2", + "attributes": {"gen_ai.message.role": "assistant", "gen_ai.message.content": "answer"}, + }, + ] + event = _make_event(spans=spans) + parsed = ParsedEvaluationEvent.from_lambda_event(event) + metric = _mock_metric(name="FaithfulnessMetric") + + with pytest.raises(ValueError, match="retrieval_context"): + build_test_case(parsed, metric) + + def test_custom_field_mapper_bypasses_extraction(self): + event = _make_event() + parsed = ParsedEvaluationEvent.from_lambda_event(event) + metric = _mock_metric(name="AnswerRelevancyMetric") + + def custom_mapper(raw_event): + return { + "input": "custom input", + "actual_output": "custom output", + } + + test_case = build_test_case(parsed, metric, field_mapper=custom_mapper) + + assert test_case.input == "custom input" + assert test_case.actual_output == "custom output" + + def test_field_mapper_receives_reconstructed_event(self): + refs = [{"expectedResponse": "expected"}] + event = _make_event(level="TRACE", trace_ids=["t1"], reference_inputs=refs) + parsed = ParsedEvaluationEvent.from_lambda_event(event) + metric = _mock_metric(name="AnswerRelevancyMetric") + + received_events = [] + + def capture_mapper(raw_event): + received_events.append(raw_event) + return {"input": "x", "actual_output": "y"} + + build_test_case(parsed, metric, field_mapper=capture_mapper) + + raw = received_events[0] + assert raw["evaluationLevel"] == "TRACE" + assert raw["evaluationTarget"]["traceIds"] == ["t1"] + assert raw["evaluationReferenceInputs"] == refs + + def test_multiple_user_messages_concatenated(self): + spans = [ + { + "traceId": "t1", + "spanId": "s1", + "attributes": {"gen_ai.message.role": "user", "gen_ai.message.content": "hello"}, + }, + { + "traceId": "t1", + "spanId": "s2", + "attributes": {"gen_ai.message.role": "user", "gen_ai.message.content": "world"}, + }, + { + "traceId": "t1", + "spanId": "s3", + "attributes": {"gen_ai.message.role": "assistant", "gen_ai.message.content": "hi"}, + }, + ] + event = _make_event(spans=spans) + parsed = ParsedEvaluationEvent.from_lambda_event(event) + metric = _mock_metric(name="AnswerRelevancyMetric") + + test_case = build_test_case(parsed, metric) + + assert test_case.input == "hello\nworld" + + def test_gen_ai_completion_fallback(self): + spans = [ + { + "traceId": "t1", + "spanId": "s1", + "attributes": {"gen_ai.message.role": "user", "gen_ai.completion": "fallback input"}, + }, + { + "traceId": "t1", + "spanId": "s2", + "attributes": {"gen_ai.message.role": "assistant", "gen_ai.completion": "fallback output"}, + }, + ] + event = _make_event(spans=spans) + parsed = ParsedEvaluationEvent.from_lambda_event(event) + metric = _mock_metric(name="AnswerRelevancyMetric") + + test_case = build_test_case(parsed, metric) + + assert test_case.input == "fallback input" + assert test_case.actual_output == "fallback output" From b0d9682c7258b584c55b1664fa4b838817ce370b Mon Sep 17 00:00:00 2001 From: Haomiao Shi Date: Mon, 15 Jun 2026 11:38:57 -0700 Subject: [PATCH 02/10] Fix span extraction to use real AgentCore _eval_log_records structure --- .../integrations/deepeval/input_mapper.py | 94 +++- .../integrations/deepeval/test_handler.py | 57 +-- .../deepeval/test_input_mapper.py | 402 ++++++++++++++---- 3 files changed, 415 insertions(+), 138 deletions(-) diff --git a/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py b/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py index 50873cf5..cd67845f 100644 --- a/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py +++ b/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py @@ -1,5 +1,6 @@ """Map AgentCore Lambda evaluation events to DeepEval LLMTestCase objects.""" +import json import logging from dataclasses import dataclass, field from typing import Any, Callable, Dict, List, Optional @@ -92,15 +93,36 @@ def _get_required_params(metric: BaseMetric) -> List[str]: return ["input", "actual_output"] +def _get_message_content(message: Any) -> str: + """Extract text content from a message object. + + Message content can be a dict with a "content" or "message" key, or a plain string. + Handles one level of nesting (e.g. {"content": {"content": "text"}}). + """ + if isinstance(message, str): + return message + if isinstance(message, dict): + for key in ("content", "message"): + if key in message: + val = message[key] + if isinstance(val, str): + return val + if isinstance(val, dict): + return _get_message_content(val) + return str(val) + return "" + + def _extract_fields_from_spans( parsed: ParsedEvaluationEvent, ) -> Dict[str, Any]: - """Extract LLMTestCase fields from ADOT session spans. + """Extract LLMTestCase fields from AgentCore session spans. - Bridges Session → LLMTestCase fields: - - input ← user messages (role=="user") - - actual_output ← assistant messages (role=="assistant") - - retrieval_context ← tool messages (role=="tool") + Parses _eval_log_records from span attributes, filters by target_trace_id, + and extracts messages by role: + - input ← input messages where role=="user" + - actual_output ← output messages where role=="assistant" + - retrieval_context ← output messages where role=="tool" - expected_output ← evaluationReferenceInputs[0].expectedResponse """ user_messages: List[str] = [] @@ -109,18 +131,56 @@ def _extract_fields_from_spans( for span in parsed.session_spans: attributes = span.get("attributes", {}) - role = attributes.get("gen_ai.message.role", "") - content = attributes.get("gen_ai.message.content", "") - - if not content: - content = attributes.get("gen_ai.completion", "") - - if role == "user" and content: - user_messages.append(content) - elif role == "assistant" and content: - assistant_messages.append(content) - elif role == "tool" and content: - tool_messages.append(content) + log_records_raw = attributes.get("_eval_log_records") + if not log_records_raw: + continue + + if isinstance(log_records_raw, str): + try: + log_records = json.loads(log_records_raw) + except (json.JSONDecodeError, TypeError): + logger.debug("Failed to parse _eval_log_records as JSON") + continue + else: + log_records = log_records_raw + + if not isinstance(log_records, list): + continue + + for record in log_records: + if not isinstance(record, dict): + continue + + if parsed.target_trace_id: + record_trace_id = record.get("traceId") or record.get("trace_id") + if record_trace_id and record_trace_id != parsed.target_trace_id: + continue + + body = record.get("body", {}) + if not isinstance(body, dict): + continue + + input_data = body.get("input", {}) + if isinstance(input_data, dict): + for msg in input_data.get("messages", []): + if not isinstance(msg, dict): + continue + role = msg.get("role", "") + content = _get_message_content(msg) + if role == "user" and content: + user_messages.append(content) + + output_data = body.get("output", {}) + if isinstance(output_data, dict): + for msg in output_data.get("messages", []): + if not isinstance(msg, dict): + continue + role = msg.get("role", "") + content = _get_message_content(msg) + if role == "assistant" and content: + assistant_messages.append(content) + elif role == "tool" and content: + tool_messages.append(content) fields: Dict[str, Any] = {} diff --git a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py index 77988ab7..c3fa98ae 100644 --- a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py +++ b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py @@ -1,5 +1,6 @@ """Tests for DeepEvalHandler.""" +import json from unittest.mock import MagicMock, patch import pytest @@ -14,30 +15,27 @@ def _make_event( reference_inputs=None, ): """Build a raw Lambda event dict for testing.""" + if spans is None: + log_records = [ + { + "body": { + "input": {"messages": [{"role": "user", "content": "What is AI?"}]}, + "output": {"messages": [{"role": "assistant", "content": "AI is artificial intelligence."}]}, + } + } + ] + spans = [ + { + "traceId": "abc123", + "spanId": "span1", + "attributes": {"_eval_log_records": json.dumps(log_records)}, + } + ] + event = { "schemaVersion": "1.0", "evaluationLevel": level, - "evaluationInput": { - "sessionSpans": spans - or [ - { - "traceId": "abc123", - "spanId": "span1", - "attributes": { - "gen_ai.message.role": "user", - "gen_ai.message.content": "What is AI?", - }, - }, - { - "traceId": "abc123", - "spanId": "span2", - "attributes": { - "gen_ai.message.role": "assistant", - "gen_ai.message.content": "AI is artificial intelligence.", - }, - }, - ] - }, + "evaluationInput": {"sessionSpans": spans}, "evaluationTarget": {}, } if trace_ids is not None: @@ -153,17 +151,20 @@ def test_missing_evaluation_input_returns_error(self): assert result["errorCode"] == "INVALID_EVENT" def test_missing_required_field_returns_error(self): + log_records = [ + { + "body": { + "input": {"messages": [{"role": "user", "content": "q"}]}, + "output": {"messages": [{"role": "assistant", "content": "a"}]}, + } + } + ] spans = [ { "traceId": "t1", "spanId": "s1", - "attributes": {"gen_ai.message.role": "user", "gen_ai.message.content": "q"}, - }, - { - "traceId": "t1", - "spanId": "s2", - "attributes": {"gen_ai.message.role": "assistant", "gen_ai.message.content": "a"}, - }, + "attributes": {"_eval_log_records": json.dumps(log_records)}, + } ] metric = _mock_metric(name="FaithfulnessMetric") handler = DeepEvalHandler(metric=metric) diff --git a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py index efab5459..67447f48 100644 --- a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py +++ b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py @@ -1,5 +1,6 @@ """Tests for deepeval input_mapper module.""" +import json from unittest.mock import MagicMock import pytest @@ -7,11 +8,38 @@ from bedrock_agentcore.evaluation.integrations.deepeval.input_mapper import ( ParsedEvaluationEvent, + _extract_fields_from_spans, _get_required_params, build_test_case, ) +def _make_log_record( + input_messages=None, + output_messages=None, + trace_id=None, +): + """Build a single log record dict.""" + record = {"body": {}} + if input_messages is not None: + record["body"]["input"] = {"messages": input_messages} + if output_messages is not None: + record["body"]["output"] = {"messages": output_messages} + if trace_id is not None: + record["traceId"] = trace_id + return record + + +def _make_span_with_log_records(log_records, span_id="span1", as_json_string=True): + """Build a span dict with _eval_log_records in attributes.""" + value = json.dumps(log_records) if as_json_string else log_records + return { + "traceId": "abc123", + "spanId": span_id, + "attributes": {"_eval_log_records": value}, + } + + def _make_event( level="TRACE", trace_ids=None, @@ -20,30 +48,19 @@ def _make_event( reference_inputs=None, ): """Build a raw Lambda event dict for testing.""" + if spans is None: + log_records = [ + _make_log_record( + input_messages=[{"role": "user", "content": "What is the capital of France?"}], + output_messages=[{"role": "assistant", "content": "The capital of France is Paris."}], + ) + ] + spans = [_make_span_with_log_records(log_records)] + event = { "schemaVersion": "1.0", "evaluationLevel": level, - "evaluationInput": { - "sessionSpans": spans - or [ - { - "traceId": "abc123", - "spanId": "span1", - "attributes": { - "gen_ai.message.role": "user", - "gen_ai.message.content": "What is the capital of France?", - }, - }, - { - "traceId": "abc123", - "spanId": "span2", - "attributes": { - "gen_ai.message.role": "assistant", - "gen_ai.message.content": "The capital of France is Paris.", - }, - }, - ] - }, + "evaluationInput": {"sessionSpans": spans}, "evaluationTarget": {}, } if trace_ids is not None: @@ -82,7 +99,7 @@ def test_from_lambda_event_trace_level(self): assert parsed.evaluation_level == "TRACE" assert parsed.target_trace_id == "trace-1" assert parsed.target_span_id is None - assert len(parsed.session_spans) == 2 + assert len(parsed.session_spans) == 1 def test_from_lambda_event_tool_call_level(self): event = _make_event(level="TOOL_CALL", span_ids=["span-42"]) @@ -173,6 +190,250 @@ def test_empty_required_params_falls_through(self): assert result == ["input", "actual_output"] +class TestExtractFieldsFromSpans: + def test_basic_extraction(self): + log_records = [ + _make_log_record( + input_messages=[{"role": "user", "content": "hello"}], + output_messages=[{"role": "assistant", "content": "world"}], + ) + ] + spans = [_make_span_with_log_records(log_records)] + parsed = ParsedEvaluationEvent( + evaluation_level="TRACE", session_spans=spans + ) + + fields = _extract_fields_from_spans(parsed) + + assert fields["input"] == "hello" + assert fields["actual_output"] == "world" + + def test_tool_messages_become_retrieval_context(self): + log_records = [ + _make_log_record( + input_messages=[{"role": "user", "content": "query"}], + output_messages=[ + {"role": "tool", "content": "doc chunk 1"}, + {"role": "tool", "content": "doc chunk 2"}, + {"role": "assistant", "content": "answer"}, + ], + ) + ] + spans = [_make_span_with_log_records(log_records)] + parsed = ParsedEvaluationEvent( + evaluation_level="TRACE", session_spans=spans + ) + + fields = _extract_fields_from_spans(parsed) + + assert fields["retrieval_context"] == ["doc chunk 1", "doc chunk 2"] + assert fields["actual_output"] == "answer" + + def test_message_content_as_dict_with_content_key(self): + log_records = [ + _make_log_record( + input_messages=[{"role": "user", "content": {"content": "nested content"}}], + output_messages=[{"role": "assistant", "content": {"content": "nested output"}}], + ) + ] + spans = [_make_span_with_log_records(log_records)] + parsed = ParsedEvaluationEvent( + evaluation_level="TRACE", session_spans=spans + ) + + fields = _extract_fields_from_spans(parsed) + + assert fields["input"] == "nested content" + assert fields["actual_output"] == "nested output" + + def test_message_content_as_dict_with_message_key(self): + log_records = [ + _make_log_record( + input_messages=[{"role": "user", "message": "msg key input"}], + output_messages=[{"role": "assistant", "message": "msg key output"}], + ) + ] + spans = [_make_span_with_log_records(log_records)] + parsed = ParsedEvaluationEvent( + evaluation_level="TRACE", session_spans=spans + ) + + fields = _extract_fields_from_spans(parsed) + + assert fields["input"] == "msg key input" + assert fields["actual_output"] == "msg key output" + + def test_message_content_as_plain_string_in_content_field(self): + log_records = [ + _make_log_record( + input_messages=[{"role": "user", "content": "plain string"}], + output_messages=[{"role": "assistant", "content": "plain response"}], + ) + ] + spans = [_make_span_with_log_records(log_records)] + parsed = ParsedEvaluationEvent( + evaluation_level="TRACE", session_spans=spans + ) + + fields = _extract_fields_from_spans(parsed) + + assert fields["input"] == "plain string" + assert fields["actual_output"] == "plain response" + + def test_target_trace_id_filters_records(self): + log_records = [ + _make_log_record( + input_messages=[{"role": "user", "content": "relevant"}], + output_messages=[{"role": "assistant", "content": "relevant answer"}], + trace_id="target-trace", + ), + _make_log_record( + input_messages=[{"role": "user", "content": "irrelevant"}], + output_messages=[{"role": "assistant", "content": "irrelevant answer"}], + trace_id="other-trace", + ), + ] + spans = [_make_span_with_log_records(log_records)] + parsed = ParsedEvaluationEvent( + evaluation_level="TRACE", + session_spans=spans, + target_trace_id="target-trace", + ) + + fields = _extract_fields_from_spans(parsed) + + assert fields["input"] == "relevant" + assert fields["actual_output"] == "relevant answer" + + def test_no_target_trace_id_includes_all_records(self): + log_records = [ + _make_log_record( + input_messages=[{"role": "user", "content": "first"}], + output_messages=[{"role": "assistant", "content": "first answer"}], + trace_id="trace-1", + ), + _make_log_record( + input_messages=[{"role": "user", "content": "second"}], + output_messages=[{"role": "assistant", "content": "second answer"}], + trace_id="trace-2", + ), + ] + spans = [_make_span_with_log_records(log_records)] + parsed = ParsedEvaluationEvent( + evaluation_level="SESSION", session_spans=spans + ) + + fields = _extract_fields_from_spans(parsed) + + assert fields["input"] == "first\nsecond" + assert fields["actual_output"] == "first answer\nsecond answer" + + def test_log_records_as_parsed_list(self): + log_records = [ + _make_log_record( + input_messages=[{"role": "user", "content": "from list"}], + output_messages=[{"role": "assistant", "content": "from list answer"}], + ) + ] + spans = [_make_span_with_log_records(log_records, as_json_string=False)] + parsed = ParsedEvaluationEvent( + evaluation_level="TRACE", session_spans=spans + ) + + fields = _extract_fields_from_spans(parsed) + + assert fields["input"] == "from list" + assert fields["actual_output"] == "from list answer" + + def test_invalid_json_log_records_skipped(self): + spans = [ + { + "traceId": "t1", + "spanId": "s1", + "attributes": {"_eval_log_records": "not valid json{{{"}, + } + ] + parsed = ParsedEvaluationEvent( + evaluation_level="TRACE", session_spans=spans + ) + + fields = _extract_fields_from_spans(parsed) + + assert fields == {} + + def test_span_without_log_records_skipped(self): + spans = [{"traceId": "t1", "spanId": "s1", "attributes": {}}] + parsed = ParsedEvaluationEvent( + evaluation_level="TRACE", session_spans=spans + ) + + fields = _extract_fields_from_spans(parsed) + + assert fields == {} + + def test_multiple_spans_aggregated(self): + log_records_1 = [ + _make_log_record( + input_messages=[{"role": "user", "content": "q1"}], + output_messages=[{"role": "assistant", "content": "a1"}], + ) + ] + log_records_2 = [ + _make_log_record( + input_messages=[{"role": "user", "content": "q2"}], + output_messages=[{"role": "assistant", "content": "a2"}], + ) + ] + spans = [ + _make_span_with_log_records(log_records_1, span_id="s1"), + _make_span_with_log_records(log_records_2, span_id="s2"), + ] + parsed = ParsedEvaluationEvent( + evaluation_level="SESSION", session_spans=spans + ) + + fields = _extract_fields_from_spans(parsed) + + assert fields["input"] == "q1\nq2" + assert fields["actual_output"] == "a1\na2" + + def test_reference_inputs_expected_output(self): + log_records = [ + _make_log_record( + input_messages=[{"role": "user", "content": "q"}], + output_messages=[{"role": "assistant", "content": "a"}], + ) + ] + spans = [_make_span_with_log_records(log_records)] + parsed = ParsedEvaluationEvent( + evaluation_level="TRACE", + session_spans=spans, + reference_inputs=[{"expectedResponse": "expected answer"}], + ) + + fields = _extract_fields_from_spans(parsed) + + assert fields["expected_output"] == "expected answer" + + def test_record_without_matching_trace_id_key_included(self): + log_records = [ + _make_log_record( + input_messages=[{"role": "user", "content": "no trace id record"}], + output_messages=[{"role": "assistant", "content": "response"}], + ), + ] + spans = [_make_span_with_log_records(log_records)] + parsed = ParsedEvaluationEvent( + evaluation_level="TRACE", + session_spans=spans, + target_trace_id="target-trace", + ) + + fields = _extract_fields_from_spans(parsed) + + assert fields["input"] == "no trace id record" + + class TestBuildTestCase: def test_basic_span_extraction(self): event = _make_event() @@ -184,29 +445,18 @@ def test_basic_span_extraction(self): assert test_case.input == "What is the capital of France?" assert test_case.actual_output == "The capital of France is Paris." - def test_retrieval_context_from_tool_spans(self): - spans = [ - { - "traceId": "t1", - "spanId": "s1", - "attributes": {"gen_ai.message.role": "user", "gen_ai.message.content": "query"}, - }, - { - "traceId": "t1", - "spanId": "s2", - "attributes": {"gen_ai.message.role": "tool", "gen_ai.message.content": "doc chunk 1"}, - }, - { - "traceId": "t1", - "spanId": "s3", - "attributes": {"gen_ai.message.role": "tool", "gen_ai.message.content": "doc chunk 2"}, - }, - { - "traceId": "t1", - "spanId": "s4", - "attributes": {"gen_ai.message.role": "assistant", "gen_ai.message.content": "answer"}, - }, + def test_retrieval_context_from_tool_messages(self): + log_records = [ + _make_log_record( + input_messages=[{"role": "user", "content": "query"}], + output_messages=[ + {"role": "tool", "content": "doc chunk 1"}, + {"role": "tool", "content": "doc chunk 2"}, + {"role": "assistant", "content": "answer"}, + ], + ) ] + spans = [_make_span_with_log_records(log_records)] event = _make_event(spans=spans) parsed = ParsedEvaluationEvent.from_lambda_event(event) metric = _mock_metric(name="FaithfulnessMetric") @@ -228,18 +478,13 @@ def test_expected_output_from_reference_inputs(self): assert test_case.expected_output == "Paris" def test_missing_required_field_raises_value_error(self): - spans = [ - { - "traceId": "t1", - "spanId": "s1", - "attributes": {"gen_ai.message.role": "user", "gen_ai.message.content": "query"}, - }, - { - "traceId": "t1", - "spanId": "s2", - "attributes": {"gen_ai.message.role": "assistant", "gen_ai.message.content": "answer"}, - }, + log_records = [ + _make_log_record( + input_messages=[{"role": "user", "content": "query"}], + output_messages=[{"role": "assistant", "content": "answer"}], + ) ] + spans = [_make_span_with_log_records(log_records)] event = _make_event(spans=spans) parsed = ParsedEvaluationEvent.from_lambda_event(event) metric = _mock_metric(name="FaithfulnessMetric") @@ -283,23 +528,16 @@ def capture_mapper(raw_event): assert raw["evaluationReferenceInputs"] == refs def test_multiple_user_messages_concatenated(self): - spans = [ - { - "traceId": "t1", - "spanId": "s1", - "attributes": {"gen_ai.message.role": "user", "gen_ai.message.content": "hello"}, - }, - { - "traceId": "t1", - "spanId": "s2", - "attributes": {"gen_ai.message.role": "user", "gen_ai.message.content": "world"}, - }, - { - "traceId": "t1", - "spanId": "s3", - "attributes": {"gen_ai.message.role": "assistant", "gen_ai.message.content": "hi"}, - }, + log_records = [ + _make_log_record( + input_messages=[ + {"role": "user", "content": "hello"}, + {"role": "user", "content": "world"}, + ], + output_messages=[{"role": "assistant", "content": "hi"}], + ) ] + spans = [_make_span_with_log_records(log_records)] event = _make_event(spans=spans) parsed = ParsedEvaluationEvent.from_lambda_event(event) metric = _mock_metric(name="AnswerRelevancyMetric") @@ -307,25 +545,3 @@ def test_multiple_user_messages_concatenated(self): test_case = build_test_case(parsed, metric) assert test_case.input == "hello\nworld" - - def test_gen_ai_completion_fallback(self): - spans = [ - { - "traceId": "t1", - "spanId": "s1", - "attributes": {"gen_ai.message.role": "user", "gen_ai.completion": "fallback input"}, - }, - { - "traceId": "t1", - "spanId": "s2", - "attributes": {"gen_ai.message.role": "assistant", "gen_ai.completion": "fallback output"}, - }, - ] - event = _make_event(spans=spans) - parsed = ParsedEvaluationEvent.from_lambda_event(event) - metric = _mock_metric(name="AnswerRelevancyMetric") - - test_case = build_test_case(parsed, metric) - - assert test_case.input == "fallback input" - assert test_case.actual_output == "fallback output" From 81a46dd5ccb835a61fa314fe0f16c31608dd15b8 Mon Sep 17 00:00:00 2001 From: Haomiao Shi Date: Mon, 15 Jun 2026 12:14:10 -0700 Subject: [PATCH 03/10] Set context field from tool messages for HallucinationMetric support --- .../integrations/deepeval/input_mapper.py | 1 + .../deepeval/test_input_mapper.py | 20 +++++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py b/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py index cd67845f..39182636 100644 --- a/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py +++ b/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py @@ -190,6 +190,7 @@ def _extract_fields_from_spans( fields["actual_output"] = "\n".join(assistant_messages) if tool_messages: fields["retrieval_context"] = tool_messages + fields["context"] = tool_messages if parsed.reference_inputs: expected = parsed.reference_inputs[0].get("expectedResponse") diff --git a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py index 67447f48..ca661128 100644 --- a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py +++ b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py @@ -229,6 +229,26 @@ def test_tool_messages_become_retrieval_context(self): assert fields["retrieval_context"] == ["doc chunk 1", "doc chunk 2"] assert fields["actual_output"] == "answer" + def test_tool_messages_also_set_context_for_hallucination_metric(self): + log_records = [ + _make_log_record( + input_messages=[{"role": "user", "content": "query"}], + output_messages=[ + {"role": "tool", "content": "context chunk"}, + {"role": "assistant", "content": "answer"}, + ], + ) + ] + spans = [_make_span_with_log_records(log_records)] + parsed = ParsedEvaluationEvent( + evaluation_level="TRACE", session_spans=spans + ) + + fields = _extract_fields_from_spans(parsed) + + assert fields["context"] == ["context chunk"] + assert fields["context"] == fields["retrieval_context"] + def test_message_content_as_dict_with_content_key(self): log_records = [ _make_log_record( From 3080e407aad5566583c6fc7609486adc0d99075d Mon Sep 17 00:00:00 2001 From: Haomiao Shi Date: Mon, 15 Jun 2026 12:36:35 -0700 Subject: [PATCH 04/10] Use metric.success for label instead of manual threshold comparison --- .../integrations/deepeval/handler.py | 3 +- .../integrations/deepeval/test_handler.py | 29 +++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py b/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py index b339b883..4893889c 100644 --- a/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py +++ b/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py @@ -78,7 +78,8 @@ def __call__(self, event: Dict[str, Any], context: Any = None) -> Dict[str, Any] score = self.metric.score reason = getattr(self.metric, "reason", None) or "" threshold = getattr(self.metric, "threshold", 0.5) - label = "Pass" if score is not None and score >= threshold else "Fail" + success = getattr(self.metric, "success", score is not None and score >= threshold) + label = "Pass" if success else "Fail" return {"value": score, "label": label, "explanation": reason} diff --git a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py index c3fa98ae..009f5e54 100644 --- a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py +++ b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py @@ -55,6 +55,7 @@ def _mock_metric(score=0.85, reason="Looks good", threshold=0.7, name="MockMetri metric._required_params = None del metric._required_params del metric.evaluation_params + del metric.success def measure_side_effect(test_case): metric.score = score @@ -229,3 +230,31 @@ def test_default_threshold_when_missing(self): result = handler(_make_event()) assert result["label"] == "Pass" + + def test_label_uses_metric_success_true(self): + metric = _mock_metric(score=0.3, threshold=0.7) + metric.success = True + handler = DeepEvalHandler(metric=metric) + + result = handler(_make_event()) + + assert result["value"] == 0.3 + assert result["label"] == "Pass" + + def test_label_uses_metric_success_false(self): + metric = _mock_metric(score=0.9, threshold=0.7) + metric.success = False + handler = DeepEvalHandler(metric=metric) + + result = handler(_make_event()) + + assert result["value"] == 0.9 + assert result["label"] == "Fail" + + def test_label_falls_back_to_threshold_when_no_success(self): + metric = _mock_metric(score=0.8, threshold=0.7) + handler = DeepEvalHandler(metric=metric) + + result = handler(_make_event()) + + assert result["label"] == "Pass" From 34674bbeb50ae6f88de74fd6da0bb2768d1a7489 Mon Sep 17 00:00:00 2001 From: Haomiao Shi Date: Mon, 15 Jun 2026 12:42:07 -0700 Subject: [PATCH 05/10] Add model override and timeout enforcement to DeepEvalHandler --- .../integrations/deepeval/handler.py | 49 ++++++++++++++- .../integrations/deepeval/test_handler.py | 61 +++++++++++++++++++ 2 files changed, 109 insertions(+), 1 deletion(-) diff --git a/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py b/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py index 4893889c..c71ed6da 100644 --- a/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py +++ b/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py @@ -1,6 +1,7 @@ """DeepEval handler that adapts AgentCore Lambda evaluation events to DeepEval metrics.""" import logging +import threading from typing import Any, Callable, Dict, Optional from deepeval.metrics import BaseMetric @@ -30,10 +31,14 @@ def lambda_handler(event, context): return handler(event, context) """ + DEFAULT_TIMEOUT = 290 + def __init__( self, metric: BaseMetric, field_mapper: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None, + model: Optional[str] = None, + timeout: Optional[int] = None, ): """Initialize the handler. @@ -42,9 +47,15 @@ def __init__( field_mapper: Optional callable that receives the raw Lambda event and returns a dict of LLMTestCase field values. Bypasses default span extraction when provided. + model: Optional model identifier to override the metric's LLM + (e.g. a Bedrock model string instead of the default OpenAI model). + timeout: Maximum seconds to allow for metric.measure(). Defaults to 290 + (slightly under Lambda's 300s max). Set to None to disable. """ self.metric = metric self.field_mapper = field_mapper + self.model = model + self.timeout = timeout if timeout is not None else self.DEFAULT_TIMEOUT def __call__(self, event: Dict[str, Any], context: Any = None) -> Dict[str, Any]: """Handle a Lambda invocation. @@ -69,8 +80,16 @@ def __call__(self, event: Dict[str, Any], context: Any = None) -> Dict[str, Any] logger.error("Missing required fields: %s", e) return _error_response("MISSING_REQUIRED_FIELD", str(e)) + if self.model is not None: + self.metric.model = self.model + try: - self.metric.measure(test_case) + self._measure_with_timeout(test_case) + except _MetricTimeout: + return _error_response( + "METRIC_TIMEOUT", + f"{type(self.metric).__name__} exceeded {self.timeout}s timeout.", + ) except Exception as e: logger.error("Metric measurement failed: %s", e, exc_info=True) return _error_response("METRIC_ERROR", f"{type(self.metric).__name__} failed: {e}") @@ -83,6 +102,34 @@ def __call__(self, event: Dict[str, Any], context: Any = None) -> Dict[str, Any] return {"value": score, "label": label, "explanation": reason} + def _measure_with_timeout(self, test_case: Any) -> None: + """Run metric.measure with a thread-based timeout.""" + if self.timeout <= 0: + self.metric.measure(test_case) + return + + exception_holder: list = [] + + def target(): + try: + self.metric.measure(test_case) + except Exception as e: + exception_holder.append(e) + + thread = threading.Thread(target=target, daemon=True) + thread.start() + thread.join(timeout=self.timeout) + + if thread.is_alive(): + raise _MetricTimeout() + + if exception_holder: + raise exception_holder[0] + + +class _MetricTimeout(Exception): + """Raised when metric.measure exceeds the configured timeout.""" + def _error_response(code: str, message: str) -> Dict[str, str]: """Build a standardized error response dict.""" diff --git a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py index 009f5e54..9867969b 100644 --- a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py +++ b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py @@ -1,6 +1,7 @@ """Tests for DeepEvalHandler.""" import json +import time from unittest.mock import MagicMock, patch import pytest @@ -258,3 +259,63 @@ def test_label_falls_back_to_threshold_when_no_success(self): result = handler(_make_event()) assert result["label"] == "Pass" + + def test_model_override_sets_metric_model(self): + metric = _mock_metric() + handler = DeepEvalHandler(metric=metric, model="bedrock/anthropic.claude-3") + + handler(_make_event()) + + assert metric.model == "bedrock/anthropic.claude-3" + + def test_no_model_override_leaves_metric_unchanged(self): + metric = _mock_metric() + metric.model = "original-model" + handler = DeepEvalHandler(metric=metric) + + handler(_make_event()) + + assert metric.model == "original-model" + + +class TestDeepEvalHandlerTimeout: + def test_timeout_returns_error(self): + metric = _mock_metric() + metric.measure = MagicMock(side_effect=lambda tc: time.sleep(5)) + handler = DeepEvalHandler(metric=metric, timeout=1) + + result = handler(_make_event()) + + assert result["errorCode"] == "METRIC_TIMEOUT" + assert "1s timeout" in result["errorMessage"] + + def test_no_timeout_when_measure_completes_in_time(self): + metric = _mock_metric() + handler = DeepEvalHandler(metric=metric, timeout=10) + + result = handler(_make_event()) + + assert result["value"] == 0.85 + assert "errorCode" not in result + + def test_default_timeout_is_290(self): + metric = _mock_metric() + handler = DeepEvalHandler(metric=metric) + + assert handler.timeout == 290 + + def test_custom_timeout_value(self): + metric = _mock_metric() + handler = DeepEvalHandler(metric=metric, timeout=60) + + assert handler.timeout == 60 + + def test_metric_exception_still_propagates_with_timeout(self): + metric = _mock_metric() + metric.measure = MagicMock(side_effect=RuntimeError("LLM error")) + handler = DeepEvalHandler(metric=metric, timeout=10) + + result = handler(_make_event()) + + assert result["errorCode"] == "METRIC_ERROR" + assert "LLM error" in result["errorMessage"] From 6aedcbf6caf1d4f34823b3b35633b7ec921398be Mon Sep 17 00:00:00 2001 From: Haomiao Shi Date: Mon, 15 Jun 2026 12:56:33 -0700 Subject: [PATCH 06/10] Add model override, timeout enforcement, use metric.success, fix SingleTurnParams deprecation --- .../evaluation/integrations/deepeval/handler.py | 7 ++++--- .../integrations/deepeval/input_mapper.py | 14 +++++++------- .../integrations/deepeval/test_input_mapper.py | 6 +++--- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py b/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py index c71ed6da..ed261727 100644 --- a/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py +++ b/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py @@ -37,7 +37,7 @@ def __init__( self, metric: BaseMetric, field_mapper: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None, - model: Optional[str] = None, + model: Optional[Any] = None, timeout: Optional[int] = None, ): """Initialize the handler. @@ -47,8 +47,9 @@ def __init__( field_mapper: Optional callable that receives the raw Lambda event and returns a dict of LLMTestCase field values. Bypasses default span extraction when provided. - model: Optional model identifier to override the metric's LLM - (e.g. a Bedrock model string instead of the default OpenAI model). + model: Optional model override for the metric's LLM. Can be a string + model ID (e.g. "bedrock/anthropic.claude-3") or a DeepEvalBaseLLM + subclass instance. timeout: Maximum seconds to allow for metric.measure(). Defaults to 290 (slightly under Lambda's 300s max). Set to None to disable. """ diff --git a/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py b/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py index 39182636..47e75c0c 100644 --- a/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py +++ b/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py @@ -6,16 +6,16 @@ from typing import Any, Callable, Dict, List, Optional from deepeval.metrics import BaseMetric -from deepeval.test_case import LLMTestCase, LLMTestCaseParams +from deepeval.test_case import LLMTestCase, SingleTurnParams logger = logging.getLogger(__name__) -_PARAM_TO_FIELD: Dict[LLMTestCaseParams, str] = { - LLMTestCaseParams.INPUT: "input", - LLMTestCaseParams.ACTUAL_OUTPUT: "actual_output", - LLMTestCaseParams.EXPECTED_OUTPUT: "expected_output", - LLMTestCaseParams.CONTEXT: "context", - LLMTestCaseParams.RETRIEVAL_CONTEXT: "retrieval_context", +_PARAM_TO_FIELD: Dict[SingleTurnParams, str] = { + SingleTurnParams.INPUT: "input", + SingleTurnParams.ACTUAL_OUTPUT: "actual_output", + SingleTurnParams.EXPECTED_OUTPUT: "expected_output", + SingleTurnParams.CONTEXT: "context", + SingleTurnParams.RETRIEVAL_CONTEXT: "retrieval_context", } _METRIC_REQUIRED_PARAMS: Dict[str, List[str]] = { diff --git a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py index ca661128..6d2a5420 100644 --- a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py +++ b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py @@ -4,7 +4,7 @@ from unittest.mock import MagicMock import pytest -from deepeval.test_case import LLMTestCaseParams +from deepeval.test_case import SingleTurnParams from bedrock_agentcore.evaluation.integrations.deepeval.input_mapper import ( ParsedEvaluationEvent, @@ -156,7 +156,7 @@ def test_from_lambda_event_missing_target_key_defaults(self): class TestGetRequiredParams: def test_uses_required_params_attribute(self): metric = _mock_metric( - required_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT] + required_params=[SingleTurnParams.INPUT, SingleTurnParams.ACTUAL_OUTPUT] ) result = _get_required_params(metric) @@ -171,7 +171,7 @@ def test_falls_back_to_static_registry(self): def test_falls_back_to_evaluation_params(self): metric = _mock_metric( name="UnknownMetric", - evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.RETRIEVAL_CONTEXT], + evaluation_params=[SingleTurnParams.INPUT, SingleTurnParams.RETRIEVAL_CONTEXT], ) result = _get_required_params(metric) From 2260eb364ef1794290d3ccdec8601d9d866ecbcc Mon Sep 17 00:00:00 2001 From: Haomiao Shi Date: Mon, 15 Jun 2026 16:42:01 -0700 Subject: [PATCH 07/10] Fix _get_required_params to handle GEval unmappable typing params --- .deepeval/.deepeval_telemetry.txt | 2 ++ .../evaluation/integrations/deepeval/input_mapper.py | 3 ++- .../integrations/deepeval/test_input_mapper.py | 12 ++++++++++++ 3 files changed, 16 insertions(+), 1 deletion(-) create mode 100644 .deepeval/.deepeval_telemetry.txt diff --git a/.deepeval/.deepeval_telemetry.txt b/.deepeval/.deepeval_telemetry.txt new file mode 100644 index 00000000..916744ae --- /dev/null +++ b/.deepeval/.deepeval_telemetry.txt @@ -0,0 +1,2 @@ +DEEPEVAL_ID=f26d66a4-b0b0-4096-859f-89f1ddf7ceee +DEEPEVAL_STATUS=old diff --git a/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py b/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py index 47e75c0c..941afce2 100644 --- a/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py +++ b/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py @@ -80,7 +80,8 @@ def _get_required_params(metric: BaseMetric) -> List[str]: """ if hasattr(metric, "_required_params") and metric._required_params: params = metric._required_params - return [_PARAM_TO_FIELD.get(p, str(p).lower()) for p in params] + if all(p in _PARAM_TO_FIELD for p in params): + return [_PARAM_TO_FIELD[p] for p in params] class_name = type(metric).__name__ if class_name in _METRIC_REQUIRED_PARAMS: diff --git a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py index 6d2a5420..1d90a689 100644 --- a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py +++ b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py @@ -183,6 +183,18 @@ def test_defaults_to_input_and_actual_output(self): assert result == ["input", "actual_output"] + def test_unmappable_required_params_skips_to_static_registry(self): + metric = _mock_metric(name="GEval", required_params=["SomeTypingObject", "AnotherType"]) + result = _get_required_params(metric) + + assert result == ["input", "actual_output"] + + def test_unmappable_required_params_falls_to_default(self): + metric = _mock_metric(name="UnknownMetric", required_params=["SomeTypingObject"]) + result = _get_required_params(metric) + + assert result == ["input", "actual_output"] + def test_empty_required_params_falls_through(self): metric = _mock_metric(name="UnknownMetric", required_params=[]) result = _get_required_params(metric) From 14f035408d340fc0a8adbe53bf4a569cbf2052aa Mon Sep 17 00:00:00 2001 From: Haomiao Shi Date: Mon, 15 Jun 2026 16:50:12 -0700 Subject: [PATCH 08/10] Add .deepeval/ to gitignore --- .deepeval/.deepeval_telemetry.txt | 2 -- .gitignore | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) delete mode 100644 .deepeval/.deepeval_telemetry.txt diff --git a/.deepeval/.deepeval_telemetry.txt b/.deepeval/.deepeval_telemetry.txt deleted file mode 100644 index 916744ae..00000000 --- a/.deepeval/.deepeval_telemetry.txt +++ /dev/null @@ -1,2 +0,0 @@ -DEEPEVAL_ID=f26d66a4-b0b0-4096-859f-89f1ddf7ceee -DEEPEVAL_STATUS=old diff --git a/.gitignore b/.gitignore index 01fe8e22..161403e7 100644 --- a/.gitignore +++ b/.gitignore @@ -229,3 +229,4 @@ local_settings.py Dockerfile CLAUDE.md .omc/ +.deepeval/ From b109a64fa38bbe4b4f9943b9113577592484897e Mon Sep 17 00:00:00 2001 From: Haomiao Shi Date: Tue, 16 Jun 2026 15:26:05 -0700 Subject: [PATCH 09/10] Move model override to init to avoid per-call mutation --- .../evaluation/integrations/deepeval/handler.py | 6 ++---- .../evaluation/integrations/deepeval/test_handler.py | 2 -- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py b/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py index ed261727..0e91bafe 100644 --- a/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py +++ b/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py @@ -55,8 +55,9 @@ def __init__( """ self.metric = metric self.field_mapper = field_mapper - self.model = model self.timeout = timeout if timeout is not None else self.DEFAULT_TIMEOUT + if model is not None: + self.metric.model = model def __call__(self, event: Dict[str, Any], context: Any = None) -> Dict[str, Any]: """Handle a Lambda invocation. @@ -81,9 +82,6 @@ def __call__(self, event: Dict[str, Any], context: Any = None) -> Dict[str, Any] logger.error("Missing required fields: %s", e) return _error_response("MISSING_REQUIRED_FIELD", str(e)) - if self.model is not None: - self.metric.model = self.model - try: self._measure_with_timeout(test_case) except _MetricTimeout: diff --git a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py index 9867969b..77961f14 100644 --- a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py +++ b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py @@ -264,8 +264,6 @@ def test_model_override_sets_metric_model(self): metric = _mock_metric() handler = DeepEvalHandler(metric=metric, model="bedrock/anthropic.claude-3") - handler(_make_event()) - assert metric.model == "bedrock/anthropic.claude-3" def test_no_model_override_leaves_metric_unchanged(self): From 4e7492644a53d4c5e4a663cd531b38b4a4481d8b Mon Sep 17 00:00:00 2001 From: Haomiao Shi Date: Wed, 24 Jun 2026 16:25:17 -0700 Subject: [PATCH 10/10] Refactor to BaseAdapter framework with DeepEval/Autoevals adapters and EvaluatorInput support --- .../evaluation/integrations/__init__.py | 4 + .../integrations/autoevals/__init__.py | 5 + .../integrations/autoevals/adapter.py | 72 +++++ .../evaluation/integrations/base.py | 302 ++++++++++++++++++ .../integrations/deepeval/__init__.py | 4 +- .../integrations/deepeval/adapter.py | 189 +++++++++++ .../integrations/deepeval/handler.py | 135 -------- .../integrations/deepeval/input_mapper.py | 253 --------------- .../integrations/autoevals/__init__.py | 0 .../integrations/autoevals/test_adapter.py | 217 +++++++++++++ .../integrations/deepeval/test_handler.py | 112 ++++++- .../deepeval/test_input_mapper.py | 8 +- 12 files changed, 906 insertions(+), 395 deletions(-) create mode 100644 src/bedrock_agentcore/evaluation/integrations/autoevals/__init__.py create mode 100644 src/bedrock_agentcore/evaluation/integrations/autoevals/adapter.py create mode 100644 src/bedrock_agentcore/evaluation/integrations/base.py create mode 100644 src/bedrock_agentcore/evaluation/integrations/deepeval/adapter.py delete mode 100644 src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py delete mode 100644 src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py create mode 100644 tests/bedrock_agentcore/evaluation/integrations/autoevals/__init__.py create mode 100644 tests/bedrock_agentcore/evaluation/integrations/autoevals/test_adapter.py diff --git a/src/bedrock_agentcore/evaluation/integrations/__init__.py b/src/bedrock_agentcore/evaluation/integrations/__init__.py index 33048d5d..a1ff7691 100644 --- a/src/bedrock_agentcore/evaluation/integrations/__init__.py +++ b/src/bedrock_agentcore/evaluation/integrations/__init__.py @@ -1 +1,5 @@ """AgentCore Evaluation integrations.""" + +from bedrock_agentcore.evaluation.integrations.base import BaseAdapter, ParsedEvaluationEvent + +__all__ = ["BaseAdapter", "ParsedEvaluationEvent"] diff --git a/src/bedrock_agentcore/evaluation/integrations/autoevals/__init__.py b/src/bedrock_agentcore/evaluation/integrations/autoevals/__init__.py new file mode 100644 index 00000000..0bc3b4ff --- /dev/null +++ b/src/bedrock_agentcore/evaluation/integrations/autoevals/__init__.py @@ -0,0 +1,5 @@ +"""Autoevals integration for AgentCore Evaluation.""" + +from bedrock_agentcore.evaluation.integrations.autoevals.adapter import AutoevalsAdapter + +__all__ = ["AutoevalsAdapter"] diff --git a/src/bedrock_agentcore/evaluation/integrations/autoevals/adapter.py b/src/bedrock_agentcore/evaluation/integrations/autoevals/adapter.py new file mode 100644 index 00000000..fe89435e --- /dev/null +++ b/src/bedrock_agentcore/evaluation/integrations/autoevals/adapter.py @@ -0,0 +1,72 @@ +"""Autoevals adapter for AgentCore evaluation integrations.""" + +import logging +from typing import Any, Callable, Dict, Optional + +from bedrock_agentcore.evaluation.integrations.base import BaseAdapter + +logger = logging.getLogger(__name__) + + +class AutoevalsAdapter(BaseAdapter): + """Adapter that runs an Autoevals scorer against AgentCore evaluation events. + + Example:: + + from autoevals import Factuality + + scorer = Factuality() + handler = AutoevalsAdapter(scorer=scorer) + + # Use as Lambda handler + def lambda_handler(event, context): + return handler(event, context) + """ + + def __init__( + self, + scorer: Any, + field_mapper: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None, + timeout: Optional[int] = None, + ): + """Initialize the adapter. + + Args: + scorer: An Autoevals scorer instance (e.g. Factuality(), ClosedQA()). + field_mapper: Optional callable that receives the raw Lambda event and + returns a dict of field values. Bypasses default span extraction. + timeout: Maximum seconds to allow for scorer.eval(). Defaults to 290. + """ + super().__init__(field_mapper=field_mapper, timeout=timeout) + self.scorer = scorer + + def validate_fields(self, fields: Dict[str, Any]) -> None: + """Validate that input and actual_output are present.""" + missing = [] + if not fields.get("input"): + missing.append("input") + if not fields.get("actual_output"): + missing.append("actual_output") + if missing: + scorer_name = type(self.scorer).__name__ + raise ValueError( + f"Field(s) {missing} required by {scorer_name} but not found in evaluation event. " + f"Provide a field_mapper or ensure spans contain the necessary data." + ) + + def execute(self, fields: Dict[str, Any]) -> Dict[str, Any]: + """Run the Autoevals scorer and return formatted results.""" + kwargs: Dict[str, Any] = { + "input": fields.get("input", ""), + "output": fields.get("actual_output", ""), + } + if fields.get("expected_output"): + kwargs["expected"] = fields["expected_output"] + + result = self.scorer.eval(**kwargs) + + score = result.score + label = "Pass" if score is not None and score >= 0.5 else "Fail" + explanation = getattr(result, "metadata", {}).get("rationale", "") if hasattr(result, "metadata") else "" + + return {"value": score, "label": label, "explanation": explanation} diff --git a/src/bedrock_agentcore/evaluation/integrations/base.py b/src/bedrock_agentcore/evaluation/integrations/base.py new file mode 100644 index 00000000..a10f6606 --- /dev/null +++ b/src/bedrock_agentcore/evaluation/integrations/base.py @@ -0,0 +1,302 @@ +"""Base adapter for AgentCore evaluation integrations.""" + +import abc +import json +import logging +import threading +from dataclasses import dataclass, field +from typing import Any, Callable, Dict, List, Optional, Union + +from bedrock_agentcore.evaluation.custom_code_based_evaluators.models import EvaluatorInput + +logger = logging.getLogger(__name__) + + +@dataclass +class ParsedEvaluationEvent: + """Parsed representation of the AgentCore Lambda evaluation event.""" + + evaluation_level: str + session_spans: List[Dict[str, Any]] + target_trace_id: Optional[str] = None + target_span_id: Optional[str] = None + reference_inputs: List[Dict[str, Any]] = field(default_factory=list) + + @classmethod + def from_lambda_event(cls, event: Dict[str, Any]) -> "ParsedEvaluationEvent": + """Parse a raw Lambda event dict into a structured object. + + Args: + event: Raw Lambda event payload from the evaluation service. + + Returns: + ParsedEvaluationEvent with extracted fields. + + Raises: + KeyError: If required top-level fields are missing. + """ + evaluation_input = event["evaluationInput"] + target = event.get("evaluationTarget") or {} + trace_ids = target.get("traceIds") or [] + span_ids = target.get("spanIds") or [] + + return cls( + evaluation_level=event["evaluationLevel"], + session_spans=evaluation_input["sessionSpans"], + target_trace_id=trace_ids[0] if trace_ids else None, + target_span_id=span_ids[0] if span_ids else None, + reference_inputs=event.get("evaluationReferenceInputs") or [], + ) + + +def _get_message_content(message: Any) -> str: + """Extract text content from a message object. + + Message content can be a dict with a "content" or "message" key, or a plain string. + Handles one level of nesting (e.g. {"content": {"content": "text"}}). + """ + if isinstance(message, str): + return message + if isinstance(message, dict): + for key in ("content", "message"): + if key in message: + val = message[key] + if isinstance(val, str): + return val + if isinstance(val, dict): + return _get_message_content(val) + return str(val) + return "" + + +def extract_fields_from_spans( + parsed: ParsedEvaluationEvent, +) -> Dict[str, Any]: + """Extract evaluation fields from AgentCore session spans. + + Parses _eval_log_records from span attributes, filters by target_trace_id, + and extracts messages by role: + - input ← input messages where role=="user" + - actual_output ← output messages where role=="assistant" + - retrieval_context ← output messages where role=="tool" + - context ← same as retrieval_context + - expected_output ← evaluationReferenceInputs[0].expectedResponse + """ + user_messages: List[str] = [] + assistant_messages: List[str] = [] + tool_messages: List[str] = [] + + for span in parsed.session_spans: + attributes = span.get("attributes", {}) + log_records_raw = attributes.get("_eval_log_records") + if not log_records_raw: + continue + + if isinstance(log_records_raw, str): + try: + log_records = json.loads(log_records_raw) + except (json.JSONDecodeError, TypeError): + logger.debug("Failed to parse _eval_log_records as JSON") + continue + else: + log_records = log_records_raw + + if not isinstance(log_records, list): + continue + + for record in log_records: + if not isinstance(record, dict): + continue + + if parsed.target_trace_id: + record_trace_id = record.get("traceId") or record.get("trace_id") + if record_trace_id and record_trace_id != parsed.target_trace_id: + continue + + body = record.get("body", {}) + if not isinstance(body, dict): + continue + + input_data = body.get("input", {}) + if isinstance(input_data, dict): + for msg in input_data.get("messages", []): + if not isinstance(msg, dict): + continue + role = msg.get("role", "") + content = _get_message_content(msg) + if role == "user" and content: + user_messages.append(content) + + output_data = body.get("output", {}) + if isinstance(output_data, dict): + for msg in output_data.get("messages", []): + if not isinstance(msg, dict): + continue + role = msg.get("role", "") + content = _get_message_content(msg) + if role == "assistant" and content: + assistant_messages.append(content) + elif role == "tool" and content: + tool_messages.append(content) + + fields: Dict[str, Any] = {} + + if user_messages: + fields["input"] = "\n".join(user_messages) + if assistant_messages: + fields["actual_output"] = "\n".join(assistant_messages) + if tool_messages: + fields["retrieval_context"] = tool_messages + fields["context"] = tool_messages + + if parsed.reference_inputs: + expected = parsed.reference_inputs[0].get("expectedResponse") + if expected: + fields["expected_output"] = expected + + return fields + + +class _ExecutionTimeout(Exception): + """Raised when execution exceeds the configured timeout.""" + + +def _error_response(code: str, message: str) -> Dict[str, str]: + """Build a standardized error response dict.""" + return {"errorCode": code, "errorMessage": message} + + +class BaseAdapter(abc.ABC): + """Base adapter for evaluation framework integrations. + + Subclasses only need to implement execute(fields) which runs the actual + evaluation logic and returns (score, label, explanation). + + Never raises unhandled exceptions — always returns a valid response dict. + """ + + DEFAULT_TIMEOUT = 290 + + def __init__( + self, + field_mapper: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None, + timeout: Optional[int] = None, + ): + """Initialize the adapter. + + Args: + field_mapper: Optional callable that receives the raw Lambda event and + returns a dict of field values. Bypasses default span extraction. + timeout: Maximum seconds to allow for execute(). Defaults to 290 + (slightly under Lambda's 300s max). + """ + self.field_mapper = field_mapper + self.timeout = timeout if timeout is not None else self.DEFAULT_TIMEOUT + + def __call__(self, event: Union[Dict[str, Any], EvaluatorInput], context: Any = None) -> Dict[str, Any]: + """Handle a Lambda invocation. + + Args: + event: Either a raw Lambda event dict or an EvaluatorInput instance + from bedrock_agentcore.evaluation.custom_code_based_evaluators.models. + context: Lambda context object (unused). + + Returns: + Success: {"value": float, "label": str, "explanation": str} + Error: {"errorCode": str, "errorMessage": str} + """ + try: + if isinstance(event, EvaluatorInput): + parsed = ParsedEvaluationEvent( + evaluation_level=event.evaluation_level, + session_spans=event.session_spans, + target_trace_id=event.target_trace_id, + target_span_id=event.target_span_id, + reference_inputs=getattr(event, "reference_inputs", []) or [], + ) + else: + parsed = ParsedEvaluationEvent.from_lambda_event(event) + except (KeyError, IndexError, TypeError) as e: + logger.error("Failed to parse evaluation event: %s", e) + return _error_response("INVALID_EVENT", f"Failed to parse evaluation event: {e}") + + try: + fields = self._extract_fields(parsed) + except ValueError as e: + logger.error("Missing required fields: %s", e) + return _error_response("MISSING_REQUIRED_FIELD", str(e)) + + try: + result = self._execute_with_timeout(fields) + except _ExecutionTimeout: + return _error_response( + "METRIC_TIMEOUT", + f"{type(self).__name__} exceeded {self.timeout}s timeout.", + ) + except Exception as e: + logger.error("Execution failed: %s", e, exc_info=True) + return _error_response("METRIC_ERROR", f"{type(self).__name__} failed: {e}") + + return result + + def _extract_fields(self, parsed: ParsedEvaluationEvent) -> Dict[str, Any]: + """Extract fields from event, using field_mapper if provided.""" + if self.field_mapper is not None: + raw_event = { + "evaluationLevel": parsed.evaluation_level, + "evaluationInput": {"sessionSpans": parsed.session_spans}, + "evaluationTarget": { + "traceIds": [parsed.target_trace_id] if parsed.target_trace_id else [], + "spanIds": [parsed.target_span_id] if parsed.target_span_id else [], + }, + "evaluationReferenceInputs": parsed.reference_inputs, + } + return self.field_mapper(raw_event) + + fields = extract_fields_from_spans(parsed) + self.validate_fields(fields) + return fields + + def validate_fields(self, fields: Dict[str, Any]) -> None: + """Validate that required fields are present. + + Override in subclasses to enforce field requirements. + Default implementation does nothing. + """ + + @abc.abstractmethod + def execute(self, fields: Dict[str, Any]) -> Dict[str, Any]: + """Run the evaluation and return the response dict. + + Args: + fields: Extracted field dict with keys like "input", "actual_output", etc. + + Returns: + {"value": float, "label": str, "explanation": str} + """ + + def _execute_with_timeout(self, fields: Dict[str, Any]) -> Dict[str, Any]: + """Run execute() with a thread-based timeout.""" + if self.timeout <= 0: + return self.execute(fields) + + result_holder: list = [] + exception_holder: list = [] + + def target(): + try: + result_holder.append(self.execute(fields)) + except Exception as e: + exception_holder.append(e) + + thread = threading.Thread(target=target, daemon=True) + thread.start() + thread.join(timeout=self.timeout) + + if thread.is_alive(): + raise _ExecutionTimeout() + + if exception_holder: + raise exception_holder[0] + + return result_holder[0] diff --git a/src/bedrock_agentcore/evaluation/integrations/deepeval/__init__.py b/src/bedrock_agentcore/evaluation/integrations/deepeval/__init__.py index 76f6461f..adb6ba44 100644 --- a/src/bedrock_agentcore/evaluation/integrations/deepeval/__init__.py +++ b/src/bedrock_agentcore/evaluation/integrations/deepeval/__init__.py @@ -1,5 +1,5 @@ """DeepEval integration for AgentCore Evaluation.""" -from bedrock_agentcore.evaluation.integrations.deepeval.handler import DeepEvalHandler +from bedrock_agentcore.evaluation.integrations.deepeval.adapter import DeepEvalAdapter, DeepEvalHandler -__all__ = ["DeepEvalHandler"] +__all__ = ["DeepEvalAdapter", "DeepEvalHandler"] diff --git a/src/bedrock_agentcore/evaluation/integrations/deepeval/adapter.py b/src/bedrock_agentcore/evaluation/integrations/deepeval/adapter.py new file mode 100644 index 00000000..e8748782 --- /dev/null +++ b/src/bedrock_agentcore/evaluation/integrations/deepeval/adapter.py @@ -0,0 +1,189 @@ +"""DeepEval adapter for AgentCore evaluation integrations.""" + +import logging +from typing import Any, Callable, Dict, List, Optional + +from deepeval.metrics import BaseMetric +from deepeval.test_case import LLMTestCase, SingleTurnParams + +from bedrock_agentcore.evaluation.integrations.base import ( + BaseAdapter, + ParsedEvaluationEvent, + extract_fields_from_spans, +) + +logger = logging.getLogger(__name__) + +_PARAM_TO_FIELD: Dict[SingleTurnParams, str] = { + SingleTurnParams.INPUT: "input", + SingleTurnParams.ACTUAL_OUTPUT: "actual_output", + SingleTurnParams.EXPECTED_OUTPUT: "expected_output", + SingleTurnParams.CONTEXT: "context", + SingleTurnParams.RETRIEVAL_CONTEXT: "retrieval_context", +} + +_METRIC_REQUIRED_PARAMS: Dict[str, List[str]] = { + "AnswerRelevancyMetric": ["input", "actual_output"], + "FaithfulnessMetric": ["input", "actual_output", "retrieval_context"], + "ContextualRelevancyMetric": ["input", "actual_output", "retrieval_context"], + "ContextualPrecisionMetric": ["input", "actual_output", "expected_output", "retrieval_context"], + "ContextualRecallMetric": ["input", "actual_output", "expected_output", "retrieval_context"], + "HallucinationMetric": ["input", "actual_output", "context"], + "BiasMetric": ["input", "actual_output"], + "ToxicityMetric": ["input", "actual_output"], + "GEval": ["input", "actual_output"], + "SummarizationMetric": ["input", "actual_output"], +} + + +def _get_required_params(metric: BaseMetric) -> List[str]: + """Determine which LLMTestCase fields a metric requires. + + Fallback chain: + 1. metric._required_params (DeepEval internal attribute) + 2. Static registry _METRIC_REQUIRED_PARAMS keyed by class name + 3. metric.evaluation_params (GEval special case) + 4. Default: ["input", "actual_output"] + """ + if hasattr(metric, "_required_params") and metric._required_params: + params = metric._required_params + if all(p in _PARAM_TO_FIELD for p in params): + return [_PARAM_TO_FIELD[p] for p in params] + + class_name = type(metric).__name__ + if class_name in _METRIC_REQUIRED_PARAMS: + return _METRIC_REQUIRED_PARAMS[class_name] + + if hasattr(metric, "evaluation_params") and metric.evaluation_params: + params = metric.evaluation_params + return [_PARAM_TO_FIELD.get(p, str(p).lower()) for p in params] + + return ["input", "actual_output"] + + +class DeepEvalAdapter(BaseAdapter): + """Adapter that runs a DeepEval metric against AgentCore evaluation events. + + Example:: + + from deepeval.metrics import AnswerRelevancyMetric + + metric = AnswerRelevancyMetric(threshold=0.7) + handler = DeepEvalAdapter(metric=metric) + + # Use as Lambda handler + def lambda_handler(event, context): + return handler(event, context) + """ + + def __init__( + self, + metric: BaseMetric, + field_mapper: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None, + model: Optional[Any] = None, + timeout: Optional[int] = None, + ): + """Initialize the adapter. + + Args: + metric: A DeepEval BaseMetric instance (e.g. AnswerRelevancyMetric). + field_mapper: Optional callable that receives the raw Lambda event and + returns a dict of LLMTestCase field values. Bypasses default span + extraction when provided. + model: Optional model override for the metric's LLM. Can be a string + model ID (e.g. "bedrock/anthropic.claude-3") or a DeepEvalBaseLLM + subclass instance. + timeout: Maximum seconds to allow for metric.measure(). Defaults to 290 + (slightly under Lambda's 300s max). + """ + super().__init__(field_mapper=field_mapper, timeout=timeout) + self.metric = metric + if model is not None: + self.metric.model = model + + def validate_fields(self, fields: Dict[str, Any]) -> None: + """Validate that fields required by the metric are present.""" + required = _get_required_params(self.metric) + missing = [f for f in required if f not in fields or not fields[f]] + if missing: + metric_name = type(self.metric).__name__ + raise ValueError( + f"Field(s) {missing} required by {metric_name} but not found in evaluation event. " + f"Provide a field_mapper or ensure spans contain the necessary data." + ) + + def execute(self, fields: Dict[str, Any]) -> Dict[str, Any]: + """Run the DeepEval metric and return formatted results.""" + test_case = LLMTestCase( + input=fields.get("input", ""), + actual_output=fields.get("actual_output", ""), + expected_output=fields.get("expected_output"), + context=fields.get("context"), + retrieval_context=fields.get("retrieval_context"), + ) + + self.metric.measure(test_case) + + score = self.metric.score + reason = getattr(self.metric, "reason", None) or "" + threshold = getattr(self.metric, "threshold", 0.5) + success = getattr(self.metric, "success", score is not None and score >= threshold) + label = "Pass" if success else "Fail" + + return {"value": score, "label": label, "explanation": reason} + + +def build_test_case( + parsed: ParsedEvaluationEvent, + metric: BaseMetric, + field_mapper: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None, +) -> LLMTestCase: + """Build a DeepEval LLMTestCase from a parsed evaluation event. + + Args: + parsed: The parsed Lambda event. + metric: The DeepEval metric (used to determine required fields). + field_mapper: Optional callable that receives the raw Lambda event fields + and returns a dict of LLMTestCase field values. Bypasses default + span extraction when provided. + + Returns: + An LLMTestCase ready for metric.measure(). + + Raises: + ValueError: If required fields for the metric cannot be populated. + """ + if field_mapper is not None: + raw_event = { + "evaluationLevel": parsed.evaluation_level, + "evaluationInput": {"sessionSpans": parsed.session_spans}, + "evaluationTarget": { + "traceIds": [parsed.target_trace_id] if parsed.target_trace_id else [], + "spanIds": [parsed.target_span_id] if parsed.target_span_id else [], + }, + "evaluationReferenceInputs": parsed.reference_inputs, + } + fields = field_mapper(raw_event) + else: + fields = extract_fields_from_spans(parsed) + + required = _get_required_params(metric) + missing = [f for f in required if f not in fields or not fields[f]] + if missing: + metric_name = type(metric).__name__ + raise ValueError( + f"Field(s) {missing} required by {metric_name} but not found in evaluation event. " + f"Provide a field_mapper or ensure spans contain the necessary data." + ) + + return LLMTestCase( + input=fields.get("input", ""), + actual_output=fields.get("actual_output", ""), + expected_output=fields.get("expected_output"), + context=fields.get("context"), + retrieval_context=fields.get("retrieval_context"), + ) + + +# Backward-compatible alias +DeepEvalHandler = DeepEvalAdapter diff --git a/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py b/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py deleted file mode 100644 index 0e91bafe..00000000 --- a/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py +++ /dev/null @@ -1,135 +0,0 @@ -"""DeepEval handler that adapts AgentCore Lambda evaluation events to DeepEval metrics.""" - -import logging -import threading -from typing import Any, Callable, Dict, Optional - -from deepeval.metrics import BaseMetric - -from bedrock_agentcore.evaluation.integrations.deepeval.input_mapper import ( - ParsedEvaluationEvent, - build_test_case, -) - -logger = logging.getLogger(__name__) - - -class DeepEvalHandler: - """Lambda handler that runs a DeepEval metric against AgentCore evaluation events. - - Never raises unhandled exceptions — always returns a valid response dict. - - Example:: - - from deepeval.metrics import AnswerRelevancyMetric - - metric = AnswerRelevancyMetric(threshold=0.7) - handler = DeepEvalHandler(metric=metric) - - # Use as Lambda handler - def lambda_handler(event, context): - return handler(event, context) - """ - - DEFAULT_TIMEOUT = 290 - - def __init__( - self, - metric: BaseMetric, - field_mapper: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None, - model: Optional[Any] = None, - timeout: Optional[int] = None, - ): - """Initialize the handler. - - Args: - metric: A DeepEval BaseMetric instance (e.g. AnswerRelevancyMetric). - field_mapper: Optional callable that receives the raw Lambda event and - returns a dict of LLMTestCase field values. Bypasses default span - extraction when provided. - model: Optional model override for the metric's LLM. Can be a string - model ID (e.g. "bedrock/anthropic.claude-3") or a DeepEvalBaseLLM - subclass instance. - timeout: Maximum seconds to allow for metric.measure(). Defaults to 290 - (slightly under Lambda's 300s max). Set to None to disable. - """ - self.metric = metric - self.field_mapper = field_mapper - self.timeout = timeout if timeout is not None else self.DEFAULT_TIMEOUT - if model is not None: - self.metric.model = model - - def __call__(self, event: Dict[str, Any], context: Any = None) -> Dict[str, Any]: - """Handle a Lambda invocation. - - Args: - event: Raw Lambda event dict from the evaluation service. - context: Lambda context object (unused). - - Returns: - Success: {"value": float, "label": str, "explanation": str} - Error: {"errorCode": str, "errorMessage": str} - """ - try: - parsed = ParsedEvaluationEvent.from_lambda_event(event) - except (KeyError, IndexError, TypeError) as e: - logger.error("Failed to parse evaluation event: %s", e) - return _error_response("INVALID_EVENT", f"Failed to parse evaluation event: {e}") - - try: - test_case = build_test_case(parsed, self.metric, self.field_mapper) - except ValueError as e: - logger.error("Missing required fields: %s", e) - return _error_response("MISSING_REQUIRED_FIELD", str(e)) - - try: - self._measure_with_timeout(test_case) - except _MetricTimeout: - return _error_response( - "METRIC_TIMEOUT", - f"{type(self.metric).__name__} exceeded {self.timeout}s timeout.", - ) - except Exception as e: - logger.error("Metric measurement failed: %s", e, exc_info=True) - return _error_response("METRIC_ERROR", f"{type(self.metric).__name__} failed: {e}") - - score = self.metric.score - reason = getattr(self.metric, "reason", None) or "" - threshold = getattr(self.metric, "threshold", 0.5) - success = getattr(self.metric, "success", score is not None and score >= threshold) - label = "Pass" if success else "Fail" - - return {"value": score, "label": label, "explanation": reason} - - def _measure_with_timeout(self, test_case: Any) -> None: - """Run metric.measure with a thread-based timeout.""" - if self.timeout <= 0: - self.metric.measure(test_case) - return - - exception_holder: list = [] - - def target(): - try: - self.metric.measure(test_case) - except Exception as e: - exception_holder.append(e) - - thread = threading.Thread(target=target, daemon=True) - thread.start() - thread.join(timeout=self.timeout) - - if thread.is_alive(): - raise _MetricTimeout() - - if exception_holder: - raise exception_holder[0] - - -class _MetricTimeout(Exception): - """Raised when metric.measure exceeds the configured timeout.""" - - -def _error_response(code: str, message: str) -> Dict[str, str]: - """Build a standardized error response dict.""" - return {"errorCode": code, "errorMessage": message} diff --git a/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py b/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py deleted file mode 100644 index 941afce2..00000000 --- a/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py +++ /dev/null @@ -1,253 +0,0 @@ -"""Map AgentCore Lambda evaluation events to DeepEval LLMTestCase objects.""" - -import json -import logging -from dataclasses import dataclass, field -from typing import Any, Callable, Dict, List, Optional - -from deepeval.metrics import BaseMetric -from deepeval.test_case import LLMTestCase, SingleTurnParams - -logger = logging.getLogger(__name__) - -_PARAM_TO_FIELD: Dict[SingleTurnParams, str] = { - SingleTurnParams.INPUT: "input", - SingleTurnParams.ACTUAL_OUTPUT: "actual_output", - SingleTurnParams.EXPECTED_OUTPUT: "expected_output", - SingleTurnParams.CONTEXT: "context", - SingleTurnParams.RETRIEVAL_CONTEXT: "retrieval_context", -} - -_METRIC_REQUIRED_PARAMS: Dict[str, List[str]] = { - "AnswerRelevancyMetric": ["input", "actual_output"], - "FaithfulnessMetric": ["input", "actual_output", "retrieval_context"], - "ContextualRelevancyMetric": ["input", "actual_output", "retrieval_context"], - "ContextualPrecisionMetric": ["input", "actual_output", "expected_output", "retrieval_context"], - "ContextualRecallMetric": ["input", "actual_output", "expected_output", "retrieval_context"], - "HallucinationMetric": ["input", "actual_output", "context"], - "BiasMetric": ["input", "actual_output"], - "ToxicityMetric": ["input", "actual_output"], - "GEval": ["input", "actual_output"], - "SummarizationMetric": ["input", "actual_output"], -} - - -@dataclass -class ParsedEvaluationEvent: - """Parsed representation of the AgentCore Lambda evaluation event.""" - - evaluation_level: str - session_spans: List[Dict[str, Any]] - target_trace_id: Optional[str] = None - target_span_id: Optional[str] = None - reference_inputs: List[Dict[str, Any]] = field(default_factory=list) - - @classmethod - def from_lambda_event(cls, event: Dict[str, Any]) -> "ParsedEvaluationEvent": - """Parse a raw Lambda event dict into a structured object. - - Args: - event: Raw Lambda event payload from the evaluation service. - - Returns: - ParsedEvaluationEvent with extracted fields. - - Raises: - KeyError: If required top-level fields are missing. - """ - evaluation_input = event["evaluationInput"] - target = event.get("evaluationTarget") or {} - trace_ids = target.get("traceIds") or [] - span_ids = target.get("spanIds") or [] - - return cls( - evaluation_level=event["evaluationLevel"], - session_spans=evaluation_input["sessionSpans"], - target_trace_id=trace_ids[0] if trace_ids else None, - target_span_id=span_ids[0] if span_ids else None, - reference_inputs=event.get("evaluationReferenceInputs") or [], - ) - - -def _get_required_params(metric: BaseMetric) -> List[str]: - """Determine which LLMTestCase fields a metric requires. - - Fallback chain: - 1. metric._required_params (DeepEval internal attribute) - 2. Static registry _METRIC_REQUIRED_PARAMS keyed by class name - 3. metric.evaluation_params (GEval special case) - 4. Default: ["input", "actual_output"] - """ - if hasattr(metric, "_required_params") and metric._required_params: - params = metric._required_params - if all(p in _PARAM_TO_FIELD for p in params): - return [_PARAM_TO_FIELD[p] for p in params] - - class_name = type(metric).__name__ - if class_name in _METRIC_REQUIRED_PARAMS: - return _METRIC_REQUIRED_PARAMS[class_name] - - if hasattr(metric, "evaluation_params") and metric.evaluation_params: - params = metric.evaluation_params - return [_PARAM_TO_FIELD.get(p, str(p).lower()) for p in params] - - return ["input", "actual_output"] - - -def _get_message_content(message: Any) -> str: - """Extract text content from a message object. - - Message content can be a dict with a "content" or "message" key, or a plain string. - Handles one level of nesting (e.g. {"content": {"content": "text"}}). - """ - if isinstance(message, str): - return message - if isinstance(message, dict): - for key in ("content", "message"): - if key in message: - val = message[key] - if isinstance(val, str): - return val - if isinstance(val, dict): - return _get_message_content(val) - return str(val) - return "" - - -def _extract_fields_from_spans( - parsed: ParsedEvaluationEvent, -) -> Dict[str, Any]: - """Extract LLMTestCase fields from AgentCore session spans. - - Parses _eval_log_records from span attributes, filters by target_trace_id, - and extracts messages by role: - - input ← input messages where role=="user" - - actual_output ← output messages where role=="assistant" - - retrieval_context ← output messages where role=="tool" - - expected_output ← evaluationReferenceInputs[0].expectedResponse - """ - user_messages: List[str] = [] - assistant_messages: List[str] = [] - tool_messages: List[str] = [] - - for span in parsed.session_spans: - attributes = span.get("attributes", {}) - log_records_raw = attributes.get("_eval_log_records") - if not log_records_raw: - continue - - if isinstance(log_records_raw, str): - try: - log_records = json.loads(log_records_raw) - except (json.JSONDecodeError, TypeError): - logger.debug("Failed to parse _eval_log_records as JSON") - continue - else: - log_records = log_records_raw - - if not isinstance(log_records, list): - continue - - for record in log_records: - if not isinstance(record, dict): - continue - - if parsed.target_trace_id: - record_trace_id = record.get("traceId") or record.get("trace_id") - if record_trace_id and record_trace_id != parsed.target_trace_id: - continue - - body = record.get("body", {}) - if not isinstance(body, dict): - continue - - input_data = body.get("input", {}) - if isinstance(input_data, dict): - for msg in input_data.get("messages", []): - if not isinstance(msg, dict): - continue - role = msg.get("role", "") - content = _get_message_content(msg) - if role == "user" and content: - user_messages.append(content) - - output_data = body.get("output", {}) - if isinstance(output_data, dict): - for msg in output_data.get("messages", []): - if not isinstance(msg, dict): - continue - role = msg.get("role", "") - content = _get_message_content(msg) - if role == "assistant" and content: - assistant_messages.append(content) - elif role == "tool" and content: - tool_messages.append(content) - - fields: Dict[str, Any] = {} - - if user_messages: - fields["input"] = "\n".join(user_messages) - if assistant_messages: - fields["actual_output"] = "\n".join(assistant_messages) - if tool_messages: - fields["retrieval_context"] = tool_messages - fields["context"] = tool_messages - - if parsed.reference_inputs: - expected = parsed.reference_inputs[0].get("expectedResponse") - if expected: - fields["expected_output"] = expected - - return fields - - -def build_test_case( - parsed: ParsedEvaluationEvent, - metric: BaseMetric, - field_mapper: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None, -) -> LLMTestCase: - """Build a DeepEval LLMTestCase from a parsed evaluation event. - - Args: - parsed: The parsed Lambda event. - metric: The DeepEval metric (used to determine required fields). - field_mapper: Optional callable that receives the raw Lambda event fields - and returns a dict of LLMTestCase field values. Bypasses default - span extraction when provided. - - Returns: - An LLMTestCase ready for metric.measure(). - - Raises: - ValueError: If required fields for the metric cannot be populated. - """ - if field_mapper is not None: - raw_event = { - "evaluationLevel": parsed.evaluation_level, - "evaluationInput": {"sessionSpans": parsed.session_spans}, - "evaluationTarget": { - "traceIds": [parsed.target_trace_id] if parsed.target_trace_id else [], - "spanIds": [parsed.target_span_id] if parsed.target_span_id else [], - }, - "evaluationReferenceInputs": parsed.reference_inputs, - } - fields = field_mapper(raw_event) - else: - fields = _extract_fields_from_spans(parsed) - - required = _get_required_params(metric) - missing = [f for f in required if f not in fields or not fields[f]] - if missing: - metric_name = type(metric).__name__ - raise ValueError( - f"Field(s) {missing} required by {metric_name} but not found in evaluation event. " - f"Provide a field_mapper or ensure spans contain the necessary data." - ) - - return LLMTestCase( - input=fields.get("input", ""), - actual_output=fields.get("actual_output", ""), - expected_output=fields.get("expected_output"), - context=fields.get("context"), - retrieval_context=fields.get("retrieval_context"), - ) diff --git a/tests/bedrock_agentcore/evaluation/integrations/autoevals/__init__.py b/tests/bedrock_agentcore/evaluation/integrations/autoevals/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/bedrock_agentcore/evaluation/integrations/autoevals/test_adapter.py b/tests/bedrock_agentcore/evaluation/integrations/autoevals/test_adapter.py new file mode 100644 index 00000000..17f674bd --- /dev/null +++ b/tests/bedrock_agentcore/evaluation/integrations/autoevals/test_adapter.py @@ -0,0 +1,217 @@ +"""Tests for AutoevalsAdapter.""" + +import json +import time +from unittest.mock import MagicMock + +import pytest + +from bedrock_agentcore.evaluation.integrations.autoevals.adapter import AutoevalsAdapter + + +def _make_event( + level="TRACE", + trace_ids=None, + spans=None, + reference_inputs=None, +): + """Build a raw Lambda event dict for testing.""" + if spans is None: + log_records = [ + { + "body": { + "input": {"messages": [{"role": "user", "content": "What is AI?"}]}, + "output": {"messages": [{"role": "assistant", "content": "AI is artificial intelligence."}]}, + } + } + ] + spans = [ + { + "traceId": "abc123", + "spanId": "span1", + "attributes": {"_eval_log_records": json.dumps(log_records)}, + } + ] + + event = { + "schemaVersion": "1.0", + "evaluationLevel": level, + "evaluationInput": {"sessionSpans": spans}, + "evaluationTarget": {}, + } + if trace_ids is not None: + event["evaluationTarget"]["traceIds"] = trace_ids + if reference_inputs is not None: + event["evaluationReferenceInputs"] = reference_inputs + return event + + +def _mock_scorer(score=0.9, rationale="Good answer"): + """Create a mock Autoevals scorer.""" + scorer = MagicMock() + type(scorer).__name__ = "MockScorer" + + result = MagicMock() + result.score = score + result.metadata = {"rationale": rationale} + + scorer.eval = MagicMock(return_value=result) + return scorer + + +class TestAutoevalsAdapterSuccess: + def test_returns_pass_when_score_above_half(self): + scorer = _mock_scorer(score=0.8) + adapter = AutoevalsAdapter(scorer=scorer) + + result = adapter(_make_event()) + + assert result["value"] == 0.8 + assert result["label"] == "Pass" + assert result["explanation"] == "Good answer" + + def test_returns_fail_when_score_below_half(self): + scorer = _mock_scorer(score=0.3) + adapter = AutoevalsAdapter(scorer=scorer) + + result = adapter(_make_event()) + + assert result["value"] == 0.3 + assert result["label"] == "Fail" + + def test_scorer_eval_called_with_input_and_output(self): + scorer = _mock_scorer() + adapter = AutoevalsAdapter(scorer=scorer) + + adapter(_make_event()) + + scorer.eval.assert_called_once() + call_kwargs = scorer.eval.call_args[1] + assert call_kwargs["input"] == "What is AI?" + assert call_kwargs["output"] == "AI is artificial intelligence." + + def test_expected_output_passed_as_expected(self): + scorer = _mock_scorer() + adapter = AutoevalsAdapter(scorer=scorer) + + refs = [{"expectedResponse": "AI stands for artificial intelligence."}] + result = adapter(_make_event(reference_inputs=refs)) + + call_kwargs = scorer.eval.call_args[1] + assert call_kwargs["expected"] == "AI stands for artificial intelligence." + + def test_no_expected_output_omits_expected_kwarg(self): + scorer = _mock_scorer() + adapter = AutoevalsAdapter(scorer=scorer) + + adapter(_make_event()) + + call_kwargs = scorer.eval.call_args[1] + assert "expected" not in call_kwargs + + def test_custom_field_mapper(self): + scorer = _mock_scorer() + adapter = AutoevalsAdapter( + scorer=scorer, + field_mapper=lambda event: { + "input": "custom input", + "actual_output": "custom output", + }, + ) + + result = adapter(_make_event()) + + call_kwargs = scorer.eval.call_args[1] + assert call_kwargs["input"] == "custom input" + assert call_kwargs["output"] == "custom output" + + +class TestAutoevalsAdapterErrors: + def test_invalid_event_returns_error(self): + scorer = _mock_scorer() + adapter = AutoevalsAdapter(scorer=scorer) + + result = adapter({}) + + assert result["errorCode"] == "INVALID_EVENT" + + def test_missing_input_returns_error(self): + log_records = [ + { + "body": { + "output": {"messages": [{"role": "assistant", "content": "answer"}]}, + } + } + ] + spans = [ + { + "traceId": "t1", + "spanId": "s1", + "attributes": {"_eval_log_records": json.dumps(log_records)}, + } + ] + scorer = _mock_scorer() + adapter = AutoevalsAdapter(scorer=scorer) + + result = adapter(_make_event(spans=spans)) + + assert result["errorCode"] == "MISSING_REQUIRED_FIELD" + assert "input" in result["errorMessage"] + + def test_scorer_exception_returns_error(self): + scorer = _mock_scorer() + scorer.eval = MagicMock(side_effect=RuntimeError("API error")) + adapter = AutoevalsAdapter(scorer=scorer) + + result = adapter(_make_event()) + + assert result["errorCode"] == "METRIC_ERROR" + assert "API error" in result["errorMessage"] + + def test_never_raises_on_bad_input(self): + scorer = _mock_scorer() + adapter = AutoevalsAdapter(scorer=scorer) + + for bad_input in [None, [], "string", 42]: + result = adapter(bad_input) + assert "errorCode" in result + + +class TestAutoevalsAdapterTimeout: + def test_timeout_returns_error(self): + scorer = _mock_scorer() + scorer.eval = MagicMock(side_effect=lambda **kw: time.sleep(5)) + adapter = AutoevalsAdapter(scorer=scorer, timeout=1) + + result = adapter(_make_event()) + + assert result["errorCode"] == "METRIC_TIMEOUT" + + def test_default_timeout_is_290(self): + scorer = _mock_scorer() + adapter = AutoevalsAdapter(scorer=scorer) + + assert adapter.timeout == 290 + + +class TestAutoevalsAdapterEdgeCases: + def test_score_none_returns_fail(self): + scorer = _mock_scorer(score=None) + adapter = AutoevalsAdapter(scorer=scorer) + + result = adapter(_make_event()) + + assert result["label"] == "Fail" + + def test_no_metadata_returns_empty_explanation(self): + scorer = MagicMock() + type(scorer).__name__ = "MockScorer" + result_obj = MagicMock(spec=[]) + result_obj.score = 0.9 + scorer.eval = MagicMock(return_value=result_obj) + + adapter = AutoevalsAdapter(scorer=scorer) + + result = adapter(_make_event()) + + assert result["explanation"] == "" diff --git a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py index 77961f14..67bfda3d 100644 --- a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py +++ b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py @@ -1,4 +1,4 @@ -"""Tests for DeepEvalHandler.""" +"""Tests for DeepEvalHandler and DeepEvalAdapter.""" import json import time @@ -6,7 +6,9 @@ import pytest -from bedrock_agentcore.evaluation.integrations.deepeval.handler import DeepEvalHandler +from bedrock_agentcore.evaluation.integrations.deepeval.adapter import DeepEvalAdapter, DeepEvalHandler +from bedrock_agentcore.evaluation.integrations.base import BaseAdapter +from bedrock_agentcore.evaluation.custom_code_based_evaluators.models import EvaluatorInput def _make_event( @@ -317,3 +319,109 @@ def test_metric_exception_still_propagates_with_timeout(self): assert result["errorCode"] == "METRIC_ERROR" assert "LLM error" in result["errorMessage"] + + +class TestBackwardCompatibility: + def test_handler_is_alias_for_adapter(self): + assert DeepEvalHandler is DeepEvalAdapter + + def test_adapter_is_subclass_of_base(self): + assert issubclass(DeepEvalAdapter, BaseAdapter) + + def test_import_from_init(self): + from bedrock_agentcore.evaluation.integrations.deepeval import DeepEvalHandler as H + from bedrock_agentcore.evaluation.integrations.deepeval import DeepEvalAdapter as A + + assert H is A + + def test_handler_works_same_as_before(self): + metric = _mock_metric(score=0.9, threshold=0.7) + handler = DeepEvalHandler(metric=metric) + + result = handler(_make_event()) + + assert result["value"] == 0.9 + assert result["label"] == "Pass" + + +class TestEvaluatorInputAcceptance: + def _make_evaluator_input(self): + log_records = [ + { + "body": { + "input": {"messages": [{"role": "user", "content": "Hello"}]}, + "output": {"messages": [{"role": "assistant", "content": "Hi there"}]}, + } + } + ] + spans = [ + { + "traceId": "t1", + "spanId": "s1", + "attributes": {"_eval_log_records": json.dumps(log_records)}, + } + ] + return EvaluatorInput( + evaluation_level="TRACE", + session_spans=spans, + target_trace_id="t1", + target_span_id=None, + ) + + def test_accepts_evaluator_input(self): + metric = _mock_metric(score=0.95) + handler = DeepEvalHandler(metric=metric) + + result = handler(self._make_evaluator_input()) + + assert result["value"] == 0.95 + assert result["label"] == "Pass" + + def test_evaluator_input_extracts_fields_correctly(self): + metric = _mock_metric() + handler = DeepEvalHandler(metric=metric) + + handler(self._make_evaluator_input()) + + test_case = metric.measure.call_args[0][0] + assert test_case.input == "Hello" + assert test_case.actual_output == "Hi there" + + def test_evaluator_input_with_trace_id_filtering(self): + log_records = [ + { + "traceId": "target", + "body": { + "input": {"messages": [{"role": "user", "content": "relevant"}]}, + "output": {"messages": [{"role": "assistant", "content": "yes"}]}, + }, + }, + { + "traceId": "other", + "body": { + "input": {"messages": [{"role": "user", "content": "irrelevant"}]}, + "output": {"messages": [{"role": "assistant", "content": "no"}]}, + }, + }, + ] + spans = [ + { + "traceId": "t1", + "spanId": "s1", + "attributes": {"_eval_log_records": json.dumps(log_records)}, + } + ] + evaluator_input = EvaluatorInput( + evaluation_level="TRACE", + session_spans=spans, + target_trace_id="target", + ) + + metric = _mock_metric() + handler = DeepEvalHandler(metric=metric) + + handler(evaluator_input) + + test_case = metric.measure.call_args[0][0] + assert test_case.input == "relevant" + assert test_case.actual_output == "yes" diff --git a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py index 1d90a689..2d6fbaea 100644 --- a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py +++ b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py @@ -1,4 +1,4 @@ -"""Tests for deepeval input_mapper module.""" +"""Tests for deepeval input mapping and test case building.""" import json from unittest.mock import MagicMock @@ -6,9 +6,11 @@ import pytest from deepeval.test_case import SingleTurnParams -from bedrock_agentcore.evaluation.integrations.deepeval.input_mapper import ( +from bedrock_agentcore.evaluation.integrations.base import ( ParsedEvaluationEvent, - _extract_fields_from_spans, + extract_fields_from_spans as _extract_fields_from_spans, +) +from bedrock_agentcore.evaluation.integrations.deepeval.adapter import ( _get_required_params, build_test_case, )