From 8985189e80ee09edf543f3a1ff39e43327963d0b Mon Sep 17 00:00:00 2001 From: "fanny.riols" Date: Tue, 14 Apr 2026 12:28:38 -0400 Subject: [PATCH 1/9] Add response_speed_with_tool_calls and response_speed_no_tool_calls metrics Splits the existing response_speed diagnostic metric into two filtered variants based on whether the assistant made a tool call in the turn. Parses conversation_trace to map each latency to its turn and checks for tool_call entries on that turn_id. Shared logic (sanity filtering, mean/max, MetricScore construction) is extracted into a _ResponseSpeedBase class; each variant only implements _get_latencies(). Bumps metrics_version to 0.1.2. --- pyproject.toml | 2 +- src/eva/__init__.py | 2 +- src/eva/metrics/diagnostic/response_speed.py | 134 +++++++++++++++---- 3 files changed, 111 insertions(+), 27 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 561cba2d..47827e98 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -101,7 +101,7 @@ ignore = ["D203", "D206", "D213", "D400", "D401", "D413", "D415", "E1", "E501"] simulation_version = "0.1.0" # Bump when metrics pipeline changes (metrics code, judge prompts, pricing, # postprocessor). Old metric results become stale — cheap to recompute. -metrics_version = "0.1.1" +metrics_version = "0.1.2" [tool.mypy] python_version = "3.11" diff --git a/src/eva/__init__.py b/src/eva/__init__.py index 6796f4aa..03f1f13b 100644 --- a/src/eva/__init__.py +++ b/src/eva/__init__.py @@ -11,4 +11,4 @@ # Bump metrics_version when changes affect metric computation (metrics code, # judge prompts, pricing tables, postprocessor). -metrics_version = "0.1.1" +metrics_version = "0.1.2" diff --git a/src/eva/metrics/diagnostic/response_speed.py b/src/eva/metrics/diagnostic/response_speed.py index 0dd4fb53..68160a60 100644 --- a/src/eva/metrics/diagnostic/response_speed.py +++ b/src/eva/metrics/diagnostic/response_speed.py @@ -4,33 +4,60 @@ final evaluation scores. 
""" +from abc import abstractmethod + from eva.metrics.base import CodeMetric, MetricContext from eva.metrics.registry import register_metric from eva.models.results import MetricScore -@register_metric -class ResponseSpeedMetric(CodeMetric): - """Response speed metric. +def _split_latencies_by_tool_calls( + context: MetricContext, +) -> tuple[list[float], list[float]]: + """Partition response_speed_latencies into (with_tool_calls, no_tool_calls). - Measures the elapsed time between the end of the user's utterance - and the beginning of the assistant's response. + The i-th latency corresponds to the i-th user turn in chronological order. + We look at the conversation_trace to find which turn_ids contain at least + one tool_call entry. - Reports raw latency values in seconds — no normalization applied. + Returns: + (with_tool_latencies, no_tool_latencies) + """ + trace = context.conversation_trace or [] - This is a diagnostic metric used for diagnosing model performance issues. - It is not directly used in final evaluation scores. + user_turn_ids = sorted({entry["turn_id"] for entry in trace if entry.get("type") == "transcribed"}) + tool_call_turn_ids = {entry["turn_id"] for entry in trace if entry.get("type") == "tool_call"} + + with_tool: list[float] = [] + no_tool: list[float] = [] + + for i, latency in enumerate(context.response_speed_latencies): + if i >= len(user_turn_ids): + break + if user_turn_ids[i] in tool_call_turn_ids: + with_tool.append(latency) + else: + no_tool.append(latency) + + return with_tool, no_tool + + +class _ResponseSpeedBase(CodeMetric): + """Base class for response-speed metrics. + + Subclasses implement `_get_latencies` to return the subset of latencies + to compute over; everything else is shared. 
""" - name = "response_speed" - description = "Debug metric: latency between user utterance end and assistant response start" category = "diagnostic" exclude_from_pass_at_k = True + @abstractmethod + def _get_latencies(self, context: MetricContext) -> tuple[list[float], str]: + """Return (latencies, error_if_empty) for this metric variant.""" + async def compute(self, context: MetricContext) -> MetricScore: - """Compute response speed from Pipecat's UserBotLatencyObserver measurements.""" try: - # Check if we have response speed latencies from UserBotLatencyObserver if not context.response_speed_latencies: return MetricScore( name=self.name, @@ -39,19 +66,25 @@ async def compute(self, context: MetricContext) -> MetricScore: error="No response latencies available (UserBotLatencyObserver data missing)", ) - # Use latencies measured by Pipecat's UserBotLatencyObserver - # These measure the time from user stopped speaking to assistant started speaking + latencies, empty_error = self._get_latencies(context) + + if not latencies: + return MetricScore( + name=self.name, + score=0.0, + normalized_score=None, + error=empty_error, + ) + speeds = [] per_turn_speeds = [] - - for response_speed in context.response_speed_latencies: - # Filter out invalid values (negative or extremely large) - if 0 < response_speed < 1000: # Sanity check: under 1000 seconds - speeds.append(response_speed) - per_turn_speeds.append(round(response_speed, 3)) + for latency in latencies: + if 0 < latency < 1000: + speeds.append(latency) + per_turn_speeds.append(round(latency, 3)) else: self.logger.warning( - f"[{context.record_id}] Unusual response speed detected and dropped: {response_speed} seconds" + f"[{context.record_id}] Unusual response speed detected and dropped: {latency} seconds" ) if not speeds: @@ -63,15 +96,14 @@ async def compute(self, context: MetricContext) -> MetricScore: ) mean_speed = sum(speeds) / len(speeds) - max_speed = max(speeds) return MetricScore( name=self.name, - 
score=round(mean_speed, 3), # Mean response speed in seconds - normalized_score=None, # Raw latency in seconds; not normalizable to [0,1] + score=round(mean_speed, 3), + normalized_score=None, details={ "mean_speed_seconds": round(mean_speed, 3), - "max_speed_seconds": round(max_speed, 3), + "max_speed_seconds": round(max(speeds), 3), "num_turns": len(speeds), "per_turn_speeds": per_turn_speeds, }, @@ -79,3 +111,55 @@ async def compute(self, context: MetricContext) -> MetricScore: except Exception as e: return self._handle_error(e, context) + + +@register_metric +class ResponseSpeedMetric(_ResponseSpeedBase): + """Response speed metric. + + Measures the elapsed time between the end of the user's utterance + and the beginning of the assistant's response. + + Reports raw latency values in seconds — no normalization applied. + + This is a diagnostic metric used for diagnosing model performance issues. + It is not directly used in final evaluation scores. + """ + + name = "response_speed" + description = "Debug metric: latency between user utterance end and assistant response start" + + def _get_latencies(self, context: MetricContext) -> tuple[list[float], str]: + return context.response_speed_latencies, "No valid response speeds computed" + + +@register_metric +class ResponseSpeedWithToolCallsMetric(_ResponseSpeedBase): + """Response speed restricted to turns where the assistant made at least one tool call. + + Computed the same way as response_speed but only over tool-call turns. + This is a diagnostic metric not used in final evaluation scores. 
+ """ + + name = "response_speed_with_tool_calls" + description = "Debug metric: response latency for turns that included a tool call" + + def _get_latencies(self, context: MetricContext) -> tuple[list[float], str]: + with_tool, _ = _split_latencies_by_tool_calls(context) + return with_tool, "No turns with tool calls found" + + +@register_metric +class ResponseSpeedNoToolCallsMetric(_ResponseSpeedBase): + """Response speed restricted to turns where the assistant made no tool calls. + + Computed the same way as response_speed but only over non-tool-call turns. + This is a diagnostic metric not used in final evaluation scores. + """ + + name = "response_speed_no_tool_calls" + description = "Debug metric: response latency for turns that did not include a tool call" + + def _get_latencies(self, context: MetricContext) -> tuple[list[float], str]: + _, no_tool = _split_latencies_by_tool_calls(context) + return no_tool, "No turns without tool calls found" From 7886625710b48724ce9cdeb8e696d1efcab5139b Mon Sep 17 00:00:00 2001 From: "fanny.riols" Date: Tue, 14 Apr 2026 12:28:52 -0400 Subject: [PATCH 2/9] Warn instead of raise when unused deployments are missing from EVA_MODEL_LIST When restoring redacted secrets in apply_env_overrides, skip deployments that are not present in the current environment's EVA_MODEL_LIST rather than raising a ValueError. Only raise if the missing deployment is the active LLM for this run. This allows metrics-only reruns in environments that don't have every deployment from the original run configured. 
--- src/eva/models/config.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/eva/models/config.py b/src/eva/models/config.py index e08783bd..f3885c54 100644 --- a/src/eva/models/config.py +++ b/src/eva/models/config.py @@ -648,10 +648,17 @@ def apply_env_overrides(self, live: "RunConfig") -> None: if not has_redacted: continue if name not in live_by_name: - raise ValueError( - f"Cannot restore secrets: deployment {name!r} not found in " - f"current EVA_MODEL_LIST (available: {list(live_by_name)})" + active_llm = getattr(self.model, "llm", None) + if name == active_llm: + raise ValueError( + f"Cannot restore secrets: deployment {name!r} not found in " + f"current EVA_MODEL_LIST (available: {list(live_by_name)})" + ) + logger.warning( + f"Deployment {name!r} has redacted secrets but is not in the current " + f"EVA_MODEL_LIST — skipping (not used in this run)." ) + continue live_params = live_by_name[name].get("litellm_params", {}) for key, value in saved_params.items(): if value == "***" and key in live_params: From b2bc3654387fadc0cb5c37f32c4d954bcbc1fa7b Mon Sep 17 00:00:00 2001 From: "fanny.riols" Date: Tue, 14 Apr 2026 12:29:13 -0400 Subject: [PATCH 3/9] Fall back to output_dir when stored log paths no longer exist Adds _resolve_path() helper that returns the stored path if it exists on disk, otherwise falls back to the expected file under output_dir (output_dir / filename). Used in _build_history for pipecat_logs.jsonl and elevenlabs_events.jsonl so that metric reruns work correctly when a run directory has been moved from its original location. 
--- src/eva/metrics/processor.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/src/eva/metrics/processor.py b/src/eva/metrics/processor.py index 660e7ce9..94aa6e44 100644 --- a/src/eva/metrics/processor.py +++ b/src/eva/metrics/processor.py @@ -24,6 +24,19 @@ logger = get_logger(__name__) + +def _resolve_path(stored: str | None, fallback: Path) -> str | Path: + """Return *stored* if it exists on disk, otherwise *fallback*. + + Allows metrics to re-run correctly when a run directory has been moved: + the stored path reflects the original location, but the file is now at + *fallback* (i.e. output_dir / filename). + """ + if stored and Path(stored).exists(): + return stored + return fallback + + # Elevenlabs audio user field → _ProcessorContext attribute name AUDIO_ATTR = { "pipecat_agent": "audio_timestamps_assistant_turns", @@ -824,8 +837,10 @@ def _build_history( Each entry: {timestamp_ms, source, event_type, data}. """ history = self._load_audit_log_transcript(output_dir) - history.extend(self._load_pipecat_logs(result.pipecat_logs_path)) - history.extend(self._load_elevenlabs_logs(result.elevenlabs_logs_path)) + pipecat_path = _resolve_path(result.pipecat_logs_path, output_dir / "pipecat_logs.jsonl") + history.extend(self._load_pipecat_logs(pipecat_path)) + elevenlabs_path = _resolve_path(result.elevenlabs_logs_path, output_dir / "elevenlabs_events.jsonl") + history.extend(self._load_elevenlabs_logs(elevenlabs_path)) history.sort(key=lambda e: e["timestamp_ms"]) context.history = history From e99a8a444113d4a9b22ffcf1ff33210567228d7f Mon Sep 17 00:00:00 2001 From: "fanny.riols" Date: Tue, 14 Apr 2026 12:33:10 -0400 Subject: [PATCH 4/9] Show response_speed_with_tool_calls and response_speed_no_tool_calls in analysis app Adds both new metrics to _NON_NORMALIZED_METRICS so they are rendered as standalone seconds bar charts alongside response_speed. 
Category grouping, color, and table sorting are handled dynamically via the metric registry. --- apps/analysis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/analysis.py b/apps/analysis.py index 4e651752..9bb15520 100644 --- a/apps/analysis.py +++ b/apps/analysis.py @@ -75,7 +75,7 @@ def _build_metric_group_map() -> dict[str, str]: "Other": "#AAAAAA", } -_NON_NORMALIZED_METRICS = {"response_speed"} +_NON_NORMALIZED_METRICS = {"response_speed", "response_speed_with_tool_calls", "response_speed_no_tool_calls"} # EVA composite scores to show in the bar chart _EVA_BAR_COMPOSITES = ["EVA-A_pass", "EVA-X_pass", "EVA-A_mean", "EVA-X_mean"] From b26b79aa00bbfe5694c563f5b22cd0d95448d796 Mon Sep 17 00:00:00 2001 From: "fanny.riols" Date: Tue, 14 Apr 2026 14:32:22 -0400 Subject: [PATCH 5/9] Use turn_taking per_turn_latency as the data source for filtered response speed metrics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The filtered variants now read metrics/turn_taking/details/per_turn_latency from the record's metrics.json instead of using context.response_speed_latencies. This gives a direct turn_id → latency mapping, avoiding the index-based alignment that was previously needed to correlate latencies with tool calls. The base response_speed metric is unchanged (still uses UserBotLatencyObserver). --- src/eva/metrics/diagnostic/response_speed.py | 70 ++++++++++++-------- 1 file changed, 41 insertions(+), 29 deletions(-) diff --git a/src/eva/metrics/diagnostic/response_speed.py b/src/eva/metrics/diagnostic/response_speed.py index 68160a60..224ebc49 100644 --- a/src/eva/metrics/diagnostic/response_speed.py +++ b/src/eva/metrics/diagnostic/response_speed.py @@ -4,37 +4,51 @@ final evaluation scores. 
""" +import json from abc import abstractmethod +from pathlib import Path from eva.metrics.base import CodeMetric, MetricContext from eva.metrics.registry import register_metric from eva.models.results import MetricScore -def _split_latencies_by_tool_calls( +def _split_turn_taking_latencies_by_tool_calls( context: MetricContext, ) -> tuple[list[float], list[float]]: - """Partition response_speed_latencies into (with_tool_calls, no_tool_calls). + """Partition turn_taking per_turn_latency values into (with_tool_calls, no_tool_calls). - The i-th latency corresponds to the i-th user turn in chronological order. - We look at the conversation_trace to find which turn_ids contain at least - one tool_call entry. + Reads metrics/turn_taking/details/per_turn_latency from the record's + metrics.json, then checks conversation_trace to determine which turn_ids + had at least one tool call. Returns: (with_tool_latencies, no_tool_latencies) """ - trace = context.conversation_trace or [] + if not context.output_dir: + return [], [] - user_turn_ids = sorted({entry["turn_id"] for entry in trace if entry.get("type") == "transcribed"}) - tool_call_turn_ids = {entry["turn_id"] for entry in trace if entry.get("type") == "tool_call"} + metrics_path = Path(context.output_dir) / "metrics.json" + if not metrics_path.exists(): + return [], [] + + with open(metrics_path) as f: + data = json.load(f) + + per_turn_latency: dict[str, float] = ( + data.get("metrics", {}).get("turn_taking", {}).get("details", {}).get("per_turn_latency", {}) + ) + if not per_turn_latency: + return [], [] + + tool_call_turn_ids = { + entry["turn_id"] for entry in (context.conversation_trace or []) if entry.get("type") == "tool_call" + } with_tool: list[float] = [] no_tool: list[float] = [] - - for i, latency in enumerate(context.response_speed_latencies): - if i >= len(user_turn_ids): - break - if user_turn_ids[i] in tool_call_turn_ids: + for turn_id_str, latency in per_turn_latency.items(): + if int(turn_id_str) in 
tool_call_turn_ids: with_tool.append(latency) else: no_tool.append(latency) @@ -58,14 +72,6 @@ def _get_latencies(self, context: MetricContext) -> tuple[list[float], str]: async def compute(self, context: MetricContext) -> MetricScore: try: - if not context.response_speed_latencies: - return MetricScore( - name=self.name, - score=0.0, - normalized_score=None, - error="No response latencies available (UserBotLatencyObserver data missing)", - ) - latencies, empty_error = self._get_latencies(context) if not latencies: @@ -118,7 +124,8 @@ class ResponseSpeedMetric(_ResponseSpeedBase): """Response speed metric. Measures the elapsed time between the end of the user's utterance - and the beginning of the assistant's response. + and the beginning of the assistant's response, using Pipecat's + UserBotLatencyObserver measurements. Reports raw latency values in seconds — no normalization applied. @@ -130,14 +137,18 @@ class ResponseSpeedMetric(_ResponseSpeedBase): description = "Debug metric: latency between user utterance end and assistant response start" def _get_latencies(self, context: MetricContext) -> tuple[list[float], str]: - return context.response_speed_latencies, "No valid response speeds computed" + return ( + context.response_speed_latencies, + "No response latencies available (UserBotLatencyObserver data missing)", + ) @register_metric class ResponseSpeedWithToolCallsMetric(_ResponseSpeedBase): """Response speed restricted to turns where the assistant made at least one tool call. - Computed the same way as response_speed but only over tool-call turns. + Uses per_turn_latency from the turn_taking metric and filters to turns + that contain a tool_call entry in the conversation trace. This is a diagnostic metric not used in final evaluation scores. 
""" @@ -145,15 +156,16 @@ class ResponseSpeedWithToolCallsMetric(_ResponseSpeedBase): description = "Debug metric: response latency for turns that included a tool call" def _get_latencies(self, context: MetricContext) -> tuple[list[float], str]: - with_tool, _ = _split_latencies_by_tool_calls(context) - return with_tool, "No turns with tool calls found" + with_tool, _ = _split_turn_taking_latencies_by_tool_calls(context) + return with_tool, "No turns with tool calls found (or turn_taking latency data unavailable)" @register_metric class ResponseSpeedNoToolCallsMetric(_ResponseSpeedBase): """Response speed restricted to turns where the assistant made no tool calls. - Computed the same way as response_speed but only over non-tool-call turns. + Uses per_turn_latency from the turn_taking metric and filters to turns + that contain no tool_call entry in the conversation trace. This is a diagnostic metric not used in final evaluation scores. """ @@ -161,5 +173,5 @@ class ResponseSpeedNoToolCallsMetric(_ResponseSpeedBase): description = "Debug metric: response latency for turns that did not include a tool call" def _get_latencies(self, context: MetricContext) -> tuple[list[float], str]: - _, no_tool = _split_latencies_by_tool_calls(context) - return no_tool, "No turns without tool calls found" + _, no_tool = _split_turn_taking_latencies_by_tool_calls(context) + return no_tool, "No turns without tool calls found (or turn_taking latency data unavailable)" From ccef0956562758899b53a50a2ed79c3426f041b6 Mon Sep 17 00:00:00 2001 From: "fanny.riols" Date: Tue, 14 Apr 2026 14:43:20 -0400 Subject: [PATCH 6/9] Add unit tests for ResponseSpeedWithToolCallsMetric and ResponseSpeedNoToolCallsMetric Tests cover: missing output_dir, missing metrics.json, missing turn_taking data, no tool-call turns, all tool-call turns, mixed turns (correct split), invalid latency filtering, and an exhaustiveness check that with_tool + no_tool latencies together equal the full per_turn_latency set. 
--- tests/unit/metrics/test_response_speed.py | 232 +++++++++++++++++++++- 1 file changed, 231 insertions(+), 1 deletion(-) diff --git a/tests/unit/metrics/test_response_speed.py b/tests/unit/metrics/test_response_speed.py index 8cb3ecfc..343e73ee 100644 --- a/tests/unit/metrics/test_response_speed.py +++ b/tests/unit/metrics/test_response_speed.py @@ -1,11 +1,51 @@ """Tests for the ResponseSpeedMetric.""" +import json + import pytest -from eva.metrics.diagnostic.response_speed import ResponseSpeedMetric +from eva.metrics.diagnostic.response_speed import ( + ResponseSpeedMetric, + ResponseSpeedNoToolCallsMetric, + ResponseSpeedWithToolCallsMetric, +) from .conftest import make_metric_context +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _write_metrics_json(tmp_path, per_turn_latency: dict) -> None: + """Write a minimal metrics.json with turn_taking per_turn_latency data.""" + data = { + "metrics": { + "turn_taking": { + "details": { + "per_turn_latency": per_turn_latency, + } + } + } + } + (tmp_path / "metrics.json").write_text(json.dumps(data)) + + +def _make_trace(tool_call_turn_ids: set[int], all_turn_ids: set[int]) -> list[dict]: + """Build a minimal conversation_trace with the given turn structure.""" + trace = [] + for tid in sorted(all_turn_ids): + trace.append({"turn_id": tid, "type": "transcribed", "content": "user utterance"}) + if tid in tool_call_turn_ids: + trace.append({"turn_id": tid, "type": "tool_call", "tool_name": "some_tool"}) + trace.append({"turn_id": tid, "type": "tool_response", "tool_name": "some_tool"}) + return trace + + +# --------------------------------------------------------------------------- +# ResponseSpeedMetric +# --------------------------------------------------------------------------- + class TestResponseSpeedMetric: @pytest.mark.asyncio @@ -91,3 +131,193 @@ async def 
test_single_latency_value(self): assert result.details["max_speed_seconds"] == pytest.approx(0.75) assert result.details["num_turns"] == 1 assert result.details["per_turn_speeds"] == [0.75] + + +# --------------------------------------------------------------------------- +# ResponseSpeedWithToolCallsMetric +# --------------------------------------------------------------------------- + + +class TestResponseSpeedWithToolCallsMetric: + @pytest.mark.asyncio + async def test_no_output_dir(self): + """Missing output_dir returns error.""" + metric = ResponseSpeedWithToolCallsMetric() + ctx = make_metric_context() + + result = await metric.compute(ctx) + + assert result.score == 0.0 + assert result.error is not None + + @pytest.mark.asyncio + async def test_missing_metrics_json(self, tmp_path): + """output_dir exists but has no metrics.json — returns error.""" + metric = ResponseSpeedWithToolCallsMetric() + ctx = make_metric_context(output_dir=tmp_path) + + result = await metric.compute(ctx) + + assert result.score == 0.0 + assert result.error is not None + + @pytest.mark.asyncio + async def test_missing_turn_taking_data(self, tmp_path): + """metrics.json exists but has no turn_taking entry — returns error.""" + (tmp_path / "metrics.json").write_text(json.dumps({"metrics": {}})) + metric = ResponseSpeedWithToolCallsMetric() + ctx = make_metric_context(output_dir=tmp_path) + + result = await metric.compute(ctx) + + assert result.score == 0.0 + assert result.error is not None + + @pytest.mark.asyncio + async def test_no_turns_with_tool_calls(self, tmp_path): + """Record has no tool-call turns — returns 'not found' error.""" + _write_metrics_json(tmp_path, {"1": 1.0, "2": 2.0, "3": 3.0}) + trace = _make_trace(tool_call_turn_ids=set(), all_turn_ids={1, 2, 3}) + metric = ResponseSpeedWithToolCallsMetric() + ctx = make_metric_context(output_dir=tmp_path, conversation_trace=trace) + + result = await metric.compute(ctx) + + assert result.score == 0.0 + assert result.error is not 
None + assert "No turns with tool calls" in result.error + + @pytest.mark.asyncio + async def test_mixed_turns(self, tmp_path): + """Correctly includes only tool-call turn latencies.""" + _write_metrics_json(tmp_path, {"1": 1.0, "2": 5.0, "3": 3.0, "4": 7.0}) + # Turns 2 and 4 have tool calls + trace = _make_trace(tool_call_turn_ids={2, 4}, all_turn_ids={1, 2, 3, 4}) + metric = ResponseSpeedWithToolCallsMetric() + ctx = make_metric_context(output_dir=tmp_path, conversation_trace=trace) + + result = await metric.compute(ctx) + + assert result.error is None + assert result.details["num_turns"] == 2 + assert result.score == pytest.approx((5.0 + 7.0) / 2) + assert result.details["max_speed_seconds"] == pytest.approx(7.0) + assert result.details["per_turn_speeds"] == [5.0, 7.0] + + @pytest.mark.asyncio + async def test_all_turns_have_tool_calls(self, tmp_path): + """When every turn has a tool call, all latencies are included.""" + _write_metrics_json(tmp_path, {"1": 2.0, "2": 4.0}) + trace = _make_trace(tool_call_turn_ids={1, 2}, all_turn_ids={1, 2}) + metric = ResponseSpeedWithToolCallsMetric() + ctx = make_metric_context(output_dir=tmp_path, conversation_trace=trace) + + result = await metric.compute(ctx) + + assert result.error is None + assert result.details["num_turns"] == 2 + assert result.score == pytest.approx(3.0) + + @pytest.mark.asyncio + async def test_filters_invalid_latency_values(self, tmp_path): + """Sanity filter (0 < x < 1000) applies to per_turn_latency values.""" + _write_metrics_json(tmp_path, {"1": -1.0, "2": 5.0, "3": 2000.0, "4": 3.0}) + trace = _make_trace(tool_call_turn_ids={1, 2, 3, 4}, all_turn_ids={1, 2, 3, 4}) + metric = ResponseSpeedWithToolCallsMetric() + ctx = make_metric_context(output_dir=tmp_path, conversation_trace=trace) + + result = await metric.compute(ctx) + + assert result.error is None + assert result.details["num_turns"] == 2 # only 5.0 and 3.0 pass + assert result.score == pytest.approx((5.0 + 3.0) / 2) + + +# 
--------------------------------------------------------------------------- +# ResponseSpeedNoToolCallsMetric +# --------------------------------------------------------------------------- + + +class TestResponseSpeedNoToolCallsMetric: + @pytest.mark.asyncio + async def test_no_output_dir(self): + """Missing output_dir returns error.""" + metric = ResponseSpeedNoToolCallsMetric() + ctx = make_metric_context() + + result = await metric.compute(ctx) + + assert result.score == 0.0 + assert result.error is not None + + @pytest.mark.asyncio + async def test_missing_metrics_json(self, tmp_path): + """output_dir exists but has no metrics.json — returns error.""" + metric = ResponseSpeedNoToolCallsMetric() + ctx = make_metric_context(output_dir=tmp_path) + + result = await metric.compute(ctx) + + assert result.score == 0.0 + assert result.error is not None + + @pytest.mark.asyncio + async def test_all_turns_have_tool_calls(self, tmp_path): + """Every turn has a tool call — no-tool bucket is empty.""" + _write_metrics_json(tmp_path, {"1": 2.0, "2": 4.0}) + trace = _make_trace(tool_call_turn_ids={1, 2}, all_turn_ids={1, 2}) + metric = ResponseSpeedNoToolCallsMetric() + ctx = make_metric_context(output_dir=tmp_path, conversation_trace=trace) + + result = await metric.compute(ctx) + + assert result.score == 0.0 + assert result.error is not None + assert "No turns without tool calls" in result.error + + @pytest.mark.asyncio + async def test_mixed_turns(self, tmp_path): + """Correctly includes only non-tool-call turn latencies.""" + _write_metrics_json(tmp_path, {"1": 1.0, "2": 5.0, "3": 3.0, "4": 7.0}) + # Turns 2 and 4 have tool calls; turns 1 and 3 do not + trace = _make_trace(tool_call_turn_ids={2, 4}, all_turn_ids={1, 2, 3, 4}) + metric = ResponseSpeedNoToolCallsMetric() + ctx = make_metric_context(output_dir=tmp_path, conversation_trace=trace) + + result = await metric.compute(ctx) + + assert result.error is None + assert result.details["num_turns"] == 2 + assert 
result.score == pytest.approx((1.0 + 3.0) / 2) + assert result.details["max_speed_seconds"] == pytest.approx(3.0) + assert result.details["per_turn_speeds"] == [1.0, 3.0] + + @pytest.mark.asyncio + async def test_no_turns_with_tool_calls(self, tmp_path): + """Record with no tool-call turns — all latencies included.""" + _write_metrics_json(tmp_path, {"1": 1.0, "2": 2.0, "3": 3.0}) + trace = _make_trace(tool_call_turn_ids=set(), all_turn_ids={1, 2, 3}) + metric = ResponseSpeedNoToolCallsMetric() + ctx = make_metric_context(output_dir=tmp_path, conversation_trace=trace) + + result = await metric.compute(ctx) + + assert result.error is None + assert result.details["num_turns"] == 3 + assert result.score == pytest.approx(2.0) + + @pytest.mark.asyncio + async def test_with_and_no_tool_split_is_exhaustive(self, tmp_path): + """with_tool + no_tool latencies together cover all per_turn_latency values.""" + per_turn = {"1": 1.0, "2": 5.0, "3": 3.0, "4": 7.0, "5": 2.0} + _write_metrics_json(tmp_path, per_turn) + trace = _make_trace(tool_call_turn_ids={2, 4}, all_turn_ids={1, 2, 3, 4, 5}) + + ctx_with = make_metric_context(output_dir=tmp_path, conversation_trace=trace) + ctx_no = make_metric_context(output_dir=tmp_path, conversation_trace=trace) + + result_with = await ResponseSpeedWithToolCallsMetric().compute(ctx_with) + result_no = await ResponseSpeedNoToolCallsMetric().compute(ctx_no) + + combined = result_with.details["per_turn_speeds"] + result_no.details["per_turn_speeds"] + assert sorted(combined) == sorted(per_turn.values()) From a8c92047dada4cbb3f84c3be18af72f9141cc333 Mon Sep 17 00:00:00 2001 From: "fanny.riols" Date: Wed, 15 Apr 2026 09:27:04 -0400 Subject: [PATCH 7/9] Fold response_speed tool-call breakdown into details instead of separate metrics --- apps/analysis.py | 2 +- src/eva/metrics/diagnostic/response_speed.py | 143 ++++------- tests/unit/metrics/test_response_speed.py | 251 ++++++------------- 3 files changed, 133 insertions(+), 263 deletions(-) diff 
--git a/apps/analysis.py b/apps/analysis.py index 9bb15520..4e651752 100644 --- a/apps/analysis.py +++ b/apps/analysis.py @@ -75,7 +75,7 @@ def _build_metric_group_map() -> dict[str, str]: "Other": "#AAAAAA", } -_NON_NORMALIZED_METRICS = {"response_speed", "response_speed_with_tool_calls", "response_speed_no_tool_calls"} +_NON_NORMALIZED_METRICS = {"response_speed"} # EVA composite scores to show in the bar chart _EVA_BAR_COMPOSITES = ["EVA-A_pass", "EVA-X_pass", "EVA-A_mean", "EVA-X_mean"] diff --git a/src/eva/metrics/diagnostic/response_speed.py b/src/eva/metrics/diagnostic/response_speed.py index 224ebc49..1b2d5bee 100644 --- a/src/eva/metrics/diagnostic/response_speed.py +++ b/src/eva/metrics/diagnostic/response_speed.py @@ -5,7 +5,6 @@ """ import json -from abc import abstractmethod from pathlib import Path from eva.metrics.base import CodeMetric, MetricContext @@ -13,34 +12,32 @@ from eva.models.results import MetricScore -def _split_turn_taking_latencies_by_tool_calls( - context: MetricContext, -) -> tuple[list[float], list[float]]: - """Partition turn_taking per_turn_latency values into (with_tool_calls, no_tool_calls). +def _load_per_turn_latency(context: MetricContext) -> dict[str, float]: + """Load turn_taking per_turn_latency from the record's metrics.json. - Reads metrics/turn_taking/details/per_turn_latency from the record's - metrics.json, then checks conversation_trace to determine which turn_ids - had at least one tool call. - - Returns: - (with_tool_latencies, no_tool_latencies) + Returns an empty dict if the data is unavailable. 
""" if not context.output_dir: - return [], [] + return {} metrics_path = Path(context.output_dir) / "metrics.json" if not metrics_path.exists(): - return [], [] + return {} with open(metrics_path) as f: data = json.load(f) - per_turn_latency: dict[str, float] = ( - data.get("metrics", {}).get("turn_taking", {}).get("details", {}).get("per_turn_latency", {}) - ) - if not per_turn_latency: - return [], [] + return data.get("metrics", {}).get("turn_taking", {}).get("details", {}).get("per_turn_latency", {}) + + +def _split_by_tool_calls( + per_turn_latency: dict[str, float], + context: MetricContext, +) -> tuple[list[float], list[float]]: + """Partition per_turn_latency values into (with_tool_calls, no_tool_calls). + Checks conversation_trace to determine which turn_ids had at least one tool call. + """ tool_call_turn_ids = { entry["turn_id"] for entry in (context.conversation_trace or []) if entry.get("type") == "tool_call" } @@ -56,35 +53,59 @@ def _split_turn_taking_latencies_by_tool_calls( return with_tool, no_tool -class _ResponseSpeedBase(CodeMetric): - """Base class for response-speed metrics. +def _compute_speed_stats(latencies: list[float]) -> dict | None: + """Compute summary stats for a list of latencies, applying the sanity filter. - Subclasses implement `_get_latencies` to return the subset of latencies - to compute over; everything else is shared. + Returns None if no valid values remain after filtering. """ + valid = [v for v in latencies if 0 < v < 1000] + if not valid: + return None + return { + "mean_speed_seconds": round(sum(valid) / len(valid), 3), + "max_speed_seconds": round(max(valid), 3), + "num_turns": len(valid), + "per_turn_speeds": [round(v, 3) for v in valid], + } + +@register_metric +class ResponseSpeedMetric(CodeMetric): + """Response speed metric. + + Measures the elapsed time between the end of the user's utterance + and the beginning of the assistant's response, using per_turn_latency + from the turn_taking metric. 
+ + Reports raw latency values in seconds — no normalization applied. + + Details include a breakdown by turns with and without tool calls. + + This is a diagnostic metric used for diagnosing model performance issues. + It is not directly used in final evaluation scores. + """ + + name = "response_speed" category = "diagnostic" + description = "Diagnostic metric: latency between user utterance end and assistant response start" exclude_from_pass_at_k = True - @abstractmethod - def _get_latencies(self, context: MetricContext) -> tuple[list[float], str]: - """Return (latencies, error_if_empty) for this metric variant.""" - async def compute(self, context: MetricContext) -> MetricScore: try: - latencies, empty_error = self._get_latencies(context) + per_turn_latency = _load_per_turn_latency(context) - if not latencies: + if not per_turn_latency: return MetricScore( name=self.name, score=0.0, normalized_score=None, - error=empty_error, + error="No response latencies available (turn_taking per_turn_latency data missing)", ) + all_latencies = list(per_turn_latency.values()) speeds = [] per_turn_speeds = [] - for latency in latencies: + for latency in all_latencies: if 0 < latency < 1000: speeds.append(latency) per_turn_speeds.append(round(latency, 3)) @@ -103,6 +124,8 @@ async def compute(self, context: MetricContext) -> MetricScore: mean_speed = sum(speeds) / len(speeds) + with_tool, no_tool = _split_by_tool_calls(per_turn_latency, context) + return MetricScore( name=self.name, score=round(mean_speed, 3), @@ -112,66 +135,10 @@ async def compute(self, context: MetricContext) -> MetricScore: "max_speed_seconds": round(max(speeds), 3), "num_turns": len(speeds), "per_turn_speeds": per_turn_speeds, + "with_tool_calls": _compute_speed_stats(with_tool), + "no_tool_calls": _compute_speed_stats(no_tool), }, ) except Exception as e: return self._handle_error(e, context) - - -@register_metric -class ResponseSpeedMetric(_ResponseSpeedBase): - """Response speed metric. 
- - Measures the elapsed time between the end of the user's utterance - and the beginning of the assistant's response, using Pipecat's - UserBotLatencyObserver measurements. - - Reports raw latency values in seconds — no normalization applied. - - This is a diagnostic metric used for diagnosing model performance issues. - It is not directly used in final evaluation scores. - """ - - name = "response_speed" - description = "Debug metric: latency between user utterance end and assistant response start" - - def _get_latencies(self, context: MetricContext) -> tuple[list[float], str]: - return ( - context.response_speed_latencies, - "No response latencies available (UserBotLatencyObserver data missing)", - ) - - -@register_metric -class ResponseSpeedWithToolCallsMetric(_ResponseSpeedBase): - """Response speed restricted to turns where the assistant made at least one tool call. - - Uses per_turn_latency from the turn_taking metric and filters to turns - that contain a tool_call entry in the conversation trace. - This is a diagnostic metric not used in final evaluation scores. - """ - - name = "response_speed_with_tool_calls" - description = "Debug metric: response latency for turns that included a tool call" - - def _get_latencies(self, context: MetricContext) -> tuple[list[float], str]: - with_tool, _ = _split_turn_taking_latencies_by_tool_calls(context) - return with_tool, "No turns with tool calls found (or turn_taking latency data unavailable)" - - -@register_metric -class ResponseSpeedNoToolCallsMetric(_ResponseSpeedBase): - """Response speed restricted to turns where the assistant made no tool calls. - - Uses per_turn_latency from the turn_taking metric and filters to turns - that contain no tool_call entry in the conversation trace. - This is a diagnostic metric not used in final evaluation scores. 
- """ - - name = "response_speed_no_tool_calls" - description = "Debug metric: response latency for turns that did not include a tool call" - - def _get_latencies(self, context: MetricContext) -> tuple[list[float], str]: - _, no_tool = _split_turn_taking_latencies_by_tool_calls(context) - return no_tool, "No turns without tool calls found (or turn_taking latency data unavailable)" diff --git a/tests/unit/metrics/test_response_speed.py b/tests/unit/metrics/test_response_speed.py index 343e73ee..b9369a6e 100644 --- a/tests/unit/metrics/test_response_speed.py +++ b/tests/unit/metrics/test_response_speed.py @@ -4,11 +4,7 @@ import pytest -from eva.metrics.diagnostic.response_speed import ( - ResponseSpeedMetric, - ResponseSpeedNoToolCallsMetric, - ResponseSpeedWithToolCallsMetric, -) +from eva.metrics.diagnostic.response_speed import ResponseSpeedMetric from .conftest import make_metric_context @@ -49,10 +45,10 @@ def _make_trace(tool_call_turn_ids: set[int], all_turn_ids: set[int]) -> list[di class TestResponseSpeedMetric: @pytest.mark.asyncio - async def test_no_latencies_none(self): - """None latencies returns error.""" + async def test_no_output_dir(self): + """Missing output_dir returns error — no per_turn_latency data.""" metric = ResponseSpeedMetric() - ctx = make_metric_context(response_speed_latencies=None) + ctx = make_metric_context() result = await metric.compute(ctx) @@ -60,13 +56,13 @@ async def test_no_latencies_none(self): assert result.score == 0.0 assert result.normalized_score is None assert result.error is not None - assert "No response latencies" in result.error + assert "turn_taking" in result.error @pytest.mark.asyncio - async def test_no_latencies_empty(self): - """Empty list returns error.""" + async def test_missing_metrics_json(self, tmp_path): + """output_dir exists but has no metrics.json — returns error.""" metric = ResponseSpeedMetric() - ctx = make_metric_context(response_speed_latencies=[]) + ctx = 
make_metric_context(output_dir=tmp_path) result = await metric.compute(ctx) @@ -74,10 +70,23 @@ async def test_no_latencies_empty(self): assert result.error is not None @pytest.mark.asyncio - async def test_valid_latencies(self): - """Valid latencies produce correct mean, max, and per-turn details.""" + async def test_missing_turn_taking_data(self, tmp_path): + """metrics.json exists but has no turn_taking entry — returns error.""" + (tmp_path / "metrics.json").write_text(json.dumps({"metrics": {}})) metric = ResponseSpeedMetric() - ctx = make_metric_context(response_speed_latencies=[1.0, 2.0, 3.0]) + ctx = make_metric_context(output_dir=tmp_path) + + result = await metric.compute(ctx) + + assert result.score == 0.0 + assert result.error is not None + + @pytest.mark.asyncio + async def test_valid_latencies(self, tmp_path): + """Valid per_turn_latency produces correct mean, max, and per-turn details.""" + _write_metrics_json(tmp_path, {"1": 1.0, "2": 2.0, "3": 3.0}) + metric = ResponseSpeedMetric() + ctx = make_metric_context(output_dir=tmp_path) result = await metric.compute(ctx) @@ -87,29 +96,28 @@ async def test_valid_latencies(self): assert result.details["mean_speed_seconds"] == pytest.approx(2.0) assert result.details["max_speed_seconds"] == pytest.approx(3.0) assert result.details["num_turns"] == 3 - assert result.details["per_turn_speeds"] == [1.0, 2.0, 3.0] @pytest.mark.asyncio - async def test_filters_invalid_values(self): + async def test_filters_invalid_values(self, tmp_path): """Negative and >1000s values are filtered out.""" + _write_metrics_json(tmp_path, {"1": -1.0, "2": 0.5, "3": 1500.0, "4": 2.5, "5": 0.0}) metric = ResponseSpeedMetric() - ctx = make_metric_context(response_speed_latencies=[-1.0, 0.5, 1500.0, 2.5, 0.0]) + ctx = make_metric_context(output_dir=tmp_path) result = await metric.compute(ctx) # Only 0.5 and 2.5 are valid (0 < x < 1000); 0.0 is excluded (not > 0) assert result.error is None assert result.details["num_turns"] == 2 - 
expected_mean = (0.5 + 2.5) / 2 - assert result.score == pytest.approx(expected_mean) + assert result.score == pytest.approx((0.5 + 2.5) / 2) assert result.details["max_speed_seconds"] == pytest.approx(2.5) - assert result.details["per_turn_speeds"] == [0.5, 2.5] @pytest.mark.asyncio - async def test_all_latencies_filtered_out(self): + async def test_all_latencies_filtered_out(self, tmp_path): """When all values are invalid, returns error.""" + _write_metrics_json(tmp_path, {"1": -5.0, "2": 0.0, "3": 2000.0}) metric = ResponseSpeedMetric() - ctx = make_metric_context(response_speed_latencies=[-5.0, 0.0, 2000.0]) + ctx = make_metric_context(output_dir=tmp_path) result = await metric.compute(ctx) @@ -119,10 +127,11 @@ async def test_all_latencies_filtered_out(self): assert "No valid response speeds" in result.error @pytest.mark.asyncio - async def test_single_latency_value(self): + async def test_single_latency_value(self, tmp_path): """Single valid latency works correctly.""" + _write_metrics_json(tmp_path, {"1": 0.75}) metric = ResponseSpeedMetric() - ctx = make_metric_context(response_speed_latencies=[0.75]) + ctx = make_metric_context(output_dir=tmp_path) result = await metric.compute(ctx) @@ -132,179 +141,72 @@ async def test_single_latency_value(self): assert result.details["num_turns"] == 1 assert result.details["per_turn_speeds"] == [0.75] - -# --------------------------------------------------------------------------- -# ResponseSpeedWithToolCallsMetric -# --------------------------------------------------------------------------- - - -class TestResponseSpeedWithToolCallsMetric: @pytest.mark.asyncio - async def test_no_output_dir(self): - """Missing output_dir returns error.""" - metric = ResponseSpeedWithToolCallsMetric() - ctx = make_metric_context() - - result = await metric.compute(ctx) - - assert result.score == 0.0 - assert result.error is not None - - @pytest.mark.asyncio - async def test_missing_metrics_json(self, tmp_path): - """output_dir exists 
but has no metrics.json — returns error.""" - metric = ResponseSpeedWithToolCallsMetric() - ctx = make_metric_context(output_dir=tmp_path) - - result = await metric.compute(ctx) - - assert result.score == 0.0 - assert result.error is not None - - @pytest.mark.asyncio - async def test_missing_turn_taking_data(self, tmp_path): - """metrics.json exists but has no turn_taking entry — returns error.""" - (tmp_path / "metrics.json").write_text(json.dumps({"metrics": {}})) - metric = ResponseSpeedWithToolCallsMetric() + async def test_no_tool_call_breakdown_without_trace(self, tmp_path): + """with_tool_calls is None and no_tool_calls covers all turns when trace is absent.""" + _write_metrics_json(tmp_path, {"1": 1.0, "2": 2.0}) + metric = ResponseSpeedMetric() ctx = make_metric_context(output_dir=tmp_path) result = await metric.compute(ctx) - assert result.score == 0.0 - assert result.error is not None - - @pytest.mark.asyncio - async def test_no_turns_with_tool_calls(self, tmp_path): - """Record has no tool-call turns — returns 'not found' error.""" - _write_metrics_json(tmp_path, {"1": 1.0, "2": 2.0, "3": 3.0}) - trace = _make_trace(tool_call_turn_ids=set(), all_turn_ids={1, 2, 3}) - metric = ResponseSpeedWithToolCallsMetric() - ctx = make_metric_context(output_dir=tmp_path, conversation_trace=trace) - - result = await metric.compute(ctx) - - assert result.score == 0.0 - assert result.error is not None - assert "No turns with tool calls" in result.error + assert result.error is None + # No trace → no tool call turn ids → all turns go into no_tool bucket + assert result.details["with_tool_calls"] is None + assert result.details["no_tool_calls"] is not None + assert result.details["no_tool_calls"]["num_turns"] == 2 @pytest.mark.asyncio - async def test_mixed_turns(self, tmp_path): - """Correctly includes only tool-call turn latencies.""" + async def test_tool_call_breakdown_mixed_turns(self, tmp_path): + """with_tool_calls and no_tool_calls sub-fields reflect the correct 
split.""" _write_metrics_json(tmp_path, {"1": 1.0, "2": 5.0, "3": 3.0, "4": 7.0}) - # Turns 2 and 4 have tool calls trace = _make_trace(tool_call_turn_ids={2, 4}, all_turn_ids={1, 2, 3, 4}) - metric = ResponseSpeedWithToolCallsMetric() + metric = ResponseSpeedMetric() ctx = make_metric_context(output_dir=tmp_path, conversation_trace=trace) result = await metric.compute(ctx) assert result.error is None - assert result.details["num_turns"] == 2 - assert result.score == pytest.approx((5.0 + 7.0) / 2) - assert result.details["max_speed_seconds"] == pytest.approx(7.0) - assert result.details["per_turn_speeds"] == [5.0, 7.0] - - @pytest.mark.asyncio - async def test_all_turns_have_tool_calls(self, tmp_path): - """When every turn has a tool call, all latencies are included.""" + with_tc = result.details["with_tool_calls"] + no_tc = result.details["no_tool_calls"] + assert with_tc is not None + assert no_tc is not None + assert with_tc["num_turns"] == 2 + assert with_tc["mean_speed_seconds"] == pytest.approx((5.0 + 7.0) / 2) + assert with_tc["max_speed_seconds"] == pytest.approx(7.0) + assert no_tc["num_turns"] == 2 + assert no_tc["mean_speed_seconds"] == pytest.approx((1.0 + 3.0) / 2) + assert no_tc["max_speed_seconds"] == pytest.approx(3.0) + + @pytest.mark.asyncio + async def test_tool_call_breakdown_all_tool_turns(self, tmp_path): + """no_tool_calls is None when every turn has a tool call.""" _write_metrics_json(tmp_path, {"1": 2.0, "2": 4.0}) trace = _make_trace(tool_call_turn_ids={1, 2}, all_turn_ids={1, 2}) - metric = ResponseSpeedWithToolCallsMetric() + metric = ResponseSpeedMetric() ctx = make_metric_context(output_dir=tmp_path, conversation_trace=trace) result = await metric.compute(ctx) assert result.error is None - assert result.details["num_turns"] == 2 - assert result.score == pytest.approx(3.0) + assert result.details["with_tool_calls"] is not None + assert result.details["with_tool_calls"]["num_turns"] == 2 + assert result.details["no_tool_calls"] is None 
@pytest.mark.asyncio - async def test_filters_invalid_latency_values(self, tmp_path): - """Sanity filter (0 < x < 1000) applies to per_turn_latency values.""" + async def test_tool_call_breakdown_filters_invalid_latencies(self, tmp_path): + """Sanity filter (0 < x < 1000) applies within the breakdown sub-fields.""" _write_metrics_json(tmp_path, {"1": -1.0, "2": 5.0, "3": 2000.0, "4": 3.0}) trace = _make_trace(tool_call_turn_ids={1, 2, 3, 4}, all_turn_ids={1, 2, 3, 4}) - metric = ResponseSpeedWithToolCallsMetric() - ctx = make_metric_context(output_dir=tmp_path, conversation_trace=trace) - - result = await metric.compute(ctx) - - assert result.error is None - assert result.details["num_turns"] == 2 # only 5.0 and 3.0 pass - assert result.score == pytest.approx((5.0 + 3.0) / 2) - - -# --------------------------------------------------------------------------- -# ResponseSpeedNoToolCallsMetric -# --------------------------------------------------------------------------- - - -class TestResponseSpeedNoToolCallsMetric: - @pytest.mark.asyncio - async def test_no_output_dir(self): - """Missing output_dir returns error.""" - metric = ResponseSpeedNoToolCallsMetric() - ctx = make_metric_context() - - result = await metric.compute(ctx) - - assert result.score == 0.0 - assert result.error is not None - - @pytest.mark.asyncio - async def test_missing_metrics_json(self, tmp_path): - """output_dir exists but has no metrics.json — returns error.""" - metric = ResponseSpeedNoToolCallsMetric() - ctx = make_metric_context(output_dir=tmp_path) - - result = await metric.compute(ctx) - - assert result.score == 0.0 - assert result.error is not None - - @pytest.mark.asyncio - async def test_all_turns_have_tool_calls(self, tmp_path): - """Every turn has a tool call — no-tool bucket is empty.""" - _write_metrics_json(tmp_path, {"1": 2.0, "2": 4.0}) - trace = _make_trace(tool_call_turn_ids={1, 2}, all_turn_ids={1, 2}) - metric = ResponseSpeedNoToolCallsMetric() - ctx = 
make_metric_context(output_dir=tmp_path, conversation_trace=trace) - - result = await metric.compute(ctx) - - assert result.score == 0.0 - assert result.error is not None - assert "No turns without tool calls" in result.error - - @pytest.mark.asyncio - async def test_mixed_turns(self, tmp_path): - """Correctly includes only non-tool-call turn latencies.""" - _write_metrics_json(tmp_path, {"1": 1.0, "2": 5.0, "3": 3.0, "4": 7.0}) - # Turns 2 and 4 have tool calls; turns 1 and 3 do not - trace = _make_trace(tool_call_turn_ids={2, 4}, all_turn_ids={1, 2, 3, 4}) - metric = ResponseSpeedNoToolCallsMetric() - ctx = make_metric_context(output_dir=tmp_path, conversation_trace=trace) - - result = await metric.compute(ctx) - - assert result.error is None - assert result.details["num_turns"] == 2 - assert result.score == pytest.approx((1.0 + 3.0) / 2) - assert result.details["max_speed_seconds"] == pytest.approx(3.0) - assert result.details["per_turn_speeds"] == [1.0, 3.0] - - @pytest.mark.asyncio - async def test_no_turns_with_tool_calls(self, tmp_path): - """Record with no tool-call turns — all latencies included.""" - _write_metrics_json(tmp_path, {"1": 1.0, "2": 2.0, "3": 3.0}) - trace = _make_trace(tool_call_turn_ids=set(), all_turn_ids={1, 2, 3}) - metric = ResponseSpeedNoToolCallsMetric() + metric = ResponseSpeedMetric() ctx = make_metric_context(output_dir=tmp_path, conversation_trace=trace) result = await metric.compute(ctx) assert result.error is None - assert result.details["num_turns"] == 3 - assert result.score == pytest.approx(2.0) + with_tc = result.details["with_tool_calls"] + assert with_tc is not None + assert with_tc["num_turns"] == 2 # only 5.0 and 3.0 pass the filter @pytest.mark.asyncio async def test_with_and_no_tool_split_is_exhaustive(self, tmp_path): @@ -312,12 +214,13 @@ async def test_with_and_no_tool_split_is_exhaustive(self, tmp_path): per_turn = {"1": 1.0, "2": 5.0, "3": 3.0, "4": 7.0, "5": 2.0} _write_metrics_json(tmp_path, per_turn) trace = 
_make_trace(tool_call_turn_ids={2, 4}, all_turn_ids={1, 2, 3, 4, 5}) + metric = ResponseSpeedMetric() + ctx = make_metric_context(output_dir=tmp_path, conversation_trace=trace) - ctx_with = make_metric_context(output_dir=tmp_path, conversation_trace=trace) - ctx_no = make_metric_context(output_dir=tmp_path, conversation_trace=trace) - - result_with = await ResponseSpeedWithToolCallsMetric().compute(ctx_with) - result_no = await ResponseSpeedNoToolCallsMetric().compute(ctx_no) + result = await metric.compute(ctx) - combined = result_with.details["per_turn_speeds"] + result_no.details["per_turn_speeds"] + assert result.error is None + combined = ( + result.details["with_tool_calls"]["per_turn_speeds"] + result.details["no_tool_calls"]["per_turn_speeds"] + ) assert sorted(combined) == sorted(per_turn.values()) From 796c638edd96d8cd2486c32192f1e6d22cfe045d Mon Sep 17 00:00:00 2001 From: "fanny.riols" Date: Wed, 15 Apr 2026 09:47:26 -0400 Subject: [PATCH 8/9] Show response_speed with/no tool call breakdown in diagnostic table and metrics_summary --- apps/analysis.py | 21 ++++++++++++++++++++- src/eva/metrics/runner.py | 26 ++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/apps/analysis.py b/apps/analysis.py index 40516a78..aba9c101 100644 --- a/apps/analysis.py +++ b/apps/analysis.py @@ -44,6 +44,9 @@ def _build_metric_group_map() -> dict[str, str]: _METRIC_GROUP: dict[str, str] = _build_metric_group_map() +# Synthetic columns derived from response_speed details sub-fields +_METRIC_GROUP["response_speed_with_tool_calls"] = "Diagnostic" +_METRIC_GROUP["response_speed_no_tool_calls"] = "Diagnostic" # Ordered categories for display; anything not listed sorts to the end _CATEGORY_ORDER = ["Accuracy", "Experience", "Conversation Quality", "Diagnostic", "Validation"] @@ -76,7 +79,7 @@ def _build_metric_group_map() -> dict[str, str]: "Other": "#AAAAAA", } -_NON_NORMALIZED_METRICS = {"response_speed"} +_NON_NORMALIZED_METRICS = 
{"response_speed", "response_speed_with_tool_calls", "response_speed_no_tool_calls"} # EVA composite scores to show in the bar chart _EVA_BAR_COMPOSITES = ["EVA-A_pass", "EVA-X_pass", "EVA-A_mean", "EVA-X_mean"] @@ -545,6 +548,15 @@ def _collect_run_metrics(run_dir: Path) -> tuple[list[dict], list[str]]: else metric_score.score ) + if metric_name == "response_speed" and metric_score.details: + details = metric_score.details + with_tc = details.get("with_tool_calls") or {} + no_tc = details.get("no_tool_calls") or {} + row["response_speed_with_tool_calls"] = with_tc.get("mean_speed_seconds") + row["response_speed_no_tool_calls"] = no_tc.get("mean_speed_seconds") + all_metric_names.add("response_speed_with_tool_calls") + all_metric_names.add("response_speed_no_tool_calls") + rows.append(row) return rows, sorted(all_metric_names) @@ -970,6 +982,13 @@ def render_cross_run_comparison(run_dirs: list[Path]): for m, stats in per_metric.items(): if stats.get("mean") is not None: summary[m] = stats["mean"] + # Expose response_speed sub-field means as synthetic columns + for sub_key in ("with_tool_calls", "no_tool_calls"): + sub = stats.get(sub_key) + if sub and sub.get("mean") is not None: + col = f"{m}_{sub_key}" + summary[col] = sub["mean"] + all_metric_names.add(col) # Add EVA composite scores from overall_scores overall = metrics_summary.get("overall_scores", {}) for composite in _EVA_BAR_COMPOSITES: diff --git a/src/eva/metrics/runner.py b/src/eva/metrics/runner.py index 2d808d7a..892fcbe7 100644 --- a/src/eva/metrics/runner.py +++ b/src/eva/metrics/runner.py @@ -620,6 +620,32 @@ def _build_per_metric_aggregates( "count": count, } + # Nest with/without tool call breakdowns inside the response_speed aggregate + if "response_speed" in metric_names and "response_speed" in metric_aggregates: + for sub_key in ("with_tool_calls", "no_tool_calls"): + sub_scores: list[float] = [] + sub_missing = 0 + for record_metrics in all_metrics.values(): + rs = 
record_metrics.metrics.get("response_speed") + if rs is None or rs.error is not None: + sub_missing += 1 + continue + sub_details = (rs.details or {}).get(sub_key) + if sub_details and sub_details.get("mean_speed_seconds") is not None: + sub_scores.append(sub_details["mean_speed_seconds"]) + else: + sub_missing += 1 + if sub_scores or sub_missing > 0: + metric_aggregates["response_speed"][sub_key] = { + "mean": round(sum(sub_scores) / len(sub_scores), 4) if sub_scores else None, + "min": round(min(sub_scores), 4) if sub_scores else None, + "max": round(max(sub_scores), 4) if sub_scores else None, + "count": len(sub_scores), + "none_count": sub_missing, + "missing_count": sub_missing, + "total_records": total_records, + } + return metric_aggregates @staticmethod From 4b60c309ab370e72751f2c71b2de4c8382a7d384 Mon Sep 17 00:00:00 2001 From: Fanny Riols Date: Thu, 16 Apr 2026 09:24:27 -0400 Subject: [PATCH 9/9] Update response_speed.py --- src/eva/metrics/diagnostic/response_speed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/eva/metrics/diagnostic/response_speed.py b/src/eva/metrics/diagnostic/response_speed.py index 1b2d5bee..7dce04f6 100644 --- a/src/eva/metrics/diagnostic/response_speed.py +++ b/src/eva/metrics/diagnostic/response_speed.py @@ -106,7 +106,7 @@ async def compute(self, context: MetricContext) -> MetricScore: speeds = [] per_turn_speeds = [] for latency in all_latencies: - if 0 < latency < 1000: + if latency is not None and 0 < latency < 1000: speeds.append(latency) per_turn_speeds.append(round(latency, 3)) else: