From 8985189e80ee09edf543f3a1ff39e43327963d0b Mon Sep 17 00:00:00 2001
From: "fanny.riols" <fanny.riols@servicenow.com>
Date: Tue, 14 Apr 2026 12:28:38 -0400
Subject: [PATCH 1/9] Add response_speed_with_tool_calls and
 response_speed_no_tool_calls metrics

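Adds two filtered variants of the existing response_speed diagnostic
metric, selected by whether the assistant made a tool call in the turn;
the original response_speed metric is kept. Each latency is mapped to its
turn by position (the i-th latency belongs to the i-th user turn found in
conversation_trace), and a turn counts as a tool-call turn when
conversation_trace has a tool_call entry with that turn_id.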

Shared logic (sanity filtering, mean/max, MetricScore construction) is
extracted into a _ResponseSpeedBase class; each variant only implements
_get_latencies(). Bumps metrics_version to 0.1.2.
---
 pyproject.toml                               |   2 +-
 src/eva/__init__.py                          |   2 +-
 src/eva/metrics/diagnostic/response_speed.py | 134 +++++++++++++++----
 3 files changed, 111 insertions(+), 27 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 561cba2d..47827e98 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -101,7 +101,7 @@ ignore = ["D203", "D206", "D213", "D400", "D401", "D413", "D415", "E1", "E501"]
 simulation_version = "0.1.0"
 # Bump when metrics pipeline changes (metrics code, judge prompts, pricing,
 # postprocessor). Old metric results become stale — cheap to recompute.
-metrics_version = "0.1.1"
+metrics_version = "0.1.2"
 
 [tool.mypy]
 python_version = "3.11"
diff --git a/src/eva/__init__.py b/src/eva/__init__.py
index 6796f4aa..03f1f13b 100644
--- a/src/eva/__init__.py
+++ b/src/eva/__init__.py
@@ -11,4 +11,4 @@
 
 # Bump metrics_version when changes affect metric computation (metrics code,
 # judge prompts, pricing tables, postprocessor).
-metrics_version = "0.1.1"
+metrics_version = "0.1.2"
diff --git a/src/eva/metrics/diagnostic/response_speed.py b/src/eva/metrics/diagnostic/response_speed.py
index 0dd4fb53..68160a60 100644
--- a/src/eva/metrics/diagnostic/response_speed.py
+++ b/src/eva/metrics/diagnostic/response_speed.py
@@ -4,33 +4,60 @@
 final evaluation scores.
 """
 
+from abc import abstractmethod
+
 from eva.metrics.base import CodeMetric, MetricContext
 from eva.metrics.registry import register_metric
 from eva.models.results import MetricScore
 
 
-@register_metric
-class ResponseSpeedMetric(CodeMetric):
-    """Response speed metric.
+def _split_latencies_by_tool_calls(
+    context: MetricContext,
+) -> tuple[list[float], list[float]]:
+    """Partition response_speed_latencies into (with_tool_calls, no_tool_calls).
 
-    Measures the elapsed time between the end of the user's utterance
-    and the beginning of the assistant's response.
+    The i-th latency corresponds to the i-th user turn in chronological order.
+    We look at the conversation_trace to find which turn_ids contain at least
+    one tool_call entry.
 
-    Reports raw latency values in seconds — no normalization applied.
+    Returns:
+        (with_tool_latencies, no_tool_latencies)
+    """
+    trace = context.conversation_trace or []
 
-    This is a diagnostic metric used for diagnosing model performance issues.
-    It is not directly used in final evaluation scores.
+    user_turn_ids = sorted({entry["turn_id"] for entry in trace if entry.get("type") == "transcribed"})
+    tool_call_turn_ids = {entry["turn_id"] for entry in trace if entry.get("type") == "tool_call"}
+
+    with_tool: list[float] = []
+    no_tool: list[float] = []
+
+    for i, latency in enumerate(context.response_speed_latencies):
+        if i >= len(user_turn_ids):
+            break
+        if user_turn_ids[i] in tool_call_turn_ids:
+            with_tool.append(latency)
+        else:
+            no_tool.append(latency)
+
+    return with_tool, no_tool
+
+
+class _ResponseSpeedBase(CodeMetric):
+    """Base class for response-speed metrics.
+
+    Subclasses implement `_get_latencies` to return the subset of latencies
+    to compute over; everything else is shared.
     """
 
-    name = "response_speed"
-    description = "Debug metric: latency between user utterance end and assistant response start"
     category = "diagnostic"
     exclude_from_pass_at_k = True
 
+    @abstractmethod
+    def _get_latencies(self, context: MetricContext) -> tuple[list[float], str]:
+        """Return (latencies, error_if_empty) for this metric variant."""
+
     async def compute(self, context: MetricContext) -> MetricScore:
-        """Compute response speed from Pipecat's UserBotLatencyObserver measurements."""
         try:
-            # Check if we have response speed latencies from UserBotLatencyObserver
             if not context.response_speed_latencies:
                 return MetricScore(
                     name=self.name,
@@ -39,19 +66,25 @@ async def compute(self, context: MetricContext) -> MetricScore:
                     error="No response latencies available (UserBotLatencyObserver data missing)",
                 )
 
-            # Use latencies measured by Pipecat's UserBotLatencyObserver
-            # These measure the time from user stopped speaking to assistant started speaking
+            latencies, empty_error = self._get_latencies(context)
+
+            if not latencies:
+                return MetricScore(
+                    name=self.name,
+                    score=0.0,
+                    normalized_score=None,
+                    error=empty_error,
+                )
+
             speeds = []
             per_turn_speeds = []
-
-            for response_speed in context.response_speed_latencies:
-                # Filter out invalid values (negative or extremely large)
-                if 0 < response_speed < 1000:  # Sanity check: under 1000 seconds
-                    speeds.append(response_speed)
-                    per_turn_speeds.append(round(response_speed, 3))
+            for latency in latencies:
+                if 0 < latency < 1000:
+                    speeds.append(latency)
+                    per_turn_speeds.append(round(latency, 3))
                 else:
                     self.logger.warning(
-                        f"[{context.record_id}] Unusual response speed detected and dropped: {response_speed} seconds"
+                        f"[{context.record_id}] Unusual response speed detected and dropped: {latency} seconds"
                     )
 
             if not speeds:
@@ -63,15 +96,14 @@ async def compute(self, context: MetricContext) -> MetricScore:
                 )
 
             mean_speed = sum(speeds) / len(speeds)
-            max_speed = max(speeds)
 
             return MetricScore(
                 name=self.name,
-                score=round(mean_speed, 3),  # Mean response speed in seconds
-                normalized_score=None,  # Raw latency in seconds; not normalizable to [0,1]
+                score=round(mean_speed, 3),
+                normalized_score=None,
                 details={
                     "mean_speed_seconds": round(mean_speed, 3),
-                    "max_speed_seconds": round(max_speed, 3),
+                    "max_speed_seconds": round(max(speeds), 3),
                     "num_turns": len(speeds),
                     "per_turn_speeds": per_turn_speeds,
                 },
@@ -79,3 +111,55 @@ async def compute(self, context: MetricContext) -> MetricScore:
 
         except Exception as e:
             return self._handle_error(e, context)
+
+
+@register_metric
+class ResponseSpeedMetric(_ResponseSpeedBase):
+    """Response speed metric.
+
+    Measures the elapsed time between the end of the user's utterance
+    and the beginning of the assistant's response.
+
+    Reports raw latency values in seconds — no normalization applied.
+
+    This is a diagnostic metric used for diagnosing model performance issues.
+    It is not directly used in final evaluation scores.
+    """
+
+    name = "response_speed"
+    description = "Debug metric: latency between user utterance end and assistant response start"
+
+    def _get_latencies(self, context: MetricContext) -> tuple[list[float], str]:
+        return context.response_speed_latencies, "No valid response speeds computed"
+
+
+@register_metric
+class ResponseSpeedWithToolCallsMetric(_ResponseSpeedBase):
+    """Response speed restricted to turns where the assistant made at least one tool call.
+
+    Computed the same way as response_speed but only over tool-call turns.
+    This is a diagnostic metric not used in final evaluation scores.
+    """
+
+    name = "response_speed_with_tool_calls"
+    description = "Debug metric: response latency for turns that included a tool call"
+
+    def _get_latencies(self, context: MetricContext) -> tuple[list[float], str]:
+        with_tool, _ = _split_latencies_by_tool_calls(context)
+        return with_tool, "No turns with tool calls found"
+
+
+@register_metric
+class ResponseSpeedNoToolCallsMetric(_ResponseSpeedBase):
+    """Response speed restricted to turns where the assistant made no tool calls.
+
+    Computed the same way as response_speed but only over non-tool-call turns.
+    This is a diagnostic metric not used in final evaluation scores.
+    """
+
+    name = "response_speed_no_tool_calls"
+    description = "Debug metric: response latency for turns that did not include a tool call"
+
+    def _get_latencies(self, context: MetricContext) -> tuple[list[float], str]:
+        _, no_tool = _split_latencies_by_tool_calls(context)
+        return no_tool, "No turns without tool calls found"

From 7886625710b48724ce9cdeb8e696d1efcab5139b Mon Sep 17 00:00:00 2001
From: "fanny.riols" <fanny.riols@servicenow.com>
Date: Tue, 14 Apr 2026 12:28:52 -0400
Subject: [PATCH 2/9] Warn instead of raise when unused deployments are missing
 from EVA_MODEL_LIST

When restoring redacted secrets in apply_env_overrides, skip deployments
that are not present in the current environment's EVA_MODEL_LIST rather
than raising a ValueError. Only raise if the missing deployment is the
active LLM for this run. This allows metrics-only reruns in environments
that don't have every deployment from the original run configured.
---
 src/eva/models/config.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/src/eva/models/config.py b/src/eva/models/config.py
index e08783bd..f3885c54 100644
--- a/src/eva/models/config.py
+++ b/src/eva/models/config.py
@@ -648,10 +648,17 @@ def apply_env_overrides(self, live: "RunConfig") -> None:
             if not has_redacted:
                 continue
             if name not in live_by_name:
-                raise ValueError(
-                    f"Cannot restore secrets: deployment {name!r} not found in "
-                    f"current EVA_MODEL_LIST (available: {list(live_by_name)})"
+                active_llm = getattr(self.model, "llm", None)
+                if name == active_llm:
+                    raise ValueError(
+                        f"Cannot restore secrets: deployment {name!r} not found in "
+                        f"current EVA_MODEL_LIST (available: {list(live_by_name)})"
+                    )
+                logger.warning(
+                    f"Deployment {name!r} has redacted secrets but is not in the current "
+                    f"EVA_MODEL_LIST — skipping (not used in this run)."
                 )
+                continue
             live_params = live_by_name[name].get("litellm_params", {})
             for key, value in saved_params.items():
                 if value == "***" and key in live_params:

From b2bc3654387fadc0cb5c37f32c4d954bcbc1fa7b Mon Sep 17 00:00:00 2001
From: "fanny.riols" <fanny.riols@servicenow.com>
Date: Tue, 14 Apr 2026 12:29:13 -0400
Subject: [PATCH 3/9] Fall back to output_dir when stored log paths no longer
 exist

Adds _resolve_path() helper that returns the stored path if it exists on
disk, otherwise falls back to output_dir/<filename>. Used in _build_history
for pipecat_logs.jsonl and elevenlabs_events.jsonl so that metric reruns
work correctly when a run directory has been moved from its original location.
---
 src/eva/metrics/processor.py | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/src/eva/metrics/processor.py b/src/eva/metrics/processor.py
index 660e7ce9..94aa6e44 100644
--- a/src/eva/metrics/processor.py
+++ b/src/eva/metrics/processor.py
@@ -24,6 +24,19 @@
 
 logger = get_logger(__name__)
 
+
+def _resolve_path(stored: str | None, fallback: Path) -> str | Path:
+    """Return *stored* if it exists on disk, otherwise *fallback*.
+
+    Allows metrics to re-run correctly when a run directory has been moved:
+    the stored path reflects the original location, but the file is now at
+    *fallback* (i.e. output_dir / filename).
+    """
+    if stored and Path(stored).exists():
+        return stored
+    return fallback
+
+
 # Elevenlabs audio user field → _ProcessorContext attribute name
 AUDIO_ATTR = {
     "pipecat_agent": "audio_timestamps_assistant_turns",
@@ -824,8 +837,10 @@ def _build_history(
         Each entry: {timestamp_ms, source, event_type, data}.
         """
         history = self._load_audit_log_transcript(output_dir)
-        history.extend(self._load_pipecat_logs(result.pipecat_logs_path))
-        history.extend(self._load_elevenlabs_logs(result.elevenlabs_logs_path))
+        pipecat_path = _resolve_path(result.pipecat_logs_path, output_dir / "pipecat_logs.jsonl")
+        history.extend(self._load_pipecat_logs(pipecat_path))
+        elevenlabs_path = _resolve_path(result.elevenlabs_logs_path, output_dir / "elevenlabs_events.jsonl")
+        history.extend(self._load_elevenlabs_logs(elevenlabs_path))
 
         history.sort(key=lambda e: e["timestamp_ms"])
         context.history = history

From e99a8a444113d4a9b22ffcf1ff33210567228d7f Mon Sep 17 00:00:00 2001
From: "fanny.riols" <fanny.riols@servicenow.com>
Date: Tue, 14 Apr 2026 12:33:10 -0400
Subject: [PATCH 4/9] Show response_speed_with_tool_calls and
 response_speed_no_tool_calls in analysis app

Adds both new metrics to _NON_NORMALIZED_METRICS so they are rendered as
standalone seconds bar charts alongside response_speed. Category grouping,
color, and table sorting are handled dynamically via the metric registry.
---
 apps/analysis.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/apps/analysis.py b/apps/analysis.py
index 4e651752..9bb15520 100644
--- a/apps/analysis.py
+++ b/apps/analysis.py
@@ -75,7 +75,7 @@ def _build_metric_group_map() -> dict[str, str]:
     "Other": "#AAAAAA",
 }
 
-_NON_NORMALIZED_METRICS = {"response_speed"}
+_NON_NORMALIZED_METRICS = {"response_speed", "response_speed_with_tool_calls", "response_speed_no_tool_calls"}
 
 # EVA composite scores to show in the bar chart
 _EVA_BAR_COMPOSITES = ["EVA-A_pass", "EVA-X_pass", "EVA-A_mean", "EVA-X_mean"]

From b26b79aa00bbfe5694c563f5b22cd0d95448d796 Mon Sep 17 00:00:00 2001
From: "fanny.riols" <fanny.riols@servicenow.com>
Date: Tue, 14 Apr 2026 14:32:22 -0400
Subject: [PATCH 5/9] Use turn_taking per_turn_latency as the data source for
 filtered response speed metrics
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The filtered variants now read metrics/turn_taking/details/per_turn_latency
from the record's metrics.json instead of using context.response_speed_latencies.
This gives a direct turn_id → latency mapping, avoiding the index-based
alignment that was previously needed to correlate latencies with tool calls.

The base response_speed metric is unchanged (still uses UserBotLatencyObserver).
---
 src/eva/metrics/diagnostic/response_speed.py | 70 ++++++++++++--------
 1 file changed, 41 insertions(+), 29 deletions(-)

diff --git a/src/eva/metrics/diagnostic/response_speed.py b/src/eva/metrics/diagnostic/response_speed.py
index 68160a60..224ebc49 100644
--- a/src/eva/metrics/diagnostic/response_speed.py
+++ b/src/eva/metrics/diagnostic/response_speed.py
@@ -4,37 +4,51 @@
 final evaluation scores.
 """
 
+import json
 from abc import abstractmethod
+from pathlib import Path
 
 from eva.metrics.base import CodeMetric, MetricContext
 from eva.metrics.registry import register_metric
 from eva.models.results import MetricScore
 
 
-def _split_latencies_by_tool_calls(
+def _split_turn_taking_latencies_by_tool_calls(
     context: MetricContext,
 ) -> tuple[list[float], list[float]]:
-    """Partition response_speed_latencies into (with_tool_calls, no_tool_calls).
+    """Partition turn_taking per_turn_latency values into (with_tool_calls, no_tool_calls).
 
-    The i-th latency corresponds to the i-th user turn in chronological order.
-    We look at the conversation_trace to find which turn_ids contain at least
-    one tool_call entry.
+    Reads metrics/turn_taking/details/per_turn_latency from the record's
+    metrics.json, then checks conversation_trace to determine which turn_ids
+    had at least one tool call.
 
     Returns:
         (with_tool_latencies, no_tool_latencies)
     """
-    trace = context.conversation_trace or []
+    if not context.output_dir:
+        return [], []
 
-    user_turn_ids = sorted({entry["turn_id"] for entry in trace if entry.get("type") == "transcribed"})
-    tool_call_turn_ids = {entry["turn_id"] for entry in trace if entry.get("type") == "tool_call"}
+    metrics_path = Path(context.output_dir) / "metrics.json"
+    if not metrics_path.exists():
+        return [], []
+
+    with open(metrics_path) as f:
+        data = json.load(f)
+
+    per_turn_latency: dict[str, float] = (
+        data.get("metrics", {}).get("turn_taking", {}).get("details", {}).get("per_turn_latency", {})
+    )
+    if not per_turn_latency:
+        return [], []
+
+    tool_call_turn_ids = {
+        entry["turn_id"] for entry in (context.conversation_trace or []) if entry.get("type") == "tool_call"
+    }
 
     with_tool: list[float] = []
     no_tool: list[float] = []
-
-    for i, latency in enumerate(context.response_speed_latencies):
-        if i >= len(user_turn_ids):
-            break
-        if user_turn_ids[i] in tool_call_turn_ids:
+    for turn_id_str, latency in per_turn_latency.items():
+        if int(turn_id_str) in tool_call_turn_ids:
             with_tool.append(latency)
         else:
             no_tool.append(latency)
@@ -58,14 +72,6 @@ def _get_latencies(self, context: MetricContext) -> tuple[list[float], str]:
 
     async def compute(self, context: MetricContext) -> MetricScore:
         try:
-            if not context.response_speed_latencies:
-                return MetricScore(
-                    name=self.name,
-                    score=0.0,
-                    normalized_score=None,
-                    error="No response latencies available (UserBotLatencyObserver data missing)",
-                )
-
             latencies, empty_error = self._get_latencies(context)
 
             if not latencies:
@@ -118,7 +124,8 @@ class ResponseSpeedMetric(_ResponseSpeedBase):
     """Response speed metric.
 
     Measures the elapsed time between the end of the user's utterance
-    and the beginning of the assistant's response.
+    and the beginning of the assistant's response, using Pipecat's
+    UserBotLatencyObserver measurements.
 
     Reports raw latency values in seconds — no normalization applied.
 
@@ -130,14 +137,18 @@ class ResponseSpeedMetric(_ResponseSpeedBase):
     description = "Debug metric: latency between user utterance end and assistant response start"
 
     def _get_latencies(self, context: MetricContext) -> tuple[list[float], str]:
-        return context.response_speed_latencies, "No valid response speeds computed"
+        return (
+            context.response_speed_latencies,
+            "No response latencies available (UserBotLatencyObserver data missing)",
+        )
 
 
 @register_metric
 class ResponseSpeedWithToolCallsMetric(_ResponseSpeedBase):
     """Response speed restricted to turns where the assistant made at least one tool call.
 
-    Computed the same way as response_speed but only over tool-call turns.
+    Uses per_turn_latency from the turn_taking metric and filters to turns
+    that contain a tool_call entry in the conversation trace.
     This is a diagnostic metric not used in final evaluation scores.
     """
 
@@ -145,15 +156,16 @@ class ResponseSpeedWithToolCallsMetric(_ResponseSpeedBase):
     description = "Debug metric: response latency for turns that included a tool call"
 
     def _get_latencies(self, context: MetricContext) -> tuple[list[float], str]:
-        with_tool, _ = _split_latencies_by_tool_calls(context)
-        return with_tool, "No turns with tool calls found"
+        with_tool, _ = _split_turn_taking_latencies_by_tool_calls(context)
+        return with_tool, "No turns with tool calls found (or turn_taking latency data unavailable)"
 
 
 @register_metric
 class ResponseSpeedNoToolCallsMetric(_ResponseSpeedBase):
     """Response speed restricted to turns where the assistant made no tool calls.
 
-    Computed the same way as response_speed but only over non-tool-call turns.
+    Uses per_turn_latency from the turn_taking metric and filters to turns
+    that contain no tool_call entry in the conversation trace.
     This is a diagnostic metric not used in final evaluation scores.
     """
 
@@ -161,5 +173,5 @@ class ResponseSpeedNoToolCallsMetric(_ResponseSpeedBase):
     description = "Debug metric: response latency for turns that did not include a tool call"
 
     def _get_latencies(self, context: MetricContext) -> tuple[list[float], str]:
-        _, no_tool = _split_latencies_by_tool_calls(context)
-        return no_tool, "No turns without tool calls found"
+        _, no_tool = _split_turn_taking_latencies_by_tool_calls(context)
+        return no_tool, "No turns without tool calls found (or turn_taking latency data unavailable)"

From ccef0956562758899b53a50a2ed79c3426f041b6 Mon Sep 17 00:00:00 2001
From: "fanny.riols" <fanny.riols@servicenow.com>
Date: Tue, 14 Apr 2026 14:43:20 -0400
Subject: [PATCH 6/9] Add unit tests for ResponseSpeedWithToolCallsMetric and
 ResponseSpeedNoToolCallsMetric

Tests cover: missing output_dir, missing metrics.json, missing turn_taking
data, no tool-call turns, all tool-call turns, mixed turns (correct split),
invalid latency filtering, and an exhaustiveness check that with_tool +
no_tool latencies together equal the full per_turn_latency set.
---
 tests/unit/metrics/test_response_speed.py | 232 +++++++++++++++++++++-
 1 file changed, 231 insertions(+), 1 deletion(-)

diff --git a/tests/unit/metrics/test_response_speed.py b/tests/unit/metrics/test_response_speed.py
index 8cb3ecfc..343e73ee 100644
--- a/tests/unit/metrics/test_response_speed.py
+++ b/tests/unit/metrics/test_response_speed.py
@@ -1,11 +1,51 @@
 """Tests for the ResponseSpeedMetric."""
 
+import json
+
 import pytest
 
-from eva.metrics.diagnostic.response_speed import ResponseSpeedMetric
+from eva.metrics.diagnostic.response_speed import (
+    ResponseSpeedMetric,
+    ResponseSpeedNoToolCallsMetric,
+    ResponseSpeedWithToolCallsMetric,
+)
 
 from .conftest import make_metric_context
 
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _write_metrics_json(tmp_path, per_turn_latency: dict) -> None:
+    """Write a minimal metrics.json with turn_taking per_turn_latency data."""
+    data = {
+        "metrics": {
+            "turn_taking": {
+                "details": {
+                    "per_turn_latency": per_turn_latency,
+                }
+            }
+        }
+    }
+    (tmp_path / "metrics.json").write_text(json.dumps(data))
+
+
+def _make_trace(tool_call_turn_ids: set[int], all_turn_ids: set[int]) -> list[dict]:
+    """Build a minimal conversation_trace with the given turn structure."""
+    trace = []
+    for tid in sorted(all_turn_ids):
+        trace.append({"turn_id": tid, "type": "transcribed", "content": "user utterance"})
+        if tid in tool_call_turn_ids:
+            trace.append({"turn_id": tid, "type": "tool_call", "tool_name": "some_tool"})
+            trace.append({"turn_id": tid, "type": "tool_response", "tool_name": "some_tool"})
+    return trace
+
+
+# ---------------------------------------------------------------------------
+# ResponseSpeedMetric
+# ---------------------------------------------------------------------------
+
 
 class TestResponseSpeedMetric:
     @pytest.mark.asyncio
@@ -91,3 +131,193 @@ async def test_single_latency_value(self):
         assert result.details["max_speed_seconds"] == pytest.approx(0.75)
         assert result.details["num_turns"] == 1
         assert result.details["per_turn_speeds"] == [0.75]
+
+
+# ---------------------------------------------------------------------------
+# ResponseSpeedWithToolCallsMetric
+# ---------------------------------------------------------------------------
+
+
+class TestResponseSpeedWithToolCallsMetric:
+    @pytest.mark.asyncio
+    async def test_no_output_dir(self):
+        """Missing output_dir returns error."""
+        metric = ResponseSpeedWithToolCallsMetric()
+        ctx = make_metric_context()
+
+        result = await metric.compute(ctx)
+
+        assert result.score == 0.0
+        assert result.error is not None
+
+    @pytest.mark.asyncio
+    async def test_missing_metrics_json(self, tmp_path):
+        """output_dir exists but has no metrics.json — returns error."""
+        metric = ResponseSpeedWithToolCallsMetric()
+        ctx = make_metric_context(output_dir=tmp_path)
+
+        result = await metric.compute(ctx)
+
+        assert result.score == 0.0
+        assert result.error is not None
+
+    @pytest.mark.asyncio
+    async def test_missing_turn_taking_data(self, tmp_path):
+        """metrics.json exists but has no turn_taking entry — returns error."""
+        (tmp_path / "metrics.json").write_text(json.dumps({"metrics": {}}))
+        metric = ResponseSpeedWithToolCallsMetric()
+        ctx = make_metric_context(output_dir=tmp_path)
+
+        result = await metric.compute(ctx)
+
+        assert result.score == 0.0
+        assert result.error is not None
+
+    @pytest.mark.asyncio
+    async def test_no_turns_with_tool_calls(self, tmp_path):
+        """Record has no tool-call turns — returns 'not found' error."""
+        _write_metrics_json(tmp_path, {"1": 1.0, "2": 2.0, "3": 3.0})
+        trace = _make_trace(tool_call_turn_ids=set(), all_turn_ids={1, 2, 3})
+        metric = ResponseSpeedWithToolCallsMetric()
+        ctx = make_metric_context(output_dir=tmp_path, conversation_trace=trace)
+
+        result = await metric.compute(ctx)
+
+        assert result.score == 0.0
+        assert result.error is not None
+        assert "No turns with tool calls" in result.error
+
+    @pytest.mark.asyncio
+    async def test_mixed_turns(self, tmp_path):
+        """Correctly includes only tool-call turn latencies."""
+        _write_metrics_json(tmp_path, {"1": 1.0, "2": 5.0, "3": 3.0, "4": 7.0})
+        # Turns 2 and 4 have tool calls
+        trace = _make_trace(tool_call_turn_ids={2, 4}, all_turn_ids={1, 2, 3, 4})
+        metric = ResponseSpeedWithToolCallsMetric()
+        ctx = make_metric_context(output_dir=tmp_path, conversation_trace=trace)
+
+        result = await metric.compute(ctx)
+
+        assert result.error is None
+        assert result.details["num_turns"] == 2
+        assert result.score == pytest.approx((5.0 + 7.0) / 2)
+        assert result.details["max_speed_seconds"] == pytest.approx(7.0)
+        assert result.details["per_turn_speeds"] == [5.0, 7.0]
+
+    @pytest.mark.asyncio
+    async def test_all_turns_have_tool_calls(self, tmp_path):
+        """When every turn has a tool call, all latencies are included."""
+        _write_metrics_json(tmp_path, {"1": 2.0, "2": 4.0})
+        trace = _make_trace(tool_call_turn_ids={1, 2}, all_turn_ids={1, 2})
+        metric = ResponseSpeedWithToolCallsMetric()
+        ctx = make_metric_context(output_dir=tmp_path, conversation_trace=trace)
+
+        result = await metric.compute(ctx)
+
+        assert result.error is None
+        assert result.details["num_turns"] == 2
+        assert result.score == pytest.approx(3.0)
+
+    @pytest.mark.asyncio
+    async def test_filters_invalid_latency_values(self, tmp_path):
+        """Sanity filter (0 < x < 1000) applies to per_turn_latency values."""
+        _write_metrics_json(tmp_path, {"1": -1.0, "2": 5.0, "3": 2000.0, "4": 3.0})
+        trace = _make_trace(tool_call_turn_ids={1, 2, 3, 4}, all_turn_ids={1, 2, 3, 4})
+        metric = ResponseSpeedWithToolCallsMetric()
+        ctx = make_metric_context(output_dir=tmp_path, conversation_trace=trace)
+
+        result = await metric.compute(ctx)
+
+        assert result.error is None
+        assert result.details["num_turns"] == 2  # only 5.0 and 3.0 pass
+        assert result.score == pytest.approx((5.0 + 3.0) / 2)
+
+
+# ---------------------------------------------------------------------------
+# ResponseSpeedNoToolCallsMetric
+# ---------------------------------------------------------------------------
+
+
+class TestResponseSpeedNoToolCallsMetric:
+    @pytest.mark.asyncio
+    async def test_no_output_dir(self):
+        """Missing output_dir returns error."""
+        metric = ResponseSpeedNoToolCallsMetric()
+        ctx = make_metric_context()
+
+        result = await metric.compute(ctx)
+
+        assert result.score == 0.0
+        assert result.error is not None
+
+    @pytest.mark.asyncio
+    async def test_missing_metrics_json(self, tmp_path):
+        """output_dir exists but has no metrics.json — returns error."""
+        metric = ResponseSpeedNoToolCallsMetric()
+        ctx = make_metric_context(output_dir=tmp_path)
+
+        result = await metric.compute(ctx)
+
+        assert result.score == 0.0
+        assert result.error is not None
+
+    @pytest.mark.asyncio
+    async def test_all_turns_have_tool_calls(self, tmp_path):
+        """Every turn has a tool call — no-tool bucket is empty."""
+        _write_metrics_json(tmp_path, {"1": 2.0, "2": 4.0})
+        trace = _make_trace(tool_call_turn_ids={1, 2}, all_turn_ids={1, 2})
+        metric = ResponseSpeedNoToolCallsMetric()
+        ctx = make_metric_context(output_dir=tmp_path, conversation_trace=trace)
+
+        result = await metric.compute(ctx)
+
+        assert result.score == 0.0
+        assert result.error is not None
+        assert "No turns without tool calls" in result.error
+
+    @pytest.mark.asyncio
+    async def test_mixed_turns(self, tmp_path):
+        """Correctly includes only non-tool-call turn latencies."""
+        _write_metrics_json(tmp_path, {"1": 1.0, "2": 5.0, "3": 3.0, "4": 7.0})
+        # Turns 2 and 4 have tool calls; turns 1 and 3 do not
+        trace = _make_trace(tool_call_turn_ids={2, 4}, all_turn_ids={1, 2, 3, 4})
+        metric = ResponseSpeedNoToolCallsMetric()
+        ctx = make_metric_context(output_dir=tmp_path, conversation_trace=trace)
+
+        result = await metric.compute(ctx)
+
+        assert result.error is None
+        assert result.details["num_turns"] == 2
+        assert result.score == pytest.approx((1.0 + 3.0) / 2)
+        assert result.details["max_speed_seconds"] == pytest.approx(3.0)
+        assert result.details["per_turn_speeds"] == [1.0, 3.0]
+
+    @pytest.mark.asyncio
+    async def test_no_turns_with_tool_calls(self, tmp_path):
+        """Record with no tool-call turns — all latencies included."""
+        _write_metrics_json(tmp_path, {"1": 1.0, "2": 2.0, "3": 3.0})
+        trace = _make_trace(tool_call_turn_ids=set(), all_turn_ids={1, 2, 3})
+        metric = ResponseSpeedNoToolCallsMetric()
+        ctx = make_metric_context(output_dir=tmp_path, conversation_trace=trace)
+
+        result = await metric.compute(ctx)
+
+        assert result.error is None
+        assert result.details["num_turns"] == 3
+        assert result.score == pytest.approx(2.0)
+
+    @pytest.mark.asyncio
+    async def test_with_and_no_tool_split_is_exhaustive(self, tmp_path):
+        """with_tool + no_tool latencies together cover all per_turn_latency values."""
+        per_turn = {"1": 1.0, "2": 5.0, "3": 3.0, "4": 7.0, "5": 2.0}
+        _write_metrics_json(tmp_path, per_turn)
+        trace = _make_trace(tool_call_turn_ids={2, 4}, all_turn_ids={1, 2, 3, 4, 5})
+
+        ctx_with = make_metric_context(output_dir=tmp_path, conversation_trace=trace)
+        ctx_no = make_metric_context(output_dir=tmp_path, conversation_trace=trace)
+
+        result_with = await ResponseSpeedWithToolCallsMetric().compute(ctx_with)
+        result_no = await ResponseSpeedNoToolCallsMetric().compute(ctx_no)
+
+        combined = result_with.details["per_turn_speeds"] + result_no.details["per_turn_speeds"]
+        assert sorted(combined) == sorted(per_turn.values())

From a8c92047dada4cbb3f84c3be18af72f9141cc333 Mon Sep 17 00:00:00 2001
From: "fanny.riols" <fanny.riols@servicenow.com>
Date: Wed, 15 Apr 2026 09:27:04 -0400
Subject: [PATCH 7/9] Fold response_speed tool-call breakdown into details
 instead of separate metrics

---
 apps/analysis.py                             |   2 +-
 src/eva/metrics/diagnostic/response_speed.py | 143 ++++-------
 tests/unit/metrics/test_response_speed.py    | 251 ++++++-------------
 3 files changed, 133 insertions(+), 263 deletions(-)

diff --git a/apps/analysis.py b/apps/analysis.py
index 9bb15520..4e651752 100644
--- a/apps/analysis.py
+++ b/apps/analysis.py
@@ -75,7 +75,7 @@ def _build_metric_group_map() -> dict[str, str]:
     "Other": "#AAAAAA",
 }
 
-_NON_NORMALIZED_METRICS = {"response_speed", "response_speed_with_tool_calls", "response_speed_no_tool_calls"}
+_NON_NORMALIZED_METRICS = {"response_speed"}
 
 # EVA composite scores to show in the bar chart
 _EVA_BAR_COMPOSITES = ["EVA-A_pass", "EVA-X_pass", "EVA-A_mean", "EVA-X_mean"]
diff --git a/src/eva/metrics/diagnostic/response_speed.py b/src/eva/metrics/diagnostic/response_speed.py
index 224ebc49..1b2d5bee 100644
--- a/src/eva/metrics/diagnostic/response_speed.py
+++ b/src/eva/metrics/diagnostic/response_speed.py
@@ -5,7 +5,6 @@
 """
 
 import json
-from abc import abstractmethod
 from pathlib import Path
 
 from eva.metrics.base import CodeMetric, MetricContext
@@ -13,34 +12,32 @@
 from eva.models.results import MetricScore
 
 
-def _split_turn_taking_latencies_by_tool_calls(
-    context: MetricContext,
-) -> tuple[list[float], list[float]]:
-    """Partition turn_taking per_turn_latency values into (with_tool_calls, no_tool_calls).
+def _load_per_turn_latency(context: MetricContext) -> dict[str, float]:
+    """Load turn_taking per_turn_latency from the record's metrics.json.
 
-    Reads metrics/turn_taking/details/per_turn_latency from the record's
-    metrics.json, then checks conversation_trace to determine which turn_ids
-    had at least one tool call.
-
-    Returns:
-        (with_tool_latencies, no_tool_latencies)
+    Returns an empty dict if the data is unavailable.
     """
     if not context.output_dir:
-        return [], []
+        return {}
 
     metrics_path = Path(context.output_dir) / "metrics.json"
     if not metrics_path.exists():
-        return [], []
+        return {}
 
     with open(metrics_path) as f:
         data = json.load(f)
 
-    per_turn_latency: dict[str, float] = (
-        data.get("metrics", {}).get("turn_taking", {}).get("details", {}).get("per_turn_latency", {})
-    )
-    if not per_turn_latency:
-        return [], []
+    return data.get("metrics", {}).get("turn_taking", {}).get("details", {}).get("per_turn_latency", {})
+
+
+def _split_by_tool_calls(
+    per_turn_latency: dict[str, float],
+    context: MetricContext,
+) -> tuple[list[float], list[float]]:
+    """Partition per_turn_latency values into (with_tool_calls, no_tool_calls).
 
+    Checks conversation_trace to determine which turn_ids had at least one tool call.
+    """
     tool_call_turn_ids = {
         entry["turn_id"] for entry in (context.conversation_trace or []) if entry.get("type") == "tool_call"
     }
@@ -56,35 +53,59 @@ def _split_turn_taking_latencies_by_tool_calls(
     return with_tool, no_tool
 
 
-class _ResponseSpeedBase(CodeMetric):
-    """Base class for response-speed metrics.
+def _compute_speed_stats(latencies: list[float]) -> dict | None:
+    """Compute summary stats for a list of latencies, applying the sanity filter.
 
-    Subclasses implement `_get_latencies` to return the subset of latencies
-    to compute over; everything else is shared.
+    Returns None if no valid values remain after filtering; None entries are skipped.
     """
+    valid = [v for v in latencies if v is not None and 0 < v < 1000]
+    if not valid:
+        return None
+    return {
+        "mean_speed_seconds": round(sum(valid) / len(valid), 3),
+        "max_speed_seconds": round(max(valid), 3),
+        "num_turns": len(valid),
+        "per_turn_speeds": [round(v, 3) for v in valid],
+    }
 
+
+@register_metric
+class ResponseSpeedMetric(CodeMetric):
+    """Response speed metric.
+
+    Measures the elapsed time between the end of the user's utterance
+    and the beginning of the assistant's response, using per_turn_latency
+    from the turn_taking metric.
+
+    Reports raw latency values in seconds — no normalization applied.
+
+    Details include a breakdown by turns with and without tool calls.
+
+    This is a diagnostic metric used for diagnosing model performance issues.
+    It is not directly used in final evaluation scores.
+    """
+
+    name = "response_speed"
     category = "diagnostic"
+    description = "Diagnostic metric: latency between user utterance end and assistant response start"
     exclude_from_pass_at_k = True
 
-    @abstractmethod
-    def _get_latencies(self, context: MetricContext) -> tuple[list[float], str]:
-        """Return (latencies, error_if_empty) for this metric variant."""
-
     async def compute(self, context: MetricContext) -> MetricScore:
         try:
-            latencies, empty_error = self._get_latencies(context)
+            per_turn_latency = _load_per_turn_latency(context)
 
-            if not latencies:
+            if not per_turn_latency:
                 return MetricScore(
                     name=self.name,
                     score=0.0,
                     normalized_score=None,
-                    error=empty_error,
+                    error="No response latencies available (turn_taking per_turn_latency data missing)",
                 )
 
+            all_latencies = list(per_turn_latency.values())
             speeds = []
             per_turn_speeds = []
-            for latency in latencies:
+            for latency in all_latencies:
                 if 0 < latency < 1000:
                     speeds.append(latency)
                     per_turn_speeds.append(round(latency, 3))
@@ -103,6 +124,8 @@ async def compute(self, context: MetricContext) -> MetricScore:
 
             mean_speed = sum(speeds) / len(speeds)
 
+            with_tool, no_tool = _split_by_tool_calls(per_turn_latency, context)
+
             return MetricScore(
                 name=self.name,
                 score=round(mean_speed, 3),
@@ -112,66 +135,10 @@ async def compute(self, context: MetricContext) -> MetricScore:
                     "max_speed_seconds": round(max(speeds), 3),
                     "num_turns": len(speeds),
                     "per_turn_speeds": per_turn_speeds,
+                    "with_tool_calls": _compute_speed_stats(with_tool),
+                    "no_tool_calls": _compute_speed_stats(no_tool),
                 },
             )
 
         except Exception as e:
             return self._handle_error(e, context)
-
-
-@register_metric
-class ResponseSpeedMetric(_ResponseSpeedBase):
-    """Response speed metric.
-
-    Measures the elapsed time between the end of the user's utterance
-    and the beginning of the assistant's response, using Pipecat's
-    UserBotLatencyObserver measurements.
-
-    Reports raw latency values in seconds — no normalization applied.
-
-    This is a diagnostic metric used for diagnosing model performance issues.
-    It is not directly used in final evaluation scores.
-    """
-
-    name = "response_speed"
-    description = "Debug metric: latency between user utterance end and assistant response start"
-
-    def _get_latencies(self, context: MetricContext) -> tuple[list[float], str]:
-        return (
-            context.response_speed_latencies,
-            "No response latencies available (UserBotLatencyObserver data missing)",
-        )
-
-
-@register_metric
-class ResponseSpeedWithToolCallsMetric(_ResponseSpeedBase):
-    """Response speed restricted to turns where the assistant made at least one tool call.
-
-    Uses per_turn_latency from the turn_taking metric and filters to turns
-    that contain a tool_call entry in the conversation trace.
-    This is a diagnostic metric not used in final evaluation scores.
-    """
-
-    name = "response_speed_with_tool_calls"
-    description = "Debug metric: response latency for turns that included a tool call"
-
-    def _get_latencies(self, context: MetricContext) -> tuple[list[float], str]:
-        with_tool, _ = _split_turn_taking_latencies_by_tool_calls(context)
-        return with_tool, "No turns with tool calls found (or turn_taking latency data unavailable)"
-
-
-@register_metric
-class ResponseSpeedNoToolCallsMetric(_ResponseSpeedBase):
-    """Response speed restricted to turns where the assistant made no tool calls.
-
-    Uses per_turn_latency from the turn_taking metric and filters to turns
-    that contain no tool_call entry in the conversation trace.
-    This is a diagnostic metric not used in final evaluation scores.
-    """
-
-    name = "response_speed_no_tool_calls"
-    description = "Debug metric: response latency for turns that did not include a tool call"
-
-    def _get_latencies(self, context: MetricContext) -> tuple[list[float], str]:
-        _, no_tool = _split_turn_taking_latencies_by_tool_calls(context)
-        return no_tool, "No turns without tool calls found (or turn_taking latency data unavailable)"
diff --git a/tests/unit/metrics/test_response_speed.py b/tests/unit/metrics/test_response_speed.py
index 343e73ee..b9369a6e 100644
--- a/tests/unit/metrics/test_response_speed.py
+++ b/tests/unit/metrics/test_response_speed.py
@@ -4,11 +4,7 @@
 
 import pytest
 
-from eva.metrics.diagnostic.response_speed import (
-    ResponseSpeedMetric,
-    ResponseSpeedNoToolCallsMetric,
-    ResponseSpeedWithToolCallsMetric,
-)
+from eva.metrics.diagnostic.response_speed import ResponseSpeedMetric
 
 from .conftest import make_metric_context
 
@@ -49,10 +45,10 @@ def _make_trace(tool_call_turn_ids: set[int], all_turn_ids: set[int]) -> list[di
 
 class TestResponseSpeedMetric:
     @pytest.mark.asyncio
-    async def test_no_latencies_none(self):
-        """None latencies returns error."""
+    async def test_no_output_dir(self):
+        """Missing output_dir returns error — no per_turn_latency data."""
         metric = ResponseSpeedMetric()
-        ctx = make_metric_context(response_speed_latencies=None)
+        ctx = make_metric_context()
 
         result = await metric.compute(ctx)
 
@@ -60,13 +56,13 @@ async def test_no_latencies_none(self):
         assert result.score == 0.0
         assert result.normalized_score is None
         assert result.error is not None
-        assert "No response latencies" in result.error
+        assert "turn_taking" in result.error
 
     @pytest.mark.asyncio
-    async def test_no_latencies_empty(self):
-        """Empty list returns error."""
+    async def test_missing_metrics_json(self, tmp_path):
+        """output_dir exists but has no metrics.json — returns error."""
         metric = ResponseSpeedMetric()
-        ctx = make_metric_context(response_speed_latencies=[])
+        ctx = make_metric_context(output_dir=tmp_path)
 
         result = await metric.compute(ctx)
 
@@ -74,10 +70,23 @@ async def test_no_latencies_empty(self):
         assert result.error is not None
 
     @pytest.mark.asyncio
-    async def test_valid_latencies(self):
-        """Valid latencies produce correct mean, max, and per-turn details."""
+    async def test_missing_turn_taking_data(self, tmp_path):
+        """metrics.json exists but has no turn_taking entry — returns error."""
+        (tmp_path / "metrics.json").write_text(json.dumps({"metrics": {}}))
         metric = ResponseSpeedMetric()
-        ctx = make_metric_context(response_speed_latencies=[1.0, 2.0, 3.0])
+        ctx = make_metric_context(output_dir=tmp_path)
+
+        result = await metric.compute(ctx)
+
+        assert result.score == 0.0
+        assert result.error is not None
+
+    @pytest.mark.asyncio
+    async def test_valid_latencies(self, tmp_path):
+        """Valid per_turn_latency produces correct mean, max, and per-turn details."""
+        _write_metrics_json(tmp_path, {"1": 1.0, "2": 2.0, "3": 3.0})
+        metric = ResponseSpeedMetric()
+        ctx = make_metric_context(output_dir=tmp_path)
 
         result = await metric.compute(ctx)
 
@@ -87,29 +96,28 @@ async def test_valid_latencies(self):
         assert result.details["mean_speed_seconds"] == pytest.approx(2.0)
         assert result.details["max_speed_seconds"] == pytest.approx(3.0)
         assert result.details["num_turns"] == 3
-        assert result.details["per_turn_speeds"] == [1.0, 2.0, 3.0]
 
     @pytest.mark.asyncio
-    async def test_filters_invalid_values(self):
+    async def test_filters_invalid_values(self, tmp_path):
         """Negative and >1000s values are filtered out."""
+        _write_metrics_json(tmp_path, {"1": -1.0, "2": 0.5, "3": 1500.0, "4": 2.5, "5": 0.0})
         metric = ResponseSpeedMetric()
-        ctx = make_metric_context(response_speed_latencies=[-1.0, 0.5, 1500.0, 2.5, 0.0])
+        ctx = make_metric_context(output_dir=tmp_path)
 
         result = await metric.compute(ctx)
 
         # Only 0.5 and 2.5 are valid (0 < x < 1000); 0.0 is excluded (not > 0)
         assert result.error is None
         assert result.details["num_turns"] == 2
-        expected_mean = (0.5 + 2.5) / 2
-        assert result.score == pytest.approx(expected_mean)
+        assert result.score == pytest.approx((0.5 + 2.5) / 2)
         assert result.details["max_speed_seconds"] == pytest.approx(2.5)
-        assert result.details["per_turn_speeds"] == [0.5, 2.5]
 
     @pytest.mark.asyncio
-    async def test_all_latencies_filtered_out(self):
+    async def test_all_latencies_filtered_out(self, tmp_path):
         """When all values are invalid, returns error."""
+        _write_metrics_json(tmp_path, {"1": -5.0, "2": 0.0, "3": 2000.0})
         metric = ResponseSpeedMetric()
-        ctx = make_metric_context(response_speed_latencies=[-5.0, 0.0, 2000.0])
+        ctx = make_metric_context(output_dir=tmp_path)
 
         result = await metric.compute(ctx)
 
@@ -119,10 +127,11 @@ async def test_all_latencies_filtered_out(self):
         assert "No valid response speeds" in result.error
 
     @pytest.mark.asyncio
-    async def test_single_latency_value(self):
+    async def test_single_latency_value(self, tmp_path):
         """Single valid latency works correctly."""
+        _write_metrics_json(tmp_path, {"1": 0.75})
         metric = ResponseSpeedMetric()
-        ctx = make_metric_context(response_speed_latencies=[0.75])
+        ctx = make_metric_context(output_dir=tmp_path)
 
         result = await metric.compute(ctx)
 
@@ -132,179 +141,72 @@ async def test_single_latency_value(self):
         assert result.details["num_turns"] == 1
         assert result.details["per_turn_speeds"] == [0.75]
 
-
-# ---------------------------------------------------------------------------
-# ResponseSpeedWithToolCallsMetric
-# ---------------------------------------------------------------------------
-
-
-class TestResponseSpeedWithToolCallsMetric:
     @pytest.mark.asyncio
-    async def test_no_output_dir(self):
-        """Missing output_dir returns error."""
-        metric = ResponseSpeedWithToolCallsMetric()
-        ctx = make_metric_context()
-
-        result = await metric.compute(ctx)
-
-        assert result.score == 0.0
-        assert result.error is not None
-
-    @pytest.mark.asyncio
-    async def test_missing_metrics_json(self, tmp_path):
-        """output_dir exists but has no metrics.json — returns error."""
-        metric = ResponseSpeedWithToolCallsMetric()
-        ctx = make_metric_context(output_dir=tmp_path)
-
-        result = await metric.compute(ctx)
-
-        assert result.score == 0.0
-        assert result.error is not None
-
-    @pytest.mark.asyncio
-    async def test_missing_turn_taking_data(self, tmp_path):
-        """metrics.json exists but has no turn_taking entry — returns error."""
-        (tmp_path / "metrics.json").write_text(json.dumps({"metrics": {}}))
-        metric = ResponseSpeedWithToolCallsMetric()
+    async def test_no_tool_call_breakdown_without_trace(self, tmp_path):
+        """with_tool_calls is None and no_tool_calls covers all turns when trace is absent."""
+        _write_metrics_json(tmp_path, {"1": 1.0, "2": 2.0})
+        metric = ResponseSpeedMetric()
         ctx = make_metric_context(output_dir=tmp_path)
 
         result = await metric.compute(ctx)
 
-        assert result.score == 0.0
-        assert result.error is not None
-
-    @pytest.mark.asyncio
-    async def test_no_turns_with_tool_calls(self, tmp_path):
-        """Record has no tool-call turns — returns 'not found' error."""
-        _write_metrics_json(tmp_path, {"1": 1.0, "2": 2.0, "3": 3.0})
-        trace = _make_trace(tool_call_turn_ids=set(), all_turn_ids={1, 2, 3})
-        metric = ResponseSpeedWithToolCallsMetric()
-        ctx = make_metric_context(output_dir=tmp_path, conversation_trace=trace)
-
-        result = await metric.compute(ctx)
-
-        assert result.score == 0.0
-        assert result.error is not None
-        assert "No turns with tool calls" in result.error
+        assert result.error is None
+        # No trace → no tool call turn ids → all turns go into no_tool bucket
+        assert result.details["with_tool_calls"] is None
+        assert result.details["no_tool_calls"] is not None
+        assert result.details["no_tool_calls"]["num_turns"] == 2
 
     @pytest.mark.asyncio
-    async def test_mixed_turns(self, tmp_path):
-        """Correctly includes only tool-call turn latencies."""
+    async def test_tool_call_breakdown_mixed_turns(self, tmp_path):
+        """with_tool_calls and no_tool_calls sub-fields reflect the correct split."""
         _write_metrics_json(tmp_path, {"1": 1.0, "2": 5.0, "3": 3.0, "4": 7.0})
-        # Turns 2 and 4 have tool calls
         trace = _make_trace(tool_call_turn_ids={2, 4}, all_turn_ids={1, 2, 3, 4})
-        metric = ResponseSpeedWithToolCallsMetric()
+        metric = ResponseSpeedMetric()
         ctx = make_metric_context(output_dir=tmp_path, conversation_trace=trace)
 
         result = await metric.compute(ctx)
 
         assert result.error is None
-        assert result.details["num_turns"] == 2
-        assert result.score == pytest.approx((5.0 + 7.0) / 2)
-        assert result.details["max_speed_seconds"] == pytest.approx(7.0)
-        assert result.details["per_turn_speeds"] == [5.0, 7.0]
-
-    @pytest.mark.asyncio
-    async def test_all_turns_have_tool_calls(self, tmp_path):
-        """When every turn has a tool call, all latencies are included."""
+        with_tc = result.details["with_tool_calls"]
+        no_tc = result.details["no_tool_calls"]
+        assert with_tc is not None
+        assert no_tc is not None
+        assert with_tc["num_turns"] == 2
+        assert with_tc["mean_speed_seconds"] == pytest.approx((5.0 + 7.0) / 2)
+        assert with_tc["max_speed_seconds"] == pytest.approx(7.0)
+        assert no_tc["num_turns"] == 2
+        assert no_tc["mean_speed_seconds"] == pytest.approx((1.0 + 3.0) / 2)
+        assert no_tc["max_speed_seconds"] == pytest.approx(3.0)
+
+    @pytest.mark.asyncio
+    async def test_tool_call_breakdown_all_tool_turns(self, tmp_path):
+        """no_tool_calls is None when every turn has a tool call."""
         _write_metrics_json(tmp_path, {"1": 2.0, "2": 4.0})
         trace = _make_trace(tool_call_turn_ids={1, 2}, all_turn_ids={1, 2})
-        metric = ResponseSpeedWithToolCallsMetric()
+        metric = ResponseSpeedMetric()
         ctx = make_metric_context(output_dir=tmp_path, conversation_trace=trace)
 
         result = await metric.compute(ctx)
 
         assert result.error is None
-        assert result.details["num_turns"] == 2
-        assert result.score == pytest.approx(3.0)
+        assert result.details["with_tool_calls"] is not None
+        assert result.details["with_tool_calls"]["num_turns"] == 2
+        assert result.details["no_tool_calls"] is None
 
     @pytest.mark.asyncio
-    async def test_filters_invalid_latency_values(self, tmp_path):
-        """Sanity filter (0 < x < 1000) applies to per_turn_latency values."""
+    async def test_tool_call_breakdown_filters_invalid_latencies(self, tmp_path):
+        """Sanity filter (0 < x < 1000) applies within the breakdown sub-fields."""
         _write_metrics_json(tmp_path, {"1": -1.0, "2": 5.0, "3": 2000.0, "4": 3.0})
         trace = _make_trace(tool_call_turn_ids={1, 2, 3, 4}, all_turn_ids={1, 2, 3, 4})
-        metric = ResponseSpeedWithToolCallsMetric()
-        ctx = make_metric_context(output_dir=tmp_path, conversation_trace=trace)
-
-        result = await metric.compute(ctx)
-
-        assert result.error is None
-        assert result.details["num_turns"] == 2  # only 5.0 and 3.0 pass
-        assert result.score == pytest.approx((5.0 + 3.0) / 2)
-
-
-# ---------------------------------------------------------------------------
-# ResponseSpeedNoToolCallsMetric
-# ---------------------------------------------------------------------------
-
-
-class TestResponseSpeedNoToolCallsMetric:
-    @pytest.mark.asyncio
-    async def test_no_output_dir(self):
-        """Missing output_dir returns error."""
-        metric = ResponseSpeedNoToolCallsMetric()
-        ctx = make_metric_context()
-
-        result = await metric.compute(ctx)
-
-        assert result.score == 0.0
-        assert result.error is not None
-
-    @pytest.mark.asyncio
-    async def test_missing_metrics_json(self, tmp_path):
-        """output_dir exists but has no metrics.json — returns error."""
-        metric = ResponseSpeedNoToolCallsMetric()
-        ctx = make_metric_context(output_dir=tmp_path)
-
-        result = await metric.compute(ctx)
-
-        assert result.score == 0.0
-        assert result.error is not None
-
-    @pytest.mark.asyncio
-    async def test_all_turns_have_tool_calls(self, tmp_path):
-        """Every turn has a tool call — no-tool bucket is empty."""
-        _write_metrics_json(tmp_path, {"1": 2.0, "2": 4.0})
-        trace = _make_trace(tool_call_turn_ids={1, 2}, all_turn_ids={1, 2})
-        metric = ResponseSpeedNoToolCallsMetric()
-        ctx = make_metric_context(output_dir=tmp_path, conversation_trace=trace)
-
-        result = await metric.compute(ctx)
-
-        assert result.score == 0.0
-        assert result.error is not None
-        assert "No turns without tool calls" in result.error
-
-    @pytest.mark.asyncio
-    async def test_mixed_turns(self, tmp_path):
-        """Correctly includes only non-tool-call turn latencies."""
-        _write_metrics_json(tmp_path, {"1": 1.0, "2": 5.0, "3": 3.0, "4": 7.0})
-        # Turns 2 and 4 have tool calls; turns 1 and 3 do not
-        trace = _make_trace(tool_call_turn_ids={2, 4}, all_turn_ids={1, 2, 3, 4})
-        metric = ResponseSpeedNoToolCallsMetric()
-        ctx = make_metric_context(output_dir=tmp_path, conversation_trace=trace)
-
-        result = await metric.compute(ctx)
-
-        assert result.error is None
-        assert result.details["num_turns"] == 2
-        assert result.score == pytest.approx((1.0 + 3.0) / 2)
-        assert result.details["max_speed_seconds"] == pytest.approx(3.0)
-        assert result.details["per_turn_speeds"] == [1.0, 3.0]
-
-    @pytest.mark.asyncio
-    async def test_no_turns_with_tool_calls(self, tmp_path):
-        """Record with no tool-call turns — all latencies included."""
-        _write_metrics_json(tmp_path, {"1": 1.0, "2": 2.0, "3": 3.0})
-        trace = _make_trace(tool_call_turn_ids=set(), all_turn_ids={1, 2, 3})
-        metric = ResponseSpeedNoToolCallsMetric()
+        metric = ResponseSpeedMetric()
         ctx = make_metric_context(output_dir=tmp_path, conversation_trace=trace)
 
         result = await metric.compute(ctx)
 
         assert result.error is None
-        assert result.details["num_turns"] == 3
-        assert result.score == pytest.approx(2.0)
+        with_tc = result.details["with_tool_calls"]
+        assert with_tc is not None
+        assert with_tc["num_turns"] == 2  # only 5.0 and 3.0 pass the filter
 
     @pytest.mark.asyncio
     async def test_with_and_no_tool_split_is_exhaustive(self, tmp_path):
@@ -312,12 +214,13 @@ async def test_with_and_no_tool_split_is_exhaustive(self, tmp_path):
         per_turn = {"1": 1.0, "2": 5.0, "3": 3.0, "4": 7.0, "5": 2.0}
         _write_metrics_json(tmp_path, per_turn)
         trace = _make_trace(tool_call_turn_ids={2, 4}, all_turn_ids={1, 2, 3, 4, 5})
+        metric = ResponseSpeedMetric()
+        ctx = make_metric_context(output_dir=tmp_path, conversation_trace=trace)
 
-        ctx_with = make_metric_context(output_dir=tmp_path, conversation_trace=trace)
-        ctx_no = make_metric_context(output_dir=tmp_path, conversation_trace=trace)
-
-        result_with = await ResponseSpeedWithToolCallsMetric().compute(ctx_with)
-        result_no = await ResponseSpeedNoToolCallsMetric().compute(ctx_no)
+        result = await metric.compute(ctx)
 
-        combined = result_with.details["per_turn_speeds"] + result_no.details["per_turn_speeds"]
+        assert result.error is None
+        combined = (
+            result.details["with_tool_calls"]["per_turn_speeds"] + result.details["no_tool_calls"]["per_turn_speeds"]
+        )
         assert sorted(combined) == sorted(per_turn.values())

From 796c638edd96d8cd2486c32192f1e6d22cfe045d Mon Sep 17 00:00:00 2001
From: "fanny.riols" <fanny.riols@servicenow.com>
Date: Wed, 15 Apr 2026 09:47:26 -0400
Subject: [PATCH 8/9] Show response_speed with/no tool call breakdown in
 diagnostic table and metrics_summary

---
 apps/analysis.py          | 21 ++++++++++++++++++++-
 src/eva/metrics/runner.py | 26 ++++++++++++++++++++++++++
 2 files changed, 46 insertions(+), 1 deletion(-)

diff --git a/apps/analysis.py b/apps/analysis.py
index 40516a78..aba9c101 100644
--- a/apps/analysis.py
+++ b/apps/analysis.py
@@ -44,6 +44,9 @@ def _build_metric_group_map() -> dict[str, str]:
 
 
 _METRIC_GROUP: dict[str, str] = _build_metric_group_map()
+# Synthetic columns derived from response_speed details sub-fields
+_METRIC_GROUP["response_speed_with_tool_calls"] = "Diagnostic"
+_METRIC_GROUP["response_speed_no_tool_calls"] = "Diagnostic"
 
 # Ordered categories for display; anything not listed sorts to the end
 _CATEGORY_ORDER = ["Accuracy", "Experience", "Conversation Quality", "Diagnostic", "Validation"]
@@ -76,7 +79,7 @@ def _build_metric_group_map() -> dict[str, str]:
     "Other": "#AAAAAA",
 }
 
-_NON_NORMALIZED_METRICS = {"response_speed"}
+_NON_NORMALIZED_METRICS = {"response_speed", "response_speed_with_tool_calls", "response_speed_no_tool_calls"}
 
 # EVA composite scores to show in the bar chart
 _EVA_BAR_COMPOSITES = ["EVA-A_pass", "EVA-X_pass", "EVA-A_mean", "EVA-X_mean"]
@@ -545,6 +548,15 @@ def _collect_run_metrics(run_dir: Path) -> tuple[list[dict], list[str]]:
                     else metric_score.score
                 )
 
+                if metric_name == "response_speed" and metric_score.details:
+                    details = metric_score.details
+                    with_tc = details.get("with_tool_calls") or {}
+                    no_tc = details.get("no_tool_calls") or {}
+                    row["response_speed_with_tool_calls"] = with_tc.get("mean_speed_seconds")
+                    row["response_speed_no_tool_calls"] = no_tc.get("mean_speed_seconds")
+                    all_metric_names.add("response_speed_with_tool_calls")
+                    all_metric_names.add("response_speed_no_tool_calls")
+
             rows.append(row)
 
     return rows, sorted(all_metric_names)
@@ -970,6 +982,13 @@ def render_cross_run_comparison(run_dirs: list[Path]):
             for m, stats in per_metric.items():
                 if stats.get("mean") is not None:
                     summary[m] = stats["mean"]
+                # Expose response_speed sub-field means as synthetic columns
+                for sub_key in ("with_tool_calls", "no_tool_calls"):
+                    sub = stats.get(sub_key)
+                    if sub and sub.get("mean") is not None:
+                        col = f"{m}_{sub_key}"
+                        summary[col] = sub["mean"]
+                        all_metric_names.add(col)
             # Add EVA composite scores from overall_scores
             overall = metrics_summary.get("overall_scores", {})
             for composite in _EVA_BAR_COMPOSITES:
diff --git a/src/eva/metrics/runner.py b/src/eva/metrics/runner.py
index 2d808d7a..892fcbe7 100644
--- a/src/eva/metrics/runner.py
+++ b/src/eva/metrics/runner.py
@@ -620,6 +620,32 @@ def _build_per_metric_aggregates(
                         "count": count,
                     }
 
+        # Nest with/without tool call breakdowns inside the response_speed aggregate
+        if "response_speed" in metric_names and "response_speed" in metric_aggregates:
+            for sub_key in ("with_tool_calls", "no_tool_calls"):
+                sub_scores: list[float] = []
+                sub_missing = 0
+                for record_metrics in all_metrics.values():
+                    rs = record_metrics.metrics.get("response_speed")
+                    if rs is None or rs.error is not None:
+                        sub_missing += 1
+                        continue
+                    sub_details = (rs.details or {}).get(sub_key)
+                    if sub_details and sub_details.get("mean_speed_seconds") is not None:
+                        sub_scores.append(sub_details["mean_speed_seconds"])
+                    else:
+                        sub_missing += 1
+                if sub_scores or sub_missing > 0:
+                    metric_aggregates["response_speed"][sub_key] = {
+                        "mean": round(sum(sub_scores) / len(sub_scores), 4) if sub_scores else None,
+                        "min": round(min(sub_scores), 4) if sub_scores else None,
+                        "max": round(max(sub_scores), 4) if sub_scores else None,
+                        "count": len(sub_scores),
+                        "none_count": sub_missing,
+                        "missing_count": sub_missing,
+                        "total_records": total_records,
+                    }
+
         return metric_aggregates
 
     @staticmethod

From 4b60c309ab370e72751f2c71b2de4c8382a7d384 Mon Sep 17 00:00:00 2001
From: Fanny Riols <fanny.riols@servicenow.com>
Date: Thu, 16 Apr 2026 09:24:27 -0400
Subject: [PATCH 9/9] Skip None latencies in response_speed sanity filter

---
 src/eva/metrics/diagnostic/response_speed.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/eva/metrics/diagnostic/response_speed.py b/src/eva/metrics/diagnostic/response_speed.py
index 1b2d5bee..7dce04f6 100644
--- a/src/eva/metrics/diagnostic/response_speed.py
+++ b/src/eva/metrics/diagnostic/response_speed.py
@@ -106,7 +106,7 @@ async def compute(self, context: MetricContext) -> MetricScore:
             speeds = []
             per_turn_speeds = []
             for latency in all_latencies:
-                if 0 < latency < 1000:
+                if latency is not None and 0 < latency < 1000:
                     speeds.append(latency)
                     per_turn_speeds.append(round(latency, 3))
                 else: