diff --git a/apps/analysis.py b/apps/analysis.py
index 40516a78..aba9c101 100644
--- a/apps/analysis.py
+++ b/apps/analysis.py
@@ -44,6 +44,9 @@ def _build_metric_group_map() -> dict[str, str]:
 
 
 _METRIC_GROUP: dict[str, str] = _build_metric_group_map()
+# Synthetic columns derived from response_speed details sub-fields
+_METRIC_GROUP["response_speed_with_tool_calls"] = "Diagnostic"
+_METRIC_GROUP["response_speed_no_tool_calls"] = "Diagnostic"
 
 # Ordered categories for display; anything not listed sorts to the end
 _CATEGORY_ORDER = ["Accuracy", "Experience", "Conversation Quality", "Diagnostic", "Validation"]
@@ -76,7 +79,7 @@ def _build_metric_group_map() -> dict[str, str]:
     "Other": "#AAAAAA",
 }
 
-_NON_NORMALIZED_METRICS = {"response_speed"}
+_NON_NORMALIZED_METRICS = {"response_speed", "response_speed_with_tool_calls", "response_speed_no_tool_calls"}
 
 # EVA composite scores to show in the bar chart
 _EVA_BAR_COMPOSITES = ["EVA-A_pass", "EVA-X_pass", "EVA-A_mean", "EVA-X_mean"]
@@ -545,6 +548,15 @@ def _collect_run_metrics(run_dir: Path) -> tuple[list[dict], list[str]]:
                 else metric_score.score
             )
 
+            if metric_name == "response_speed" and metric_score.details:
+                details = metric_score.details
+                with_tc = details.get("with_tool_calls") or {}
+                no_tc = details.get("no_tool_calls") or {}
+                row["response_speed_with_tool_calls"] = with_tc.get("mean_speed_seconds")
+                row["response_speed_no_tool_calls"] = no_tc.get("mean_speed_seconds")
+                all_metric_names.add("response_speed_with_tool_calls")
+                all_metric_names.add("response_speed_no_tool_calls")
+
         rows.append(row)
 
     return rows, sorted(all_metric_names)
@@ -970,6 +982,13 @@ def render_cross_run_comparison(run_dirs: list[Path]):
         for m, stats in per_metric.items():
             if stats.get("mean") is not None:
                 summary[m] = stats["mean"]
+            # Expose response_speed sub-field means as synthetic columns
+            for sub_key in ("with_tool_calls", "no_tool_calls"):
+                sub = stats.get(sub_key)
+                if sub and sub.get("mean") is not None:
+                    col = f"{m}_{sub_key}"
+                    summary[col] = sub["mean"]
+                    all_metric_names.add(col)
         # Add EVA composite scores from overall_scores
         overall = metrics_summary.get("overall_scores", {})
         for composite in _EVA_BAR_COMPOSITES:
diff --git a/pyproject.toml b/pyproject.toml
index 561cba2d..47827e98 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -101,7 +101,7 @@ ignore = ["D203", "D206", "D213", "D400", "D401", "D413", "D415", "E1", "E501"]
 simulation_version = "0.1.0"
 # Bump when metrics pipeline changes (metrics code, judge prompts, pricing,
 # postprocessor). Old metric results become stale — cheap to recompute.
-metrics_version = "0.1.1"
+metrics_version = "0.1.2"
 
 [tool.mypy]
 python_version = "3.11"
diff --git a/src/eva/__init__.py b/src/eva/__init__.py
index 6796f4aa..03f1f13b 100644
--- a/src/eva/__init__.py
+++ b/src/eva/__init__.py
@@ -11,4 +11,4 @@
 
 # Bump metrics_version when changes affect metric computation (metrics code,
 # judge prompts, pricing tables, postprocessor).
-metrics_version = "0.1.1"
+metrics_version = "0.1.2"
diff --git a/src/eva/metrics/diagnostic/response_speed.py b/src/eva/metrics/diagnostic/response_speed.py
index 0dd4fb53..7dce04f6 100644
--- a/src/eva/metrics/diagnostic/response_speed.py
+++ b/src/eva/metrics/diagnostic/response_speed.py
@@ -4,54 +4,114 @@
 final evaluation scores.
 """
 
+import json
+from pathlib import Path
+
 from eva.metrics.base import CodeMetric, MetricContext
 from eva.metrics.registry import register_metric
 from eva.models.results import MetricScore
 
+
+def _load_per_turn_latency(context: MetricContext) -> dict[str, float]:
+    """Load turn_taking per_turn_latency from the record's metrics.json.
+
+    Returns an empty dict if the data is unavailable.
+    """
+    if not context.output_dir:
+        return {}
+
+    metrics_path = Path(context.output_dir) / "metrics.json"
+    if not metrics_path.exists():
+        return {}
+
+    with open(metrics_path) as f:
+        data = json.load(f)
+
+    return data.get("metrics", {}).get("turn_taking", {}).get("details", {}).get("per_turn_latency", {})
+
+
+def _split_by_tool_calls(
+    per_turn_latency: dict[str, float],
+    context: MetricContext,
+) -> tuple[list[float], list[float]]:
+    """Partition per_turn_latency values into (with_tool_calls, no_tool_calls).
+
+    Checks conversation_trace to determine which turn_ids had at least one tool call.
+    """
+    tool_call_turn_ids = {
+        entry["turn_id"] for entry in (context.conversation_trace or []) if entry.get("type") == "tool_call"
+    }
+
+    with_tool: list[float] = []
+    no_tool: list[float] = []
+    for turn_id_str, latency in per_turn_latency.items():
+        if int(turn_id_str) in tool_call_turn_ids:
+            with_tool.append(latency)
+        else:
+            no_tool.append(latency)
+
+    return with_tool, no_tool
+
+
+def _compute_speed_stats(latencies: list[float]) -> dict | None:
+    """Compute summary stats for a list of latencies, applying the sanity filter.
+
+    Returns None if no valid values remain after filtering.
+    """
+    valid = [v for v in latencies if 0 < v < 1000]
+    if not valid:
+        return None
+    return {
+        "mean_speed_seconds": round(sum(valid) / len(valid), 3),
+        "max_speed_seconds": round(max(valid), 3),
+        "num_turns": len(valid),
+        "per_turn_speeds": [round(v, 3) for v in valid],
+    }
+
+
 @register_metric
 class ResponseSpeedMetric(CodeMetric):
     """Response speed metric.
 
     Measures the elapsed time between the end of the user's utterance
-    and the beginning of the assistant's response.
+    and the beginning of the assistant's response, using per_turn_latency
+    from the turn_taking metric.
 
     Reports raw latency values in seconds — no normalization applied.
 
+    Details include a breakdown by turns with and without tool calls.
+
     This is a diagnostic metric used for diagnosing model performance issues.
     It is not directly used in final evaluation scores.
     """
 
     name = "response_speed"
-    description = "Debug metric: latency between user utterance end and assistant response start"
     category = "diagnostic"
+    description = "Diagnostic metric: latency between user utterance end and assistant response start"
    exclude_from_pass_at_k = True
 
     async def compute(self, context: MetricContext) -> MetricScore:
-        """Compute response speed from Pipecat's UserBotLatencyObserver measurements."""
         try:
-            # Check if we have response speed latencies from UserBotLatencyObserver
-            if not context.response_speed_latencies:
+            per_turn_latency = _load_per_turn_latency(context)
+
+            if not per_turn_latency:
                 return MetricScore(
                     name=self.name,
                     score=0.0,
                     normalized_score=None,
-                    error="No response latencies available (UserBotLatencyObserver data missing)",
+                    error="No response latencies available (turn_taking per_turn_latency data missing)",
                 )
 
-            # Use latencies measured by Pipecat's UserBotLatencyObserver
-            # These measure the time from user stopped speaking to assistant started speaking
+            all_latencies = list(per_turn_latency.values())
             speeds = []
             per_turn_speeds = []
-
-            for response_speed in context.response_speed_latencies:
-                # Filter out invalid values (negative or extremely large)
-                if 0 < response_speed < 1000:  # Sanity check: under 1000 seconds
-                    speeds.append(response_speed)
-                    per_turn_speeds.append(round(response_speed, 3))
+            for latency in all_latencies:
+                if latency is not None and 0 < latency < 1000:
+                    speeds.append(latency)
+                    per_turn_speeds.append(round(latency, 3))
                 else:
                     self.logger.warning(
-                        f"[{context.record_id}] Unusual response speed detected and dropped: {response_speed} seconds"
+                        f"[{context.record_id}] Unusual response speed detected and dropped: {latency} seconds"
                     )
 
             if not speeds:
@@ -63,17 +123,20 @@ async def compute(self, context: MetricContext) -> MetricScore:
                 )
 
             mean_speed = sum(speeds) / len(speeds)
-            max_speed = max(speeds)
+
+            with_tool, no_tool = _split_by_tool_calls(per_turn_latency, context)
 
             return MetricScore(
                 name=self.name,
-                score=round(mean_speed, 3),  # Mean response speed in seconds
-                normalized_score=None,  # Raw latency in seconds; not normalizable to [0,1]
+                score=round(mean_speed, 3),
+                normalized_score=None,
                 details={
                     "mean_speed_seconds": round(mean_speed, 3),
-                    "max_speed_seconds": round(max_speed, 3),
+                    "max_speed_seconds": round(max(speeds), 3),
                     "num_turns": len(speeds),
                     "per_turn_speeds": per_turn_speeds,
+                    "with_tool_calls": _compute_speed_stats(with_tool),
+                    "no_tool_calls": _compute_speed_stats(no_tool),
                 },
             )
diff --git a/src/eva/metrics/processor.py b/src/eva/metrics/processor.py
index 660e7ce9..94aa6e44 100644
--- a/src/eva/metrics/processor.py
+++ b/src/eva/metrics/processor.py
@@ -24,6 +24,19 @@
 
 logger = get_logger(__name__)
 
+
+def _resolve_path(stored: str | None, fallback: Path) -> str | Path:
+    """Return *stored* if it exists on disk, otherwise *fallback*.
+
+    Allows metrics to re-run correctly when a run directory has been moved:
+    the stored path reflects the original location, but the file is now at
+    *fallback* (i.e. output_dir / filename).
+    """
+    if stored and Path(stored).exists():
+        return stored
+    return fallback
+
+
 # Elevenlabs audio user field → _ProcessorContext attribute name
 AUDIO_ATTR = {
     "pipecat_agent": "audio_timestamps_assistant_turns",
@@ -824,8 +837,10 @@ def _build_history(
         Each entry: {timestamp_ms, source, event_type, data}.
         """
         history = self._load_audit_log_transcript(output_dir)
-        history.extend(self._load_pipecat_logs(result.pipecat_logs_path))
-        history.extend(self._load_elevenlabs_logs(result.elevenlabs_logs_path))
+        pipecat_path = _resolve_path(result.pipecat_logs_path, output_dir / "pipecat_logs.jsonl")
+        history.extend(self._load_pipecat_logs(pipecat_path))
+        elevenlabs_path = _resolve_path(result.elevenlabs_logs_path, output_dir / "elevenlabs_events.jsonl")
+        history.extend(self._load_elevenlabs_logs(elevenlabs_path))
 
         history.sort(key=lambda e: e["timestamp_ms"])
         context.history = history
diff --git a/src/eva/metrics/runner.py b/src/eva/metrics/runner.py
index 2d808d7a..892fcbe7 100644
--- a/src/eva/metrics/runner.py
+++ b/src/eva/metrics/runner.py
@@ -620,6 +620,32 @@ def _build_per_metric_aggregates(
                 "count": count,
             }
 
+        # Nest with/without tool call breakdowns inside the response_speed aggregate
+        if "response_speed" in metric_names and "response_speed" in metric_aggregates:
+            for sub_key in ("with_tool_calls", "no_tool_calls"):
+                sub_scores: list[float] = []
+                sub_missing = 0
+                for record_metrics in all_metrics.values():
+                    rs = record_metrics.metrics.get("response_speed")
+                    if rs is None or rs.error is not None:
+                        sub_missing += 1
+                        continue
+                    sub_details = (rs.details or {}).get(sub_key)
+                    if sub_details and sub_details.get("mean_speed_seconds") is not None:
+                        sub_scores.append(sub_details["mean_speed_seconds"])
+                    else:
+                        sub_missing += 1
+                if sub_scores or sub_missing > 0:
+                    metric_aggregates["response_speed"][sub_key] = {
+                        "mean": round(sum(sub_scores) / len(sub_scores), 4) if sub_scores else None,
+                        "min": round(min(sub_scores), 4) if sub_scores else None,
+                        "max": round(max(sub_scores), 4) if sub_scores else None,
+                        "count": len(sub_scores),
+                        "none_count": sub_missing,
+                        "missing_count": sub_missing,
+                        "total_records": total_records,
+                    }
+
         return metric_aggregates
 
     @staticmethod
diff --git a/src/eva/models/config.py b/src/eva/models/config.py
index e08783bd..f3885c54 100644
--- a/src/eva/models/config.py
+++ b/src/eva/models/config.py
@@ -648,10 +648,17 @@ def apply_env_overrides(self, live: "RunConfig") -> None:
             if not has_redacted:
                 continue
             if name not in live_by_name:
-                raise ValueError(
-                    f"Cannot restore secrets: deployment {name!r} not found in "
-                    f"current EVA_MODEL_LIST (available: {list(live_by_name)})"
+                active_llm = getattr(self.model, "llm", None)
+                if name == active_llm:
+                    raise ValueError(
+                        f"Cannot restore secrets: deployment {name!r} not found in "
+                        f"current EVA_MODEL_LIST (available: {list(live_by_name)})"
+                    )
+                logger.warning(
+                    f"Deployment {name!r} has redacted secrets but is not in the current "
+                    f"EVA_MODEL_LIST — skipping (not used in this run)."
                 )
+                continue
             live_params = live_by_name[name].get("litellm_params", {})
             for key, value in saved_params.items():
                 if value == "***" and key in live_params:
diff --git a/tests/unit/metrics/test_response_speed.py b/tests/unit/metrics/test_response_speed.py
index 8cb3ecfc..b9369a6e 100644
--- a/tests/unit/metrics/test_response_speed.py
+++ b/tests/unit/metrics/test_response_speed.py
@@ -1,18 +1,54 @@
 """Tests for the ResponseSpeedMetric."""
 
+import json
+
 import pytest
 
 from eva.metrics.diagnostic.response_speed import ResponseSpeedMetric
 
 from .conftest import make_metric_context
 
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _write_metrics_json(tmp_path, per_turn_latency: dict) -> None:
+    """Write a minimal metrics.json with turn_taking per_turn_latency data."""
+    data = {
+        "metrics": {
+            "turn_taking": {
+                "details": {
+                    "per_turn_latency": per_turn_latency,
+                }
+            }
+        }
+    }
+    (tmp_path / "metrics.json").write_text(json.dumps(data))
+
+
+def _make_trace(tool_call_turn_ids: set[int], all_turn_ids: set[int]) -> list[dict]:
+    """Build a minimal conversation_trace with the given turn structure."""
+    trace = []
+    for tid in sorted(all_turn_ids):
+        trace.append({"turn_id": tid, "type": "transcribed", "content": "user utterance"})
+        if tid in tool_call_turn_ids:
+            trace.append({"turn_id": tid, "type": "tool_call", "tool_name": "some_tool"})
+            trace.append({"turn_id": tid, "type": "tool_response", "tool_name": "some_tool"})
+    return trace
+
+
+# ---------------------------------------------------------------------------
+# ResponseSpeedMetric
+# ---------------------------------------------------------------------------
+
 
 class TestResponseSpeedMetric:
     @pytest.mark.asyncio
-    async def test_no_latencies_none(self):
-        """None latencies returns error."""
+    async def test_no_output_dir(self):
+        """Missing output_dir returns error — no per_turn_latency data."""
         metric = ResponseSpeedMetric()
-        ctx = make_metric_context(response_speed_latencies=None)
+        ctx = make_metric_context()
 
         result = await metric.compute(ctx)
 
@@ -20,13 +56,25 @@ async def test_no_latencies_none(self):
         assert result.score == 0.0
         assert result.normalized_score is None
         assert result.error is not None
-        assert "No response latencies" in result.error
+        assert "turn_taking" in result.error
+
+    @pytest.mark.asyncio
+    async def test_missing_metrics_json(self, tmp_path):
+        """output_dir exists but has no metrics.json — returns error."""
+        metric = ResponseSpeedMetric()
+        ctx = make_metric_context(output_dir=tmp_path)
+
+        result = await metric.compute(ctx)
+
+        assert result.score == 0.0
+        assert result.error is not None
 
     @pytest.mark.asyncio
-    async def test_no_latencies_empty(self):
-        """Empty list returns error."""
+    async def test_missing_turn_taking_data(self, tmp_path):
+        """metrics.json exists but has no turn_taking entry — returns error."""
+        (tmp_path / "metrics.json").write_text(json.dumps({"metrics": {}}))
         metric = ResponseSpeedMetric()
-        ctx = make_metric_context(response_speed_latencies=[])
+        ctx = make_metric_context(output_dir=tmp_path)
 
         result = await metric.compute(ctx)
 
@@ -34,10 +82,11 @@
         assert result.error is not None
 
     @pytest.mark.asyncio
-    async def test_valid_latencies(self):
-        """Valid latencies produce correct mean, max, and per-turn details."""
+    async def test_valid_latencies(self, tmp_path):
+        """Valid per_turn_latency produces correct mean, max, and per-turn details."""
+        _write_metrics_json(tmp_path, {"1": 1.0, "2": 2.0, "3": 3.0})
         metric = ResponseSpeedMetric()
-        ctx = make_metric_context(response_speed_latencies=[1.0, 2.0, 3.0])
+        ctx = make_metric_context(output_dir=tmp_path)
 
         result = await metric.compute(ctx)
 
@@ -47,29 +96,28 @@
         assert result.details["mean_speed_seconds"] == pytest.approx(2.0)
         assert result.details["max_speed_seconds"] == pytest.approx(3.0)
         assert result.details["num_turns"] == 3
-        assert result.details["per_turn_speeds"] == [1.0, 2.0, 3.0]
 
     @pytest.mark.asyncio
-    async def test_filters_invalid_values(self):
+    async def test_filters_invalid_values(self, tmp_path):
         """Negative and >1000s values are filtered out."""
+        _write_metrics_json(tmp_path, {"1": -1.0, "2": 0.5, "3": 1500.0, "4": 2.5, "5": 0.0})
         metric = ResponseSpeedMetric()
-        ctx = make_metric_context(response_speed_latencies=[-1.0, 0.5, 1500.0, 2.5, 0.0])
+        ctx = make_metric_context(output_dir=tmp_path)
 
         result = await metric.compute(ctx)
 
         # Only 0.5 and 2.5 are valid (0 < x < 1000); 0.0 is excluded (not > 0)
         assert result.error is None
         assert result.details["num_turns"] == 2
-        expected_mean = (0.5 + 2.5) / 2
-        assert result.score == pytest.approx(expected_mean)
+        assert result.score == pytest.approx((0.5 + 2.5) / 2)
         assert result.details["max_speed_seconds"] == pytest.approx(2.5)
-        assert result.details["per_turn_speeds"] == [0.5, 2.5]
 
     @pytest.mark.asyncio
-    async def test_all_latencies_filtered_out(self):
+    async def test_all_latencies_filtered_out(self, tmp_path):
         """When all values are invalid, returns error."""
+        _write_metrics_json(tmp_path, {"1": -5.0, "2": 0.0, "3": 2000.0})
         metric = ResponseSpeedMetric()
-        ctx = make_metric_context(response_speed_latencies=[-5.0, 0.0, 2000.0])
+        ctx = make_metric_context(output_dir=tmp_path)
 
         result = await metric.compute(ctx)
 
@@ -79,10 +127,11 @@
         assert "No valid response speeds" in result.error
 
     @pytest.mark.asyncio
-    async def test_single_latency_value(self):
+    async def test_single_latency_value(self, tmp_path):
         """Single valid latency works correctly."""
+        _write_metrics_json(tmp_path, {"1": 0.75})
         metric = ResponseSpeedMetric()
-        ctx = make_metric_context(response_speed_latencies=[0.75])
+        ctx = make_metric_context(output_dir=tmp_path)
 
         result = await metric.compute(ctx)
 
@@ -91,3 +140,87 @@
         assert result.details["max_speed_seconds"] == pytest.approx(0.75)
         assert result.details["num_turns"] == 1
         assert result.details["per_turn_speeds"] == [0.75]
+
+    @pytest.mark.asyncio
+    async def test_no_tool_call_breakdown_without_trace(self, tmp_path):
+        """with_tool_calls is None and no_tool_calls covers all turns when trace is absent."""
+        _write_metrics_json(tmp_path, {"1": 1.0, "2": 2.0})
+        metric = ResponseSpeedMetric()
+        ctx = make_metric_context(output_dir=tmp_path)
+
+        result = await metric.compute(ctx)
+
+        assert result.error is None
+        # No trace → no tool call turn ids → all turns go into no_tool bucket
+        assert result.details["with_tool_calls"] is None
+        assert result.details["no_tool_calls"] is not None
+        assert result.details["no_tool_calls"]["num_turns"] == 2
+
+    @pytest.mark.asyncio
+    async def test_tool_call_breakdown_mixed_turns(self, tmp_path):
+        """with_tool_calls and no_tool_calls sub-fields reflect the correct split."""
+        _write_metrics_json(tmp_path, {"1": 1.0, "2": 5.0, "3": 3.0, "4": 7.0})
+        trace = _make_trace(tool_call_turn_ids={2, 4}, all_turn_ids={1, 2, 3, 4})
+        metric = ResponseSpeedMetric()
+        ctx = make_metric_context(output_dir=tmp_path, conversation_trace=trace)
+
+        result = await metric.compute(ctx)
+
+        assert result.error is None
+        with_tc = result.details["with_tool_calls"]
+        no_tc = result.details["no_tool_calls"]
+        assert with_tc is not None
+        assert no_tc is not None
+        assert with_tc["num_turns"] == 2
+        assert with_tc["mean_speed_seconds"] == pytest.approx((5.0 + 7.0) / 2)
+        assert with_tc["max_speed_seconds"] == pytest.approx(7.0)
+        assert no_tc["num_turns"] == 2
+        assert no_tc["mean_speed_seconds"] == pytest.approx((1.0 + 3.0) / 2)
+        assert no_tc["max_speed_seconds"] == pytest.approx(3.0)
+
+    @pytest.mark.asyncio
+    async def test_tool_call_breakdown_all_tool_turns(self, tmp_path):
+        """no_tool_calls is None when every turn has a tool call."""
+        _write_metrics_json(tmp_path, {"1": 2.0, "2": 4.0})
+        trace = _make_trace(tool_call_turn_ids={1, 2}, all_turn_ids={1, 2})
+        metric = ResponseSpeedMetric()
+        ctx = make_metric_context(output_dir=tmp_path, conversation_trace=trace)
+
+        result = await metric.compute(ctx)
+
+        assert result.error is None
+        assert result.details["with_tool_calls"] is not None
+        assert result.details["with_tool_calls"]["num_turns"] == 2
+        assert result.details["no_tool_calls"] is None
+
+    @pytest.mark.asyncio
+    async def test_tool_call_breakdown_filters_invalid_latencies(self, tmp_path):
+        """Sanity filter (0 < x < 1000) applies within the breakdown sub-fields."""
+        _write_metrics_json(tmp_path, {"1": -1.0, "2": 5.0, "3": 2000.0, "4": 3.0})
+        trace = _make_trace(tool_call_turn_ids={1, 2, 3, 4}, all_turn_ids={1, 2, 3, 4})
+        metric = ResponseSpeedMetric()
+        ctx = make_metric_context(output_dir=tmp_path, conversation_trace=trace)
+
+        result = await metric.compute(ctx)
+
+        assert result.error is None
+        with_tc = result.details["with_tool_calls"]
+        assert with_tc is not None
+        assert with_tc["num_turns"] == 2  # only 5.0 and 3.0 pass the filter
+
+    @pytest.mark.asyncio
+    async def test_with_and_no_tool_split_is_exhaustive(self, tmp_path):
+        """with_tool + no_tool latencies together cover all per_turn_latency values."""
+        per_turn = {"1": 1.0, "2": 5.0, "3": 3.0, "4": 7.0, "5": 2.0}
+        _write_metrics_json(tmp_path, per_turn)
+        trace = _make_trace(tool_call_turn_ids={2, 4}, all_turn_ids={1, 2, 3, 4, 5})
+        metric = ResponseSpeedMetric()
+        ctx = make_metric_context(output_dir=tmp_path, conversation_trace=trace)
+
+        result = await metric.compute(ctx)
+
+        assert result.error is None
+        combined = (
+            result.details["with_tool_calls"]["per_turn_speeds"] + result.details["no_tool_calls"]["per_turn_speeds"]
+        )
+        assert sorted(combined) == sorted(per_turn.values())