ServiceNow · fanny-riols · Apr 14, 2026 · Apr 14, 2026 · Apr 14, 2026 · Apr 14, 2026
diff --git a/apps/analysis.py b/apps/analysis.py
@@ -44,6 +44,9 @@ def _build_metric_group_map() -> dict[str, str]:
 
 
 _METRIC_GROUP: dict[str, str] = _build_metric_group_map()
+# Synthetic columns derived from response_speed details sub-fields
+_METRIC_GROUP["response_speed_with_tool_calls"] = "Diagnostic"
+_METRIC_GROUP["response_speed_no_tool_calls"] = "Diagnostic"
 
 # Ordered categories for display; anything not listed sorts to the end
 _CATEGORY_ORDER = ["Accuracy", "Experience", "Conversation Quality", "Diagnostic", "Validation"]
@@ -76,7 +79,7 @@ def _build_metric_group_map() -> dict[str, str]:
     "Other": "#AAAAAA",
 }
 
-_NON_NORMALIZED_METRICS = {"response_speed"}
+_NON_NORMALIZED_METRICS = {"response_speed", "response_speed_with_tool_calls", "response_speed_no_tool_calls"}
 
 # EVA composite scores to show in the bar chart
 _EVA_BAR_COMPOSITES = ["EVA-A_pass", "EVA-X_pass", "EVA-A_mean", "EVA-X_mean"]
@@ -545,6 +548,15 @@ def _collect_run_metrics(run_dir: Path) -> tuple[list[dict], list[str]]:
                     else metric_score.score
                 )
 
+                if metric_name == "response_speed" and metric_score.details:
+                    details = metric_score.details
+                    with_tc = details.get("with_tool_calls") or {}
+                    no_tc = details.get("no_tool_calls") or {}
+                    row["response_speed_with_tool_calls"] = with_tc.get("mean_speed_seconds")
+                    row["response_speed_no_tool_calls"] = no_tc.get("mean_speed_seconds")
+                    all_metric_names.add("response_speed_with_tool_calls")
+                    all_metric_names.add("response_speed_no_tool_calls")
+
             rows.append(row)
 
     return rows, sorted(all_metric_names)
@@ -970,6 +982,13 @@ def render_cross_run_comparison(run_dirs: list[Path]):
             for m, stats in per_metric.items():
                 if stats.get("mean") is not None:
                     summary[m] = stats["mean"]
+                # Expose response_speed sub-field means as synthetic columns
+                for sub_key in ("with_tool_calls", "no_tool_calls"):
+                    sub = stats.get(sub_key)
+                    if sub and sub.get("mean") is not None:
+                        col = f"{m}_{sub_key}"
+                        summary[col] = sub["mean"]
+                        all_metric_names.add(col)
             # Add EVA composite scores from overall_scores
             overall = metrics_summary.get("overall_scores", {})
             for composite in _EVA_BAR_COMPOSITES:

diff --git a/pyproject.toml b/pyproject.toml
@@ -101,7 +101,7 @@ ignore = ["D203", "D206", "D213", "D400", "D401", "D413", "D415", "E1", "E501"]
 simulation_version = "0.1.0"
 # Bump when metrics pipeline changes (metrics code, judge prompts, pricing,
 # postprocessor). Old metric results become stale — cheap to recompute.
-metrics_version = "0.1.1"
+metrics_version = "0.1.2"
 
 [tool.mypy]
 python_version = "3.11"

diff --git a/src/eva/__init__.py b/src/eva/__init__.py
@@ -11,4 +11,4 @@
 
 # Bump metrics_version when changes affect metric computation (metrics code,
 # judge prompts, pricing tables, postprocessor).
-metrics_version = "0.1.1"
+metrics_version = "0.1.2"
diff --git a/src/eva/metrics/diagnostic/response_speed.py b/src/eva/metrics/diagnostic/response_speed.py
@@ -4,54 +4,114 @@
 final evaluation scores.
 """
 
+import json
+from pathlib import Path
+
 from eva.metrics.base import CodeMetric, MetricContext
 from eva.metrics.registry import register_metric
 from eva.models.results import MetricScore
 
 
+def _load_per_turn_latency(context: MetricContext) -> dict[str, float]:
+    """Load turn_taking's ``per_turn_latency`` from the record's metrics.json.
+
+    Reads ``<context.output_dir>/metrics.json`` and walks the nested keys
+    ``metrics -> turn_taking -> details -> per_turn_latency``.
+
+    Args:
+        context: Metric context; only ``context.output_dir`` is read here.
+
+    Returns:
+        The ``per_turn_latency`` mapping, or an empty dict when ``output_dir``
+        is unset, the file does not exist, or any level of the key path is
+        absent, ``null`` or not a JSON object.
+
+    Raises:
+        OSError: If the file exists but cannot be read.
+        ValueError: If the file is not valid JSON (``json.JSONDecodeError``).
+    """
+    if not context.output_dir:
+        return {}
+
+    metrics_path = Path(context.output_dir) / "metrics.json"
+    if not metrics_path.exists():
+        return {}
+
+    # Explicit encoding so the read does not depend on the platform default.
+    with open(metrics_path, encoding="utf-8") as f:
+        node = json.load(f)
+
+    # A chained ``.get(key, {})`` only falls back when the key is absent; a key
+    # that is present with a null value yields None and the next ``.get`` would
+    # raise AttributeError. Validate every level instead.
+    for key in ("metrics", "turn_taking", "details", "per_turn_latency"):
+        if not isinstance(node, dict):
+            return {}
+        node = node.get(key)
+
+    return node if isinstance(node, dict) else {}
+
+
+def _split_by_tool_calls(
+    per_turn_latency: dict[str, float],
+    context: MetricContext,
+) -> tuple[list[float], list[float]]:
+    """Partition per_turn_latency values into (with_tool_calls, no_tool_calls).
+
+    A turn counts as "with tool calls" when ``context.conversation_trace``
+    holds at least one entry whose ``type`` is ``"tool_call"`` and whose
+    ``turn_id`` equals the latency key, compared both as an integer and as
+    the raw key.
+
+    Args:
+        per_turn_latency: Mapping of turn id key to latency value.
+        context: Metric context; only ``conversation_trace`` is read, and a
+            missing/``None`` trace is treated as empty.
+
+    Returns:
+        ``(with_tool, no_tool)`` latency lists, each in the iteration order
+        of ``per_turn_latency``.
+    """
+    tool_call_turn_ids = {
+        entry["turn_id"]
+        for entry in (context.conversation_trace or [])
+        # A tool_call entry without a turn_id cannot be attributed to a turn;
+        # skip it rather than raising KeyError for the whole record.
+        if entry.get("type") == "tool_call" and "turn_id" in entry
+    }
+
+    with_tool: list[float] = []
+    no_tool: list[float] = []
+    for turn_id_str, latency in per_turn_latency.items():
+        try:
+            has_tool_call = int(turn_id_str) in tool_call_turn_ids
+        except (TypeError, ValueError):
+            # Key is not integer-like: it can only match by its raw form below.
+            has_tool_call = False
+        # Also accept an exact match on the raw key, so a trace that stores
+        # turn ids as strings is not silently classified as "no tool calls".
+        if has_tool_call or turn_id_str in tool_call_turn_ids:
+            with_tool.append(latency)
+        else:
+            no_tool.append(latency)
+
+    return with_tool, no_tool
+
+
+def _compute_speed_stats(latencies: list[float]) -> dict | None:
+    """Summarise the latencies that pass the sanity filter.
+
+    Only values strictly greater than 0 and strictly less than 1000 are kept;
+    everything else is discarded before any statistic is computed.
+
+    Args:
+        latencies: Raw latency values.
+
+    Returns:
+        ``None`` when nothing survives the filter, otherwise a dict with
+        ``mean_speed_seconds``, ``max_speed_seconds``, ``num_turns`` and
+        ``per_turn_speeds`` (numeric values rounded to 3 decimals).
+    """
+    # The chained comparison is deliberate: it is False for NaN, so NaN is
+    # dropped along with non-positive and oversized values.
+    kept = list(filter(lambda value: 0 < value < 1000, latencies))
+    count = len(kept)
+    if count == 0:
+        return None
+
+    mean_value = sum(kept) / count
+    peak_value = max(kept)
+    rounded_each = [round(value, 3) for value in kept]
+    return {
+        "mean_speed_seconds": round(mean_value, 3),
+        "max_speed_seconds": round(peak_value, 3),
+        "num_turns": count,
+        "per_turn_speeds": rounded_each,
+    }
+
+
 @register_metric
 class ResponseSpeedMetric(CodeMetric):
     """Response speed metric.
 
     Measures the elapsed time between the end of the user's utterance
-    and the beginning of the assistant's response.
+    and the beginning of the assistant's response, using per_turn_latency
+    from the turn_taking metric.
 
     Reports raw latency values in seconds — no normalization applied.
 
+    Details include a breakdown by turns with and without tool calls.
+
     This is a diagnostic metric used for diagnosing model performance issues.
     It is not directly used in final evaluation scores.
     """
 
     name = "response_speed"
-    description = "Debug metric: latency between user utterance end and assistant response start"
     category = "diagnostic"
+    description = "Diagnostic metric: latency between user utterance end and assistant response start"
     exclude_from_pass_at_k = True
 
     async def compute(self, context: MetricContext) -> MetricScore:
-        """Compute response speed from Pipecat's UserBotLatencyObserver measurements."""
         try:
-            # Check if we have response speed latencies from UserBotLatencyObserver
-            if not context.response_speed_latencies:
+            per_turn_latency = _load_per_turn_latency(context)
+
+            if not per_turn_latency:
                 return MetricScore(
                     name=self.name,
                     score=0.0,
                     normalized_score=None,
-                    error="No response latencies available (UserBotLatencyObserver data missing)",
+                    error="No response latencies available (turn_taking per_turn_latency data missing)",
                 )
 
-            # Use latencies measured by Pipecat's UserBotLatencyObserver
-            # These measure the time from user stopped speaking to assistant started speaking
+            all_latencies = list(per_turn_latency.values())
             speeds = []
             per_turn_speeds = []
-
-            for response_speed in context.response_speed_latencies:
-                # Filter out invalid values (negative or extremely large)
-                if 0 < response_speed < 1000:  # Sanity check: under 1000 seconds
-                    speeds.append(response_speed)
-                    per_turn_speeds.append(round(response_speed, 3))
+            for latency in all_latencies:
+                if 0 < latency < 1000:
+                    speeds.append(latency)
+                    per_turn_speeds.append(round(latency, 3))
                 else:
                     self.logger.warning(
-                        f"[{context.record_id}] Unusual response speed detected and dropped: {response_speed} seconds"
+                        f"[{context.record_id}] Unusual response speed detected and dropped: {latency} seconds"
                     )
 
             if not speeds:
@@ -63,17 +123,20 @@ async def compute(self, context: MetricContext) -> MetricScore:
                 )
 
             mean_speed = sum(speeds) / len(speeds)
-            max_speed = max(speeds)
+
+            with_tool, no_tool = _split_by_tool_calls(per_turn_latency, context)
 
             return MetricScore(
                 name=self.name,
-                score=round(mean_speed, 3),  # Mean response speed in seconds
-                normalized_score=None,  # Raw latency in seconds; not normalizable to [0,1]
+                score=round(mean_speed, 3),
+                normalized_score=None,
                 details={
                     "mean_speed_seconds": round(mean_speed, 3),
-                    "max_speed_seconds": round(max_speed, 3),
+                    "max_speed_seconds": round(max(speeds), 3),
                     "num_turns": len(speeds),
                     "per_turn_speeds": per_turn_speeds,
+                    "with_tool_calls": _compute_speed_stats(with_tool),
+                    "no_tool_calls": _compute_speed_stats(no_tool),
                 },
             )
 

diff --git a/src/eva/metrics/processor.py b/src/eva/metrics/processor.py
@@ -24,6 +24,19 @@
 
 logger = get_logger(__name__)
 
+
+def _resolve_path(stored: str | None, fallback: Path) -> str | Path:
+    """Pick the recorded path while it still exists on disk, else *fallback*.
+
+    A run directory may have been moved since its paths were recorded; in
+    that case *stored* still names the original location while the file now
+    sits at *fallback* (i.e. output_dir / filename), so metrics can re-run.
+
+    Args:
+        stored: Path recorded at run time; may be ``None`` or empty.
+        fallback: Path to use when *stored* is unset or no longer exists.
+
+    Returns:
+        *stored* unchanged when it is non-empty and exists, otherwise
+        *fallback*.
+    """
+    if not stored:
+        return fallback
+    return stored if Path(stored).exists() else fallback
+
+
 # Elevenlabs audio user field → _ProcessorContext attribute name
 AUDIO_ATTR = {
     "pipecat_agent": "audio_timestamps_assistant_turns",
@@ -824,8 +837,10 @@ def _build_history(
         Each entry: {timestamp_ms, source, event_type, data}.
         """
         history = self._load_audit_log_transcript(output_dir)
-        history.extend(self._load_pipecat_logs(result.pipecat_logs_path))
-        history.extend(self._load_elevenlabs_logs(result.elevenlabs_logs_path))
+        pipecat_path = _resolve_path(result.pipecat_logs_path, output_dir / "pipecat_logs.jsonl")
+        history.extend(self._load_pipecat_logs(pipecat_path))
+        elevenlabs_path = _resolve_path(result.elevenlabs_logs_path, output_dir / "elevenlabs_events.jsonl")
+        history.extend(self._load_elevenlabs_logs(elevenlabs_path))
 
         history.sort(key=lambda e: e["timestamp_ms"])
         context.history = history

diff --git a/src/eva/metrics/runner.py b/src/eva/metrics/runner.py
@@ -620,6 +620,32 @@ def _build_per_metric_aggregates(
                         "count": count,
                     }
 
+        # Nest with/without tool call breakdowns inside the response_speed aggregate
+        if "response_speed" in metric_names and "response_speed" in metric_aggregates:
+            for sub_key in ("with_tool_calls", "no_tool_calls"):
+                sub_scores: list[float] = []
+                sub_missing = 0
+                for record_metrics in all_metrics.values():
+                    rs = record_metrics.metrics.get("response_speed")
+                    if rs is None or rs.error is not None:
+                        sub_missing += 1
+                        continue
+                    sub_details = (rs.details or {}).get(sub_key)
+                    if sub_details and sub_details.get("mean_speed_seconds") is not None:
+                        sub_scores.append(sub_details["mean_speed_seconds"])
+                    else:
+                        sub_missing += 1
+                if sub_scores or sub_missing > 0:
+                    metric_aggregates["response_speed"][sub_key] = {
+                        "mean": round(sum(sub_scores) / len(sub_scores), 4) if sub_scores else None,
+                        "min": round(min(sub_scores), 4) if sub_scores else None,
+                        "max": round(max(sub_scores), 4) if sub_scores else None,
+                        "count": len(sub_scores),
+                        "none_count": sub_missing,
+                        "missing_count": sub_missing,
+                        "total_records": total_records,
+                    }
+
         return metric_aggregates
 
     @staticmethod

diff --git a/src/eva/models/config.py b/src/eva/models/config.py
@@ -648,10 +648,17 @@ def apply_env_overrides(self, live: "RunConfig") -> None:
             if not has_redacted:
                 continue
             if name not in live_by_name:
-                raise ValueError(
-                    f"Cannot restore secrets: deployment {name!r} not found in "
-                    f"current EVA_MODEL_LIST (available: {list(live_by_name)})"
+                active_llm = getattr(self.model, "llm", None)
+                if name == active_llm:
+                    raise ValueError(
+                        f"Cannot restore secrets: deployment {name!r} not found in "
+                        f"current EVA_MODEL_LIST (available: {list(live_by_name)})"
+                    )
+                logger.warning(
+                    f"Deployment {name!r} has redacted secrets but is not in the current "
+                    f"EVA_MODEL_LIST — skipping (not used in this run)."
                 )
+                continue
             live_params = live_by_name[name].get("litellm_params", {})
             for key, value in saved_params.items():
                 if value == "***" and key in live_params: