diff --git a/apps/analysis.py b/apps/analysis.py
index 40516a78..aba9c101 100644
--- a/apps/analysis.py
+++ b/apps/analysis.py
@@ -44,6 +44,9 @@ def _build_metric_group_map() -> dict[str, str]:
 
 
 _METRIC_GROUP: dict[str, str] = _build_metric_group_map()
+# Synthetic columns derived from response_speed details sub-fields
+_METRIC_GROUP["response_speed_with_tool_calls"] = "Diagnostic"
+_METRIC_GROUP["response_speed_no_tool_calls"] = "Diagnostic"
 
 # Ordered categories for display; anything not listed sorts to the end
 _CATEGORY_ORDER = ["Accuracy", "Experience", "Conversation Quality", "Diagnostic", "Validation"]
@@ -76,7 +79,7 @@ def _build_metric_group_map() -> dict[str, str]:
     "Other": "#AAAAAA",
 }
 
-_NON_NORMALIZED_METRICS = {"response_speed"}
+_NON_NORMALIZED_METRICS = {"response_speed", "response_speed_with_tool_calls", "response_speed_no_tool_calls"}
 
 # EVA composite scores to show in the bar chart
 _EVA_BAR_COMPOSITES = ["EVA-A_pass", "EVA-X_pass", "EVA-A_mean", "EVA-X_mean"]
@@ -545,6 +548,15 @@ def _collect_run_metrics(run_dir: Path) -> tuple[list[dict], list[str]]:
                 else metric_score.score
             )
 
+            if metric_name == "response_speed" and metric_score.details:
+                details = metric_score.details
+                with_tc = details.get("with_tool_calls") or {}
+                no_tc = details.get("no_tool_calls") or {}
+                row["response_speed_with_tool_calls"] = with_tc.get("mean_speed_seconds")
+                row["response_speed_no_tool_calls"] = no_tc.get("mean_speed_seconds")
+                all_metric_names.add("response_speed_with_tool_calls")
+                all_metric_names.add("response_speed_no_tool_calls")
+
         rows.append(row)
 
     return rows, sorted(all_metric_names)
@@ -970,6 +982,13 @@ def render_cross_run_comparison(run_dirs: list[Path]):
         for m, stats in per_metric.items():
             if stats.get("mean") is not None:
                 summary[m] = stats["mean"]
+            # Expose response_speed sub-field means as synthetic columns
+            for sub_key in ("with_tool_calls", "no_tool_calls"):
+                sub = stats.get(sub_key)
+                if sub and sub.get("mean") is not None:
+                    col = f"{m}_{sub_key}"
+                    summary[col] = sub["mean"]
+                    all_metric_names.add(col)
         # Add EVA composite scores from overall_scores
         overall = metrics_summary.get("overall_scores", {})
         for composite in _EVA_BAR_COMPOSITES:
diff --git a/pyproject.toml b/pyproject.toml
index 561cba2d..47827e98 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -101,7 +101,7 @@ ignore = ["D203", "D206", "D213", "D400", "D401", "D413", "D415", "E1", "E501"]
 simulation_version = "0.1.0"
 # Bump when metrics pipeline changes (metrics code, judge prompts, pricing,
 # postprocessor). Old metric results become stale — cheap to recompute.
-metrics_version = "0.1.1"
+metrics_version = "0.1.2"
 
 [tool.mypy]
 python_version = "3.11"
diff --git a/src/eva/__init__.py b/src/eva/__init__.py
index 6796f4aa..03f1f13b 100644
--- a/src/eva/__init__.py
+++ b/src/eva/__init__.py
@@ -11,4 +11,4 @@
 
 # Bump metrics_version when changes affect metric computation (metrics code,
 # judge prompts, pricing tables, postprocessor).
-metrics_version = "0.1.1"
+metrics_version = "0.1.2"
diff --git a/src/eva/metrics/diagnostic/response_speed.py b/src/eva/metrics/diagnostic/response_speed.py
index 0dd4fb53..7dce04f6 100644
--- a/src/eva/metrics/diagnostic/response_speed.py
+++ b/src/eva/metrics/diagnostic/response_speed.py
@@ -4,54 +4,114 @@
 final evaluation scores.
 """
 
+import json
+from pathlib import Path
+
 from eva.metrics.base import CodeMetric, MetricContext
 from eva.metrics.registry import register_metric
 from eva.models.results import MetricScore
 
+
+def _load_per_turn_latency(context: MetricContext) -> dict[str, float]:
+    """Load turn_taking per_turn_latency from the record's metrics.json.
+
+    Returns an empty dict if the data is unavailable.
+    """
+    if not context.output_dir:
+        return {}
+
+    metrics_path = Path(context.output_dir) / "metrics.json"
+    if not metrics_path.exists():
+        return {}
+
+    with open(metrics_path) as f:
+        data = json.load(f)
+
+    return data.get("metrics", {}).get("turn_taking", {}).get("details", {}).get("per_turn_latency", {})
+
+
+def _split_by_tool_calls(
+    per_turn_latency: dict[str, float],
+    context: MetricContext,
+) -> tuple[list[float], list[float]]:
+    """Partition per_turn_latency values into (with_tool_calls, no_tool_calls).
+
+    Checks conversation_trace to determine which turn_ids had at least one tool call.
+    """
+    tool_call_turn_ids = {
+        entry["turn_id"] for entry in (context.conversation_trace or []) if entry.get("type") == "tool_call"
+    }
+
+    with_tool: list[float] = []
+    no_tool: list[float] = []
+    for turn_id_str, latency in per_turn_latency.items():
+        if int(turn_id_str) in tool_call_turn_ids:
+            with_tool.append(latency)
+        else:
+            no_tool.append(latency)
+
+    return with_tool, no_tool
+
+
+def _compute_speed_stats(latencies: list[float]) -> dict | None:
+    """Compute summary stats for a list of latencies, applying the sanity filter.
+
+    Returns None if no valid values remain after filtering.
+    """
+    valid = [v for v in latencies if 0 < v < 1000]
+    if not valid:
+        return None
+    return {
+        "mean_speed_seconds": round(sum(valid) / len(valid), 3),
+        "max_speed_seconds": round(max(valid), 3),
+        "num_turns": len(valid),
+        "per_turn_speeds": [round(v, 3) for v in valid],
+    }
+
+
 @register_metric
 class ResponseSpeedMetric(CodeMetric):
     """Response speed metric.
 
     Measures the elapsed time between the end of the user's utterance
-    and the beginning of the assistant's response.
+    and the beginning of the assistant's response, using per_turn_latency
+    from the turn_taking metric.
 
     Reports raw latency values in seconds — no normalization applied.
 
+    Details include a breakdown by turns with and without tool calls.
+
     This is a diagnostic metric used for diagnosing model performance issues.
     It is not directly used in final evaluation scores.
     """
 
     name = "response_speed"
-    description = "Debug metric: latency between user utterance end and assistant response start"
     category = "diagnostic"
+    description = "Diagnostic metric: latency between user utterance end and assistant response start"
    exclude_from_pass_at_k = True
 
     async def compute(self, context: MetricContext) -> MetricScore:
-        """Compute response speed from Pipecat's UserBotLatencyObserver measurements."""
         try:
-            # Check if we have response speed latencies from UserBotLatencyObserver
-            if not context.response_speed_latencies:
+            per_turn_latency = _load_per_turn_latency(context)
+
+            if not per_turn_latency:
                 return MetricScore(
                     name=self.name,
                     score=0.0,
                     normalized_score=None,
-                    error="No response latencies available (UserBotLatencyObserver data missing)",
+                    error="No response latencies available (turn_taking per_turn_latency data missing)",
                 )
 
-            # Use latencies measured by Pipecat's UserBotLatencyObserver
-            # These measure the time from user stopped speaking to assistant started speaking
+            all_latencies = list(per_turn_latency.values())
             speeds = []
             per_turn_speeds = []
-
-            for response_speed in context.response_speed_latencies:
-                # Filter out invalid values (negative or extremely large)
-                if 0 < response_speed < 1000:  # Sanity check: under 1000 seconds
-                    speeds.append(response_speed)
-                    per_turn_speeds.append(round(response_speed, 3))
+            for latency in all_latencies:
+                if latency is not None and 0 < latency < 1000:
+                    speeds.append(latency)
+                    per_turn_speeds.append(round(latency, 3))
                 else:
                     self.logger.warning(
-                        f"[{context.record_id}] Unusual response speed detected and dropped: {response_speed} seconds"
+                        f"[{context.record_id}] Unusual response speed detected and dropped: {latency} seconds"
                     )
 
             if not speeds:
@@ -63,17 +123,20 @@ async def compute(self, context: MetricContext) -> MetricScore:
                 )
 
             mean_speed = sum(speeds) / len(speeds)
-            max_speed = max(speeds)
+
+            with_tool, no_tool = _split_by_tool_calls(per_turn_latency, context)
 
             return MetricScore(
                 name=self.name,
-                score=round(mean_speed, 3),  # Mean response speed in seconds
-                normalized_score=None,  # Raw latency in seconds; not normalizable to [0,1]
+                score=round(mean_speed, 3),
+                normalized_score=None,
                 details={
                     "mean_speed_seconds": round(mean_speed, 3),
-                    "max_speed_seconds": round(max_speed, 3),
+                    "max_speed_seconds": round(max(speeds), 3),
                     "num_turns": len(speeds),
                     "per_turn_speeds": per_turn_speeds,
+                    "with_tool_calls": _compute_speed_stats(with_tool),
+                    "no_tool_calls": _compute_speed_stats(no_tool),
                 },
             )
diff --git a/src/eva/metrics/processor.py b/src/eva/metrics/processor.py
index 660e7ce9..94aa6e44 100644
--- a/src/eva/metrics/processor.py
+++ b/src/eva/metrics/processor.py
@@ -24,6 +24,19 @@
 
 logger = get_logger(__name__)
 
+
+def _resolve_path(stored: str | None, fallback: Path) -> str | Path:
+    """Return *stored* if it exists on disk, otherwise *fallback*.
+
+    Allows metrics to re-run correctly when a run directory has been moved:
+    the stored path reflects the original location, but the file is now at
+    *fallback* (i.e. output_dir / filename).
+    """
+    if stored and Path(stored).exists():
+        return stored
+    return fallback
+
+
 # Elevenlabs audio user field → _ProcessorContext attribute name
 AUDIO_ATTR = {
     "pipecat_agent": "audio_timestamps_assistant_turns",
@@ -824,8 +837,10 @@ def _build_history(
         Each entry: {timestamp_ms, source, event_type, data}.
         """
         history = self._load_audit_log_transcript(output_dir)
-        history.extend(self._load_pipecat_logs(result.pipecat_logs_path))
-        history.extend(self._load_elevenlabs_logs(result.elevenlabs_logs_path))
+        pipecat_path = _resolve_path(result.pipecat_logs_path, output_dir / "pipecat_logs.jsonl")
+        history.extend(self._load_pipecat_logs(pipecat_path))
+        elevenlabs_path = _resolve_path(result.elevenlabs_logs_path, output_dir / "elevenlabs_events.jsonl")
+        history.extend(self._load_elevenlabs_logs(elevenlabs_path))
 
         history.sort(key=lambda e: e["timestamp_ms"])
         context.history = history
diff --git a/src/eva/metrics/runner.py b/src/eva/metrics/runner.py
index 2d808d7a..892fcbe7 100644
--- a/src/eva/metrics/runner.py
+++ b/src/eva/metrics/runner.py
@@ -620,6 +620,32 @@ def _build_per_metric_aggregates(
                 "count": count,
             }
 
+        # Nest with/without tool call breakdowns inside the response_speed aggregate
+        if "response_speed" in metric_names and "response_speed" in metric_aggregates:
+            for sub_key in ("with_tool_calls", "no_tool_calls"):
+                sub_scores: list[float] = []
+                sub_missing = 0
+                for record_metrics in all_metrics.values():
+                    rs = record_metrics.metrics.get("response_speed")
+                    if rs is None or rs.error is not None:
+                        sub_missing += 1
+                        continue
+                    sub_details = (rs.details or {}).get(sub_key)
+                    if sub_details and sub_details.get("mean_speed_seconds") is not None:
+                        sub_scores.append(sub_details["mean_speed_seconds"])
+                    else:
+                        sub_missing += 1
+                if sub_scores or sub_missing > 0:
+                    metric_aggregates["response_speed"][sub_key] = {
+                        "mean": round(sum(sub_scores) / len(sub_scores), 4) if sub_scores else None,
+                        "min": round(min(sub_scores), 4) if sub_scores else None,
+                        "max": round(max(sub_scores), 4) if sub_scores else None,
+                        "count": len(sub_scores),
+                        "none_count": sub_missing,
+                        "missing_count": sub_missing,
+                        "total_records": total_records,
+                    }
+
         return metric_aggregates
 
     @staticmethod
diff --git a/src/eva/models/config.py b/src/eva/models/config.py
index e08783bd..f3885c54 100644
--- a/src/eva/models/config.py
+++ b/src/eva/models/config.py
@@ -648,10 +648,17 @@ def apply_env_overrides(self, live: "RunConfig") -> None:
             if not has_redacted:
                 continue
             if name not in live_by_name:
-                raise ValueError(
-                    f"Cannot restore secrets: deployment {name!r} not found in "
-                    f"current EVA_MODEL_LIST (available: {list(live_by_name)})"
+                active_llm = getattr(self.model, "llm", None)
+                if name == active_llm:
+                    raise ValueError(
+                        f"Cannot restore secrets: deployment {name!r} not found in "
+                        f"current EVA_MODEL_LIST (available: {list(live_by_name)})"
+                    )
+                logger.warning(
+                    f"Deployment {name!r} has redacted secrets but is not in the current "
+                    f"EVA_MODEL_LIST — skipping (not used in this run)."
                 )
+                continue
             live_params = live_by_name[name].get("litellm_params", {})
             for key, value in saved_params.items():
                 if value == "***" and key in live_params:
diff --git a/tests/unit/metrics/test_response_speed.py b/tests/unit/metrics/test_response_speed.py
index 8cb3ecfc..b9369a6e 100644
--- a/tests/unit/metrics/test_response_speed.py
+++ b/tests/unit/metrics/test_response_speed.py
@@ -1,18 +1,54 @@
 """Tests for the ResponseSpeedMetric."""
 
+import json
+
 import pytest
 
 from eva.metrics.diagnostic.response_speed import ResponseSpeedMetric
 
 from .conftest import make_metric_context
 
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _write_metrics_json(tmp_path, per_turn_latency: dict) -> None:
+    """Write a minimal metrics.json with turn_taking per_turn_latency data."""
+    data = {
+        "metrics": {
+            "turn_taking": {
+                "details": {
+                    "per_turn_latency": per_turn_latency,
+                }
+            }
+        }
+    }
+    (tmp_path / "metrics.json").write_text(json.dumps(data))
+
+
+def _make_trace(tool_call_turn_ids: set[int], all_turn_ids: set[int]) -> list[dict]:
+    """Build a minimal conversation_trace with the given turn structure."""
+    trace = []
+    for tid in sorted(all_turn_ids):
+        trace.append({"turn_id": tid, "type": "transcribed", "content": "user utterance"})
+        if tid in tool_call_turn_ids:
+            trace.append({"turn_id": tid, "type": "tool_call", "tool_name": "some_tool"})
+            trace.append({"turn_id": tid, "type": "tool_response", "tool_name": "some_tool"})
+    return trace
+
+
+# ---------------------------------------------------------------------------
+# ResponseSpeedMetric
+# ---------------------------------------------------------------------------
+
 
 class TestResponseSpeedMetric:
     @pytest.mark.asyncio
-    async def test_no_latencies_none(self):
-        """None latencies returns error."""
+    async def test_no_output_dir(self):
+        """Missing output_dir returns error — no per_turn_latency data."""
         metric = ResponseSpeedMetric()
-        ctx = make_metric_context(response_speed_latencies=None)
+        ctx = make_metric_context()
 
         result = await metric.compute(ctx)
 
@@ -20,13 +56,25 @@ async def test_no_latencies_none(self):
         assert result.score == 0.0
         assert result.normalized_score is None
         assert result.error is not None
-        assert "No response latencies" in result.error
+        assert "turn_taking" in result.error
+
+    @pytest.mark.asyncio
+    async def test_missing_metrics_json(self, tmp_path):
+        """output_dir exists but has no metrics.json — returns error."""
+        metric = ResponseSpeedMetric()
+        ctx = make_metric_context(output_dir=tmp_path)
+
+        result = await metric.compute(ctx)
+
+        assert result.score == 0.0
+        assert result.error is not None
 
     @pytest.mark.asyncio
-    async def test_no_latencies_empty(self):
-        """Empty list returns error."""
+    async def test_missing_turn_taking_data(self, tmp_path):
+        """metrics.json exists but has no turn_taking entry — returns error."""
+        (tmp_path / "metrics.json").write_text(json.dumps({"metrics": {}}))
         metric = ResponseSpeedMetric()
-        ctx = make_metric_context(response_speed_latencies=[])
+        ctx = make_metric_context(output_dir=tmp_path)
 
         result = await metric.compute(ctx)
 
@@ -34,10 +82,11 @@
         assert result.error is not None
 
     @pytest.mark.asyncio
-    async def test_valid_latencies(self):
-        """Valid latencies produce correct mean, max, and per-turn details."""
+    async def test_valid_latencies(self, tmp_path):
+        """Valid per_turn_latency produces correct mean, max, and per-turn details."""
+        _write_metrics_json(tmp_path, {"1": 1.0, "2": 2.0, "3": 3.0})
         metric = ResponseSpeedMetric()
-        ctx = make_metric_context(response_speed_latencies=[1.0, 2.0, 3.0])
+        ctx = make_metric_context(output_dir=tmp_path)
 
         result = await metric.compute(ctx)
 
@@ -47,29 +96,28 @@
         assert result.details["mean_speed_seconds"] == pytest.approx(2.0)
         assert result.details["max_speed_seconds"] == pytest.approx(3.0)
         assert result.details["num_turns"] == 3
-        assert result.details["per_turn_speeds"] == [1.0, 2.0, 3.0]
 
     @pytest.mark.asyncio
-    async def test_filters_invalid_values(self):
+    async def test_filters_invalid_values(self, tmp_path):
         """Negative and >1000s values are filtered out."""
+        _write_metrics_json(tmp_path, {"1": -1.0, "2": 0.5, "3": 1500.0, "4": 2.5, "5": 0.0})
         metric = ResponseSpeedMetric()
-        ctx = make_metric_context(response_speed_latencies=[-1.0, 0.5, 1500.0, 2.5, 0.0])
+        ctx = make_metric_context(output_dir=tmp_path)
 
         result = await metric.compute(ctx)
 
         # Only 0.5 and 2.5 are valid (0 < x < 1000); 0.0 is excluded (not > 0)
         assert result.error is None
         assert result.details["num_turns"] == 2
-        expected_mean = (0.5 + 2.5) / 2
-        assert result.score == pytest.approx(expected_mean)
+        assert result.score == pytest.approx((0.5 + 2.5) / 2)
         assert result.details["max_speed_seconds"] == pytest.approx(2.5)
-        assert result.details["per_turn_speeds"] == [0.5, 2.5]
 
     @pytest.mark.asyncio
-    async def test_all_latencies_filtered_out(self):
+    async def test_all_latencies_filtered_out(self, tmp_path):
         """When all values are invalid, returns error."""
+        _write_metrics_json(tmp_path, {"1": -5.0, "2": 0.0, "3": 2000.0})
         metric = ResponseSpeedMetric()
-        ctx = make_metric_context(response_speed_latencies=[-5.0, 0.0, 2000.0])
+        ctx = make_metric_context(output_dir=tmp_path)
 
         result = await metric.compute(ctx)
 
@@ -79,10 +127,11 @@
         assert "No valid response speeds" in result.error
 
     @pytest.mark.asyncio
-    async def test_single_latency_value(self):
+    async def test_single_latency_value(self, tmp_path):
         """Single valid latency works correctly."""
+        _write_metrics_json(tmp_path, {"1": 0.75})
         metric = ResponseSpeedMetric()
-        ctx = make_metric_context(response_speed_latencies=[0.75])
+        ctx = make_metric_context(output_dir=tmp_path)
 
         result = await metric.compute(ctx)
 
@@ -91,3 +140,87 @@
         assert result.details["max_speed_seconds"] == pytest.approx(0.75)
         assert result.details["num_turns"] == 1
         assert result.details["per_turn_speeds"] == [0.75]
+
+    @pytest.mark.asyncio
+    async def test_no_tool_call_breakdown_without_trace(self, tmp_path):
+        """with_tool_calls is None and no_tool_calls covers all turns when trace is absent."""
+        _write_metrics_json(tmp_path, {"1": 1.0, "2": 2.0})
+        metric = ResponseSpeedMetric()
+        ctx = make_metric_context(output_dir=tmp_path)
+
+        result = await metric.compute(ctx)
+
+        assert result.error is None
+        # No trace → no tool call turn ids → all turns go into no_tool bucket
+        assert result.details["with_tool_calls"] is None
+        assert result.details["no_tool_calls"] is not None
+        assert result.details["no_tool_calls"]["num_turns"] == 2
+
+    @pytest.mark.asyncio
+    async def test_tool_call_breakdown_mixed_turns(self, tmp_path):
+        """with_tool_calls and no_tool_calls sub-fields reflect the correct split."""
+        _write_metrics_json(tmp_path, {"1": 1.0, "2": 5.0, "3": 3.0, "4": 7.0})
+        trace = _make_trace(tool_call_turn_ids={2, 4}, all_turn_ids={1, 2, 3, 4})
+        metric = ResponseSpeedMetric()
+        ctx = make_metric_context(output_dir=tmp_path, conversation_trace=trace)
+
+        result = await metric.compute(ctx)
+
+        assert result.error is None
+        with_tc = result.details["with_tool_calls"]
+        no_tc = result.details["no_tool_calls"]
+        assert with_tc is not None
+        assert no_tc is not None
+        assert with_tc["num_turns"] == 2
+        assert with_tc["mean_speed_seconds"] == pytest.approx((5.0 + 7.0) / 2)
+        assert with_tc["max_speed_seconds"] == pytest.approx(7.0)
+        assert no_tc["num_turns"] == 2
+        assert no_tc["mean_speed_seconds"] == pytest.approx((1.0 + 3.0) / 2)
+        assert no_tc["max_speed_seconds"] == pytest.approx(3.0)
+
+    @pytest.mark.asyncio
+    async def test_tool_call_breakdown_all_tool_turns(self, tmp_path):
+        """no_tool_calls is None when every turn has a tool call."""
+        _write_metrics_json(tmp_path, {"1": 2.0, "2": 4.0})
+        trace = _make_trace(tool_call_turn_ids={1, 2}, all_turn_ids={1, 2})
+        metric = ResponseSpeedMetric()
+        ctx = make_metric_context(output_dir=tmp_path, conversation_trace=trace)
+
+        result = await metric.compute(ctx)
+
+        assert result.error is None
+        assert result.details["with_tool_calls"] is not None
+        assert result.details["with_tool_calls"]["num_turns"] == 2
+        assert result.details["no_tool_calls"] is None
+
+    @pytest.mark.asyncio
+    async def test_tool_call_breakdown_filters_invalid_latencies(self, tmp_path):
+        """Sanity filter (0 < x < 1000) applies within the breakdown sub-fields."""
+        _write_metrics_json(tmp_path, {"1": -1.0, "2": 5.0, "3": 2000.0, "4": 3.0})
+        trace = _make_trace(tool_call_turn_ids={1, 2, 3, 4}, all_turn_ids={1, 2, 3, 4})
+        metric = ResponseSpeedMetric()
+        ctx = make_metric_context(output_dir=tmp_path, conversation_trace=trace)
+
+        result = await metric.compute(ctx)
+
+        assert result.error is None
+        with_tc = result.details["with_tool_calls"]
+        assert with_tc is not None
+        assert with_tc["num_turns"] == 2  # only 5.0 and 3.0 pass the filter
+
+    @pytest.mark.asyncio
+    async def test_with_and_no_tool_split_is_exhaustive(self, tmp_path):
+        """with_tool + no_tool latencies together cover all per_turn_latency values."""
+        per_turn = {"1": 1.0, "2": 5.0, "3": 3.0, "4": 7.0, "5": 2.0}
+        _write_metrics_json(tmp_path, per_turn)
+        trace = _make_trace(tool_call_turn_ids={2, 4}, all_turn_ids={1, 2, 3, 4, 5})
+        metric = ResponseSpeedMetric()
+        ctx = make_metric_context(output_dir=tmp_path, conversation_trace=trace)
+
+        result = await metric.compute(ctx)
+
+        assert result.error is None
+        combined = (
+            result.details["with_tool_calls"]["per_turn_speeds"] + result.details["no_tool_calls"]["per_turn_speeds"]
+        )
+        assert sorted(combined) == sorted(per_turn.values())