Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 20 additions & 1 deletion apps/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,9 @@ def _build_metric_group_map() -> dict[str, str]:


_METRIC_GROUP: dict[str, str] = _build_metric_group_map()
# Synthetic columns derived from response_speed details sub-fields
_METRIC_GROUP["response_speed_with_tool_calls"] = "Diagnostic"
_METRIC_GROUP["response_speed_no_tool_calls"] = "Diagnostic"

# Ordered categories for display; anything not listed sorts to the end
_CATEGORY_ORDER = ["Accuracy", "Experience", "Conversation Quality", "Diagnostic", "Validation"]
Expand Down Expand Up @@ -76,7 +79,7 @@ def _build_metric_group_map() -> dict[str, str]:
"Other": "#AAAAAA",
}

_NON_NORMALIZED_METRICS = {"response_speed"}
_NON_NORMALIZED_METRICS = {"response_speed", "response_speed_with_tool_calls", "response_speed_no_tool_calls"}

# EVA composite scores to show in the bar chart
_EVA_BAR_COMPOSITES = ["EVA-A_pass", "EVA-X_pass", "EVA-A_mean", "EVA-X_mean"]
Expand Down Expand Up @@ -545,6 +548,15 @@ def _collect_run_metrics(run_dir: Path) -> tuple[list[dict], list[str]]:
else metric_score.score
)

if metric_name == "response_speed" and metric_score.details:
details = metric_score.details
with_tc = details.get("with_tool_calls") or {}
no_tc = details.get("no_tool_calls") or {}
row["response_speed_with_tool_calls"] = with_tc.get("mean_speed_seconds")
row["response_speed_no_tool_calls"] = no_tc.get("mean_speed_seconds")
all_metric_names.add("response_speed_with_tool_calls")
all_metric_names.add("response_speed_no_tool_calls")

rows.append(row)

return rows, sorted(all_metric_names)
Expand Down Expand Up @@ -970,6 +982,13 @@ def render_cross_run_comparison(run_dirs: list[Path]):
for m, stats in per_metric.items():
if stats.get("mean") is not None:
summary[m] = stats["mean"]
# Expose response_speed sub-field means as synthetic columns
for sub_key in ("with_tool_calls", "no_tool_calls"):
sub = stats.get(sub_key)
if sub and sub.get("mean") is not None:
col = f"{m}_{sub_key}"
summary[col] = sub["mean"]
all_metric_names.add(col)
# Add EVA composite scores from overall_scores
overall = metrics_summary.get("overall_scores", {})
for composite in _EVA_BAR_COMPOSITES:
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ ignore = ["D203", "D206", "D213", "D400", "D401", "D413", "D415", "E1", "E501"]
simulation_version = "0.1.0"
# Bump when metrics pipeline changes (metrics code, judge prompts, pricing,
# postprocessor). Old metric results become stale — cheap to recompute.
metrics_version = "0.1.1"
metrics_version = "0.1.2"

[tool.mypy]
python_version = "3.11"
Expand Down
2 changes: 1 addition & 1 deletion src/eva/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,4 @@

# Bump metrics_version when changes affect metric computation (metrics code,
# judge prompts, pricing tables, postprocessor).
metrics_version = "0.1.1"
metrics_version = "0.1.2"
101 changes: 82 additions & 19 deletions src/eva/metrics/diagnostic/response_speed.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,54 +4,114 @@
final evaluation scores.
"""

import json
from pathlib import Path

from eva.metrics.base import CodeMetric, MetricContext
from eva.metrics.registry import register_metric
from eva.models.results import MetricScore


def _load_per_turn_latency(context: MetricContext) -> dict[str, float]:
    """Load turn_taking per_turn_latency from the record's metrics.json.

    Reads ``<context.output_dir>/metrics.json`` and walks the path
    ``metrics -> turn_taking -> details -> per_turn_latency``.

    Returns an empty dict if the data is unavailable: no output directory,
    no metrics.json on disk, or any level of that path missing, ``null`` or
    not a JSON object.

    Raises:
        json.JSONDecodeError: If metrics.json exists but is not valid JSON.
    """
    if not context.output_dir:
        return {}

    metrics_path = Path(context.output_dir) / "metrics.json"
    if not metrics_path.exists():
        return {}

    # Read as UTF-8 explicitly rather than relying on the platform default encoding.
    with open(metrics_path, encoding="utf-8") as f:
        node = json.load(f)

    # A key that is present but null defeats a ``.get(key, {})`` default
    # (``None.get`` raises AttributeError), so check the type at every level.
    for key in ("metrics", "turn_taking", "details", "per_turn_latency"):
        if not isinstance(node, dict):
            return {}
        node = node.get(key)

    return node if isinstance(node, dict) else {}


def _split_by_tool_calls(
    per_turn_latency: dict[str, float],
    context: MetricContext,
) -> tuple[list[float], list[float]]:
    """Partition per_turn_latency values into (with_tool_calls, no_tool_calls).

    Checks conversation_trace to determine which turn_ids had at least one
    entry whose ``type`` is ``"tool_call"``. Latencies keep the iteration
    order of ``per_turn_latency`` within each list.

    Malformed data does not abort the split: a tool_call entry without a
    ``turn_id`` is ignored, and a latency key that is not an integer string
    is compared as-is (so it normally lands in ``no_tool_calls``).
    """

    def _turn_key(raw: object) -> object:
        # Latency keys are strings while trace turn ids are presumably ints
        # (TODO confirm against the trace producer); normalize both sides to
        # int where possible so "3" and 3 refer to the same turn.
        try:
            return int(raw)  # type: ignore[call-overload]
        except (TypeError, ValueError):
            return raw

    tool_call_turn_ids = {
        _turn_key(entry["turn_id"])
        for entry in (context.conversation_trace or [])
        if entry.get("type") == "tool_call" and entry.get("turn_id") is not None
    }

    with_tool: list[float] = []
    no_tool: list[float] = []
    for turn_id_str, latency in per_turn_latency.items():
        bucket = with_tool if _turn_key(turn_id_str) in tool_call_turn_ids else no_tool
        bucket.append(latency)

    return with_tool, no_tool


def _compute_speed_stats(latencies: list[float]) -> dict | None:
"""Compute summary stats for a list of latencies, applying the sanity filter.

Returns None if no valid values remain after filtering.
"""
valid = [v for v in latencies if 0 < v < 1000]
if not valid:
return None
return {
"mean_speed_seconds": round(sum(valid) / len(valid), 3),
"max_speed_seconds": round(max(valid), 3),
"num_turns": len(valid),
"per_turn_speeds": [round(v, 3) for v in valid],
}


@register_metric
class ResponseSpeedMetric(CodeMetric):
"""Response speed metric.

Measures the elapsed time between the end of the user's utterance
and the beginning of the assistant's response.
and the beginning of the assistant's response, using per_turn_latency
from the turn_taking metric.

Reports raw latency values in seconds — no normalization applied.

Details include a breakdown by turns with and without tool calls.

This is a diagnostic metric used for diagnosing model performance issues.
It is not directly used in final evaluation scores.
"""

name = "response_speed"
description = "Debug metric: latency between user utterance end and assistant response start"
category = "diagnostic"
description = "Diagnostic metric: latency between user utterance end and assistant response start"
exclude_from_pass_at_k = True

async def compute(self, context: MetricContext) -> MetricScore:
"""Compute response speed from Pipecat's UserBotLatencyObserver measurements."""
try:
# Check if we have response speed latencies from UserBotLatencyObserver
if not context.response_speed_latencies:
per_turn_latency = _load_per_turn_latency(context)

if not per_turn_latency:
return MetricScore(
name=self.name,
score=0.0,
normalized_score=None,
error="No response latencies available (UserBotLatencyObserver data missing)",
error="No response latencies available (turn_taking per_turn_latency data missing)",
)

# Use latencies measured by Pipecat's UserBotLatencyObserver
# These measure the time from user stopped speaking to assistant started speaking
all_latencies = list(per_turn_latency.values())
speeds = []
per_turn_speeds = []

for response_speed in context.response_speed_latencies:
# Filter out invalid values (negative or extremely large)
if 0 < response_speed < 1000: # Sanity check: under 1000 seconds
speeds.append(response_speed)
per_turn_speeds.append(round(response_speed, 3))
for latency in all_latencies:
if 0 < latency < 1000:
speeds.append(latency)
per_turn_speeds.append(round(latency, 3))
else:
self.logger.warning(
f"[{context.record_id}] Unusual response speed detected and dropped: {response_speed} seconds"
f"[{context.record_id}] Unusual response speed detected and dropped: {latency} seconds"
)

if not speeds:
Expand All @@ -63,17 +123,20 @@ async def compute(self, context: MetricContext) -> MetricScore:
)

mean_speed = sum(speeds) / len(speeds)
max_speed = max(speeds)

with_tool, no_tool = _split_by_tool_calls(per_turn_latency, context)

return MetricScore(
name=self.name,
score=round(mean_speed, 3), # Mean response speed in seconds
normalized_score=None, # Raw latency in seconds; not normalizable to [0,1]
score=round(mean_speed, 3),
normalized_score=None,
details={
"mean_speed_seconds": round(mean_speed, 3),
"max_speed_seconds": round(max_speed, 3),
"max_speed_seconds": round(max(speeds), 3),
"num_turns": len(speeds),
"per_turn_speeds": per_turn_speeds,
"with_tool_calls": _compute_speed_stats(with_tool),
"no_tool_calls": _compute_speed_stats(no_tool),
},
)

Expand Down
19 changes: 17 additions & 2 deletions src/eva/metrics/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,19 @@

logger = get_logger(__name__)


def _resolve_path(stored: str | None, fallback: Path) -> str | Path:
"""Return *stored* if it exists on disk, otherwise *fallback*.

Allows metrics to re-run correctly when a run directory has been moved:
the stored path reflects the original location, but the file is now at
*fallback* (i.e. output_dir / filename).
"""
if stored and Path(stored).exists():
return stored
return fallback


# Elevenlabs audio user field → _ProcessorContext attribute name
AUDIO_ATTR = {
"pipecat_agent": "audio_timestamps_assistant_turns",
Expand Down Expand Up @@ -824,8 +837,10 @@ def _build_history(
Each entry: {timestamp_ms, source, event_type, data}.
"""
history = self._load_audit_log_transcript(output_dir)
history.extend(self._load_pipecat_logs(result.pipecat_logs_path))
history.extend(self._load_elevenlabs_logs(result.elevenlabs_logs_path))
pipecat_path = _resolve_path(result.pipecat_logs_path, output_dir / "pipecat_logs.jsonl")
history.extend(self._load_pipecat_logs(pipecat_path))
elevenlabs_path = _resolve_path(result.elevenlabs_logs_path, output_dir / "elevenlabs_events.jsonl")
history.extend(self._load_elevenlabs_logs(elevenlabs_path))

history.sort(key=lambda e: e["timestamp_ms"])
context.history = history
Expand Down
26 changes: 26 additions & 0 deletions src/eva/metrics/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -620,6 +620,32 @@ def _build_per_metric_aggregates(
"count": count,
}

# Nest with/without tool call breakdowns inside the response_speed aggregate
if "response_speed" in metric_names and "response_speed" in metric_aggregates:
for sub_key in ("with_tool_calls", "no_tool_calls"):
sub_scores: list[float] = []
sub_missing = 0
for record_metrics in all_metrics.values():
rs = record_metrics.metrics.get("response_speed")
if rs is None or rs.error is not None:
sub_missing += 1
continue
sub_details = (rs.details or {}).get(sub_key)
if sub_details and sub_details.get("mean_speed_seconds") is not None:
sub_scores.append(sub_details["mean_speed_seconds"])
else:
sub_missing += 1
if sub_scores or sub_missing > 0:
metric_aggregates["response_speed"][sub_key] = {
"mean": round(sum(sub_scores) / len(sub_scores), 4) if sub_scores else None,
"min": round(min(sub_scores), 4) if sub_scores else None,
"max": round(max(sub_scores), 4) if sub_scores else None,
"count": len(sub_scores),
"none_count": sub_missing,
"missing_count": sub_missing,
"total_records": total_records,
}

return metric_aggregates

@staticmethod
Expand Down
13 changes: 10 additions & 3 deletions src/eva/models/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -648,10 +648,17 @@ def apply_env_overrides(self, live: "RunConfig") -> None:
if not has_redacted:
continue
if name not in live_by_name:
raise ValueError(
f"Cannot restore secrets: deployment {name!r} not found in "
f"current EVA_MODEL_LIST (available: {list(live_by_name)})"
active_llm = getattr(self.model, "llm", None)
if name == active_llm:
raise ValueError(
f"Cannot restore secrets: deployment {name!r} not found in "
f"current EVA_MODEL_LIST (available: {list(live_by_name)})"
)
logger.warning(
f"Deployment {name!r} has redacted secrets but is not in the current "
f"EVA_MODEL_LIST — skipping (not used in this run)."
)
continue
live_params = live_by_name[name].get("litellm_params", {})
for key, value in saved_params.items():
if value == "***" and key in live_params:
Expand Down
Loading
Loading