Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions docs/metric_context.md
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ Counts and flags computed during benchmark execution.
- `"transfer"`: Assistant transferred to live agent
- `"error"`: An error occurred
- **`duration_seconds: float`** - Total duration of the conversation in seconds.
- **`is_audio_native: bool`** - Whether this conversation used an audio-native architecture. Metrics should check this flag to adjust behavior (e.g., audio-native uses intended user text in conversation_trace).
- **`pipeline_type: PipelineType`** - The pipeline architecture used (`CASCADE`, `AUDIO_LLM`, or `S2S`). Access `context.is_audio_native` for a convenience boolean that returns `True` for both `AUDIO_LLM` and `S2S`.
- **`latency_assistant_turns: dict[int, float]`** - Per-turn latency in seconds (user speech end to assistant speech start), keyed by turn ID.

### File Paths
Expand Down Expand Up @@ -212,11 +212,11 @@ The LLM processes **transcribed text**, so `transcribed_user_turns` reflects wha

The model processes **raw audio**. The audit log may contain a transcript from the service's own secondary STT, but this is **not what the model used** — it's just for reference. This is why `transcribed_user_turns` is unreliable for audio-native models and `intended_user_turns` should be used instead.

Check `context.is_audio_native` (audio-native) to determine which mode was used.
Check `context.pipeline_type` to determine which mode was used, or `context.is_audio_native` for a boolean grouping of `S2S` and `AUDIO_LLM`.

### Writing Audio-Native-Aware Metrics

If your metric needs user text directly (rather than via `conversation_trace`, which handles this automatically), branch on `context.is_audio_native` (audio-native):
If your metric needs user text directly (rather than via `conversation_trace`, which handles this automatically), branch on `context.is_audio_native`:

```python
async def compute(self, context: MetricContext) -> MetricScore:
Expand Down
9 changes: 7 additions & 2 deletions src/eva/metrics/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
resolve_turn_id,
validate_rating,
)
from eva.models.config import PipelineType
from eva.models.results import MetricScore
from eva.utils.llm_client import LLMClient
from eva.utils.logging import get_logger
Expand Down Expand Up @@ -84,7 +85,7 @@ def __init__(
latency_assistant_turns: dict[int, float] | None = None,
assistant_interrupted_turns: set[int] | None = None,
user_interrupted_turns: set[int] | None = None,
is_audio_native: bool = False,
pipeline_type: PipelineType = PipelineType.CASCADE,
):
self.record_id = record_id

Expand Down Expand Up @@ -134,7 +135,11 @@ def __init__(
self.latency_assistant_turns = latency_assistant_turns or {}
self.assistant_interrupted_turns = assistant_interrupted_turns or set()
self.user_interrupted_turns = user_interrupted_turns or set()
self.is_audio_native = is_audio_native
self.pipeline_type = pipeline_type

@property
def is_audio_native(self) -> bool:
    """Convenience flag: True for the audio-native pipelines (S2S and AUDIO_LLM)."""
    audio_native_pipelines = {PipelineType.S2S, PipelineType.AUDIO_LLM}
    return self.pipeline_type in audio_native_pipelines

def to_dict(self) -> dict[str, Any]:
"""Convert MetricContext to a serializable dictionary."""
Expand Down
95 changes: 62 additions & 33 deletions src/eva/metrics/processor.py

Large diffs are not rendered by default.

8 changes: 3 additions & 5 deletions src/eva/metrics/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from eva.metrics.base import BaseMetric, MetricContext
from eva.metrics.processor import MetricsContextProcessor
from eva.metrics.registry import MetricRegistry, get_global_registry
from eva.models.config import is_audio_native_pipeline
from eva.models.config import PipelineType, get_pipeline_type
from eva.models.record import EvaluationRecord
from eva.models.results import ConversationResult, MetricScore, PassAtKResult, RecordMetrics
from eva.utils.hash_utils import get_dict_hash
Expand Down Expand Up @@ -130,7 +130,7 @@ def _load_agent_config(self) -> dict[str, Any]:

# Determine pipeline type from config (fallback to CASCADE for legacy runs)
model_data = config_data.get("model", {})
self._is_audio_native = is_audio_native_pipeline(model_data) if model_data else False
self._pipeline_type = get_pipeline_type(model_data) if model_data else PipelineType.CASCADE

agent_config_path = config_data.get("agent_config_path")

Expand Down Expand Up @@ -429,9 +429,7 @@ def _load_context(self, record_id: str, record_dir: Path) -> MetricContext:
result = ConversationResult(**result_data)

# Use postprocessor to process logs and create enriched context
metrics_context = self.metrics_processor.process_record(
result, record_dir, is_audio_native=self._is_audio_native
)
metrics_context = self.metrics_processor.process_record(result, record_dir, pipeline_type=self._pipeline_type)

# Get agent instructions and tools from config
agent_instructions = self._agent_config["instructions"]
Expand Down
24 changes: 17 additions & 7 deletions src/eva/models/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import copy
import logging
from datetime import UTC, datetime
from enum import StrEnum
from pathlib import Path
from typing import Annotated, Any, ClassVar, Literal

Expand Down Expand Up @@ -171,6 +172,14 @@ def pipeline_parts(self) -> dict[str, str]:
_AUDIO_LLM_FIELDS = {"audio_llm", "audio_llm_params", "tts", "tts_params"}


class PipelineType(StrEnum):
    """Type of voice pipeline.

    ``AUDIO_LLM`` and ``S2S`` are the audio-native pipelines (the model consumes
    raw audio); ``CASCADE`` is the text pipeline whose LLM processes transcribed
    user text.
    """

    CASCADE = "cascade"  # transcription-based pipeline (not audio-native)
    AUDIO_LLM = "audio_llm"  # audio-in LLM with a separate TTS stage
    S2S = "s2s"  # speech-to-speech (legacy configs call this realtime_model)


def _model_config_discriminator(data: Any) -> str:
"""Discriminate which pipeline config type to use based on unique fields."""
if isinstance(data, dict):
Expand All @@ -186,21 +195,22 @@ def _model_config_discriminator(data: Any) -> str:
return "pipeline"


def get_pipeline_type(model_data: dict | Any) -> PipelineType:
    """Return the pipeline type for the given model config.

    Works with both raw dicts (e.g. from config.json) and parsed model config objects.
    Also handles legacy configs where ``realtime_model`` was stored alongside
    ``llm_model`` in a flat dict (before the discriminated-union refactor).
    Returns ``PipelineType.CASCADE`` for configs that match no audio-native mode.
    """
    mode = _model_config_discriminator(model_data)
    if mode == "s2s":
        return PipelineType.S2S
    if mode == "audio_llm":
        return PipelineType.AUDIO_LLM
    # Legacy: realtime_model was a sibling of llm_model before the union split
    if isinstance(model_data, dict) and model_data.get("realtime_model"):
        return PipelineType.S2S
    return PipelineType.CASCADE


def _strip_other_mode_fields(data: dict) -> dict:
Expand Down
90 changes: 0 additions & 90 deletions src/eva/utils/log_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -346,93 +346,3 @@ def filter_empty_responses(logs: list[dict]) -> list[dict]:
else:
filtered.append(log)
return filtered


def group_consecutive_logs_by_speaker(elevenlabs_logs: list[dict]) -> list[dict]:
    """Group consecutive transcripts/responses from the same speaker in elevenlabs logs.

    After filtering empty responses, if there are consecutive transcripts/responses
    from the same speaker (ignoring non-speech events such as audio events in
    between), merge them into a single log entry. The merged entry keeps the first
    fragment's ``type``, ``timestamp``, ``sequence`` and ``source`` and joins the
    fragment texts with a single space.

    This handles cases where ElevenLabs splits a single utterance into multiple
    transcripts (e.g., "One moment" followed by "Thank you..." with audio events
    in between).

    Non-speech events are kept in their original positions; each merged utterance
    is emitted at the position of its first fragment. (The previous implementation
    only skipped *adjacent* same-speaker speech logs when rebuilding the list, so
    merged fragments separated by non-speech events caused later grouped
    transcripts to be emitted at the wrong positions or dropped.)
    """
    if not elevenlabs_logs:
        return []

    def is_speech_event(log: dict) -> bool:
        """Check if log is a speech event."""
        return log.get("type") in ["assistant_speech", "user_speech"]

    def get_text(log: dict) -> str:
        """Extract text from log."""
        return log.get("data", {}).get("text", "")

    # First pass: group indices of consecutive same-speaker speech events.
    # The speaker is identified by the log type ("assistant_speech" or
    # "user_speech"); non-speech events between fragments do not break a group.
    groups: list[list[int]] = []
    for idx, log in enumerate(elevenlabs_logs):
        if not is_speech_event(log):
            continue
        if groups and elevenlabs_logs[groups[-1][0]].get("type") == log.get("type"):
            groups[-1].append(idx)
        else:
            groups.append([idx])

    # Build the merged entry for each group, keyed by the index of its first
    # fragment; record the later fragments so they can be dropped in the rebuild.
    merged_at: dict[int, dict] = {}
    absorbed: set[int] = set()
    for group in groups:
        first = elevenlabs_logs[group[0]]
        merged_at[group[0]] = {
            "type": first.get("type"),
            "timestamp": first.get("timestamp"),
            "sequence": first.get("sequence"),
            "data": {
                "text": " ".join(get_text(elevenlabs_logs[i]) for i in group),
                "source": first.get("data", {}).get("source"),
            },
        }
        absorbed.update(group[1:])

    # Second pass: rebuild the full log list. Non-speech events stay as-is; each
    # group is replaced by its merged entry at the position of its first fragment.
    result = []
    for idx, log in enumerate(elevenlabs_logs):
        if idx in merged_at:
            result.append(merged_at[idx])
        elif idx not in absorbed:
            result.append(log)
    return result
Loading
Loading