Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions docs/metric_context.md
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ Counts and flags computed during benchmark execution.
- `"transfer"`: Assistant transferred to live agent
- `"error"`: An error occurred
- **`duration_seconds: float`** - Total duration of the conversation in seconds.
- **`is_audio_native: bool`** - Whether this conversation used an audio-native architecture. Metrics should check this flag to adjust behavior (e.g., audio-native uses intended user text in conversation_trace).
- **`pipeline_type: PipelineType`** - The pipeline architecture used (`CASCADE`, `AUDIO_LLM`, or `S2S`). Access `context.is_audio_native` for a convenience boolean that returns `True` for both `AUDIO_LLM` and `S2S`.
- **`latency_assistant_turns: dict[int, float]`** - Per-turn latency in seconds (user speech end to assistant speech start), keyed by turn ID.

### File Paths
Expand Down Expand Up @@ -212,11 +212,11 @@ The LLM processes **transcribed text**, so `transcribed_user_turns` reflects wha

The model processes **raw audio**. The audit log may contain a transcript from the service's own secondary STT, but this is **not what the model used** — it's just for reference. This is why `transcribed_user_turns` is unreliable for audio-native models and `intended_user_turns` should be used instead.

Check `context.is_audio_native` (audio-native) to determine which mode was used.
Check `context.pipeline_type` to determine which mode was used, or `context.is_audio_native` for a boolean grouping of `S2S` and `AUDIO_LLM`.

### Writing Audio-Native-Aware Metrics

If your metric needs user text directly (rather than via `conversation_trace`, which handles this automatically), branch on `context.is_audio_native` (audio-native):
If your metric needs user text directly (rather than via `conversation_trace`, which handles this automatically), branch on `context.is_audio_native`:

```python
async def compute(self, context: MetricContext) -> MetricScore:
Expand Down
9 changes: 7 additions & 2 deletions src/eva/metrics/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
resolve_turn_id,
validate_rating,
)
from eva.models.config import PipelineType
from eva.models.results import MetricScore
from eva.utils.llm_client import LLMClient
from eva.utils.logging import get_logger
Expand Down Expand Up @@ -84,7 +85,7 @@ def __init__(
latency_assistant_turns: dict[int, float] | None = None,
assistant_interrupted_turns: set[int] | None = None,
user_interrupted_turns: set[int] | None = None,
is_audio_native: bool = False,
pipeline_type: PipelineType = PipelineType.CASCADE,
):
self.record_id = record_id

Expand Down Expand Up @@ -134,7 +135,11 @@ def __init__(
self.latency_assistant_turns = latency_assistant_turns or {}
self.assistant_interrupted_turns = assistant_interrupted_turns or set()
self.user_interrupted_turns = user_interrupted_turns or set()
self.is_audio_native = is_audio_native
self.pipeline_type = pipeline_type

@property
def is_audio_native(self) -> bool:
    """Convenience flag: True for the audio-native pipelines (S2S and AUDIO_LLM)."""
    audio_native_pipelines = {PipelineType.S2S, PipelineType.AUDIO_LLM}
    return self.pipeline_type in audio_native_pipelines

def to_dict(self) -> dict[str, Any]:
"""Convert MetricContext to a serializable dictionary."""
Expand Down
95 changes: 62 additions & 33 deletions src/eva/metrics/processor.py

Large diffs are not rendered by default.

8 changes: 3 additions & 5 deletions src/eva/metrics/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from eva.metrics.base import BaseMetric, MetricContext
from eva.metrics.processor import MetricsContextProcessor
from eva.metrics.registry import MetricRegistry, get_global_registry
from eva.models.config import is_audio_native_pipeline
from eva.models.config import PipelineType, get_pipeline_type
from eva.models.record import EvaluationRecord
from eva.models.results import ConversationResult, MetricScore, PassAtKResult, RecordMetrics
from eva.utils.hash_utils import get_dict_hash
Expand Down Expand Up @@ -130,7 +130,7 @@ def _load_agent_config(self) -> dict[str, Any]:

# Determine pipeline type from config (fallback to CASCADE for legacy runs)
model_data = config_data.get("model", {})
self._is_audio_native = is_audio_native_pipeline(model_data) if model_data else False
self._pipeline_type = get_pipeline_type(model_data) if model_data else PipelineType.CASCADE

agent_config_path = config_data.get("agent_config_path")

Expand Down Expand Up @@ -429,9 +429,7 @@ def _load_context(self, record_id: str, record_dir: Path) -> MetricContext:
result = ConversationResult(**result_data)

# Use postprocessor to process logs and create enriched context
metrics_context = self.metrics_processor.process_record(
result, record_dir, is_audio_native=self._is_audio_native
)
metrics_context = self.metrics_processor.process_record(result, record_dir, pipeline_type=self._pipeline_type)

# Get agent instructions and tools from config
agent_instructions = self._agent_config["instructions"]
Expand Down
24 changes: 17 additions & 7 deletions src/eva/models/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import copy
import logging
from datetime import UTC, datetime
from enum import StrEnum
from pathlib import Path
from typing import Annotated, Any, ClassVar, Literal

Expand Down Expand Up @@ -171,6 +172,14 @@ def pipeline_parts(self) -> dict[str, str]:
_AUDIO_LLM_FIELDS = {"audio_llm", "audio_llm_params", "tts", "tts_params"}


class PipelineType(StrEnum):
    """Type of voice pipeline.

    ``AUDIO_LLM`` and ``S2S`` are the audio-native pipelines (the model consumes
    raw audio); ``CASCADE`` is the text pipeline whose LLM processes transcribed
    user text.
    """

    CASCADE = "cascade"  # transcription-based pipeline (not audio-native)
    AUDIO_LLM = "audio_llm"  # audio-in LLM with a separate TTS stage
    S2S = "s2s"  # speech-to-speech (legacy configs call this realtime_model)


def _model_config_discriminator(data: Any) -> str:
"""Discriminate which pipeline config type to use based on unique fields."""
if isinstance(data, dict):
Expand All @@ -186,21 +195,22 @@ def _model_config_discriminator(data: Any) -> str:
return "pipeline"


def get_pipeline_type(model_data: dict | Any) -> PipelineType:
    """Return the pipeline type for the given model config.

    Works with both raw dicts (e.g. from config.json) and parsed model config objects.
    Also handles legacy configs where ``realtime_model`` was stored alongside
    ``llm_model`` in a flat dict (before the discriminated-union refactor).
    Returns ``PipelineType.CASCADE`` for configs that match no audio-native mode.
    """
    mode = _model_config_discriminator(model_data)
    if mode == "s2s":
        return PipelineType.S2S
    if mode == "audio_llm":
        return PipelineType.AUDIO_LLM
    # Legacy: realtime_model was a sibling of llm_model before the union split
    if isinstance(model_data, dict) and model_data.get("realtime_model"):
        return PipelineType.S2S
    return PipelineType.CASCADE


def _strip_other_mode_fields(data: dict) -> dict:
Expand Down
90 changes: 0 additions & 90 deletions src/eva/utils/log_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -346,93 +346,3 @@ def filter_empty_responses(logs: list[dict]) -> list[dict]:
else:
filtered.append(log)
return filtered


def group_consecutive_logs_by_speaker(elevenlabs_logs: list[dict]) -> list[dict]:
    """Group consecutive transcripts/responses from the same speaker in elevenlabs logs.

    After filtering empty responses, if there are consecutive transcripts/responses
    from the same speaker (ignoring non-speech events such as audio events in
    between), merge them into a single log entry. The merged entry keeps the first
    fragment's ``type``, ``timestamp``, ``sequence`` and ``source`` and joins the
    fragment texts with a single space.

    This handles cases where ElevenLabs splits a single utterance into multiple
    transcripts (e.g., "One moment" followed by "Thank you..." with audio events
    in between).

    Non-speech events are kept in their original positions; each merged utterance
    is emitted at the position of its first fragment. (The previous implementation
    only skipped *adjacent* same-speaker speech logs when rebuilding the list, so
    merged fragments separated by non-speech events caused later grouped
    transcripts to be emitted at the wrong positions or dropped.)
    """
    if not elevenlabs_logs:
        return []

    def is_speech_event(log: dict) -> bool:
        """Check if log is a speech event."""
        return log.get("type") in ["assistant_speech", "user_speech"]

    def get_text(log: dict) -> str:
        """Extract text from log."""
        return log.get("data", {}).get("text", "")

    # First pass: group indices of consecutive same-speaker speech events.
    # The speaker is identified by the log type ("assistant_speech" or
    # "user_speech"); non-speech events between fragments do not break a group.
    groups: list[list[int]] = []
    for idx, log in enumerate(elevenlabs_logs):
        if not is_speech_event(log):
            continue
        if groups and elevenlabs_logs[groups[-1][0]].get("type") == log.get("type"):
            groups[-1].append(idx)
        else:
            groups.append([idx])

    # Build the merged entry for each group, keyed by the index of its first
    # fragment; record the later fragments so they can be dropped in the rebuild.
    merged_at: dict[int, dict] = {}
    absorbed: set[int] = set()
    for group in groups:
        first = elevenlabs_logs[group[0]]
        merged_at[group[0]] = {
            "type": first.get("type"),
            "timestamp": first.get("timestamp"),
            "sequence": first.get("sequence"),
            "data": {
                "text": " ".join(get_text(elevenlabs_logs[i]) for i in group),
                "source": first.get("data", {}).get("source"),
            },
        }
        absorbed.update(group[1:])

    # Second pass: rebuild the full log list. Non-speech events stay as-is; each
    # group is replaced by its merged entry at the position of its first fragment.
    result = []
    for idx, log in enumerate(elevenlabs_logs):
        if idx in merged_at:
            result.append(merged_at[idx])
        elif idx not in absorbed:
            result.append(log)
    return result
Loading
Loading