75 changes: 75 additions & 0 deletions configs/prompts/judge.yaml
@@ -829,6 +829,81 @@ judge:
],
"explanation": "<string: overall summary of fidelity assessment>"
}}
s2s_user_prompt: |
You are an expert evaluator checking the **speech clarity and articulation** of entities spoken by an AI voice agent.

You will receive:
1. A conversation trace showing what the user said and what data the agent retrieved via tools. Assistant responses are redacted — you must listen to the audio to hear what the agent actually said.
2. An audio recording of the agent's side of the conversation only (the user is not audible).

## Conversation Trace
{conversation_trace_formatted}

## IMPORTANT: What This Metric Measures

This metric measures **speech fidelity** — whether entities are clearly and correctly articulated in the audio. The conversation trace is provided so you know which entities to listen for, NOT so you can judge whether the agent gave the right answer.

**This is NOT a faithfulness or correctness metric.** Do NOT evaluate:
- Whether the agent used the right entity from a tool response (e.g., agent says "$315" but tool says $300 — this is a faithfulness issue, NOT a speech fidelity issue)
- Whether the agent fabricated or hallucinated information not in the trace
- Whether the agent omitted information it should have mentioned
- Whether the agent's response is logical, helpful, or correct

**What this metric DOES evaluate:**
When the agent speaks an entity that appears in the conversation trace (user utterances or tool responses), is it **clearly articulated** in the audio? Specifically:
- Can you clearly hear the entity as spoken?
- Does the spoken form sound like the correct entity, or is it garbled, mispronounced, or distorted?
- If the agent spells out a code letter by letter, is each letter/digit clearly distinguishable?

## Entity Categories to Listen For
- Confirmation codes (e.g., ZK3FFW, FAR0UM) — especially when spelled out letter by letter
- Flight numbers (e.g., SkyWay 410, SW302)
- Dollar amounts (e.g., $15, $1,285.00) — "fifteen" vs "fifty" matters
- Seat numbers (e.g., 21C, 14A)
- Reference/voucher IDs (e.g., REF-8JVSDF-001) — verify each segment is distinguishable
- Times (e.g., 3:55 PM, 10:30 AM)
- Dates (e.g., March 25th, February 3rd)
- Names (e.g., Mr. Rivera, Rodriguez)

## Examples

**High fidelity (rating = 1):**
- Tool response contains confirmation code "YTM924". Agent says "Y T M nine two four" — each character is clearly audible. ✓
- User says "last name Patel". Agent says "Patel" — clearly articulated. ✓
- Tool response says fare is $300. Agent says "$315" — the amount is clearly spoken even though it doesn't match the tool response. This is a faithfulness issue, not a speech fidelity issue. Rate 1. ✓
- Agent mentions "Dallas" which is not in the tool response — this is a hallucination, not a speech issue. Rate 1. ✓

**Low fidelity (rating = 0):**
- Tool response contains "YTM924". Agent tries to spell it out but audio sounds like "Y T N nine two four" — "M" sounds like "N". ✗
- Agent says a dollar amount but the audio is garbled and you cannot tell if it's "fifty" or "fifteen". ✗
- Agent spells a code but skips or slurs a letter so the spoken code has fewer characters than expected. ✗

**What to ignore (does NOT cause rating = 0):**
- Entities the agent mentions that are NOT in the conversation trace — do not evaluate these
- Minor pronunciation variations that do not change identity (e.g., "Ms." vs "Miss")
- Filler words, phrasing, word choice, sentence structure
- Slight pacing or prosody differences

## Rating Scale (per turn)
- **1 (High Fidelity)**: Every entity from the conversation trace that the agent speaks in this turn is clearly and correctly articulated.
- **0 (Low Fidelity)**: One or more entities from the conversation trace are garbled, mispronounced, or indistinguishable in the audio.

If the assistant does not speak any entities from the conversation trace in a turn (e.g., a greeting, filler, or turn where it only mentions entities not in the trace), set `has_entities` to false. These turns are excluded from scoring.

## Response Format
Respond with a JSON object. Each turn entry must include the turn_id matching the turn number shown in the Conversation Trace above:
{{
"turns": [
{{
"turn_id": <int: the turn number from the Conversation Trace>,
"transcript": <string: your transcription of the audio for this turn; use only the audio for this, not the conversation trace>,
"has_entities": <boolean: true if the assistant speaks entities from the conversation trace in this turn, false otherwise>,
"explanation": "<string: 1-3 sentence analysis listing which trace entities were spoken and whether they are clearly articulated>",
"rating": <0 or 1>
}}
],
"explanation": "<string: overall summary of speech fidelity assessment>"
}}

user_speech_fidelity:
user_prompt: |
2 changes: 2 additions & 0 deletions src/eva/metrics/accuracy/__init__.py
@@ -1,11 +1,13 @@
"""Task completion metrics - measuring whether the agent accomplished the user's goal."""

from . import agent_speech_fidelity # noqa
from . import agent_speech_fidelity_s2s # noqa
from . import faithfulness # noqa
from . import task_completion # noqa

__all__ = [
"agent_speech_fidelity",
"agent_speech_fidelity_s2s",
"faithfulness",
"task_completion",
]
239 changes: 239 additions & 0 deletions src/eva/metrics/accuracy/agent_speech_fidelity_s2s.py
@@ -0,0 +1,239 @@
"""Agent speech fidelity metric for S2S models — entity-focused evaluation.

For S2S (speech-to-speech) models, there is no intended text to compare against.
Instead, this metric verifies that key entities spoken by the agent (from tool
responses and user utterances) are accurate by sending a redacted conversation
trace alongside the agent audio to Gemini.
"""

import json
from typing import Any

from eva.metrics.base import MetricContext
from eva.metrics.speech_fidelity_base import SpeechFidelityBaseMetric
from eva.metrics.utils import aggregate_per_turn_scores, normalize_rating, resolve_turn_id
from eva.models.results import MetricScore


class AgentSpeechFidelityS2SMetric(SpeechFidelityBaseMetric):
"""Audio-based entity fidelity metric for S2S agent speech.

Evaluates whether key entities (from tool responses and user utterances) are
spoken correctly by the agent, without requiring intended text.

Rating scale: 0 (entity error) or 1 (all entities accurate)
"""

name = "agent_speech_fidelity"
description = "Audio-based evaluation of agent entity fidelity for S2S models"
category = "accuracy"
role = "assistant"
rating_scale = (0, 1)
pass_at_k_threshold = 0.95

async def compute(self, context: MetricContext) -> MetricScore:
"""Compute entity fidelity score using redacted conversation trace + audio."""
try:
audio_segment = self.load_role_audio(context, self.role)
if audio_segment is None:
return MetricScore(
name=self.name,
score=0.0,
normalized_score=0.0,
error=f"No {self.role} audio file available",
)

redacted_trace = self._build_redacted_trace(context)
assistant_turn_ids = self._get_assistant_turn_ids(redacted_trace)

if not assistant_turn_ids:
return MetricScore(
name=self.name,
score=0.0,
normalized_score=0.0,
error="No assistant turns found in conversation trace",
)

num_turns = len(assistant_turn_ids)
trace_formatted = self._format_redacted_trace(redacted_trace)
audio_b64 = self.encode_audio_segment(audio_segment)

prompt = self.get_judge_prompt(
prompt_key="s2s_user_prompt",
conversation_trace_formatted=trace_formatted,
)

messages = self.create_audio_message(audio_b64, prompt)

per_turn_ratings: dict[int, int | None] = {}
per_turn_explanations: dict[int, str] = {}
per_turn_transcripts: dict[int, str] = {}
per_turn_normalized: dict[int, float] = {}
min_rating, max_rating = self.rating_scale
valid_ratings_range = list(range(min_rating, max_rating + 1))

response_text, turns = await self._call_and_parse(messages, context, audio_segment, prompt)

if response_text is None:
return MetricScore(
name=self.name,
score=0.0,
normalized_score=0.0,
error="No response from judge",
)

self.logger.debug(f"Raw judge response: {response_text[:200]}")

if len(turns) != num_turns:
self.logger.warning(
f"[{context.record_id}] Expected {num_turns} ratings for S2S entity fidelity, got {len(turns)}"
)

per_turn_has_entities: dict[int, bool] = {}

for response_item in turns:
turn_id = resolve_turn_id(response_item, assistant_turn_ids, self.name)
if turn_id is None:
continue
rating = response_item.get("rating")
transcript = response_item.get("transcript", "")
explanation = response_item.get("explanation", "")
has_entities = response_item.get("has_entities", True)

per_turn_has_entities[turn_id] = has_entities

if not has_entities:
# Exclude turns with no entities from scoring
per_turn_ratings[turn_id] = rating
per_turn_explanations[turn_id] = explanation
per_turn_transcripts[turn_id] = transcript
continue

if rating not in valid_ratings_range:
self.logger.warning(f"[{context.record_id}] Invalid rating {rating} for turn {turn_id}")
per_turn_ratings[turn_id] = None
per_turn_explanations[turn_id] = f"Invalid rating: {rating}"
continue

per_turn_ratings[turn_id] = rating
per_turn_explanations[turn_id] = explanation
per_turn_transcripts[turn_id] = transcript
per_turn_normalized[turn_id] = normalize_rating(rating, min_rating, max_rating)

aggregated_score = aggregate_per_turn_scores(list(per_turn_normalized.values()), self.aggregation)

# Only count turns with entities toward the score
valid_ratings = [
per_turn_ratings[tid]
for tid in per_turn_ratings
if per_turn_ratings[tid] is not None and per_turn_has_entities.get(tid, True)
]
avg_rating = sum(valid_ratings) / len(valid_ratings) if valid_ratings else 0.0
num_skipped_no_entities = sum(1 for v in per_turn_has_entities.values() if not v)

details: dict[str, Any] = {
"variant": "s2s",
"aggregation": self.aggregation,
"num_turns": num_turns,
"num_evaluated": len(valid_ratings),
"num_skipped_no_entities": num_skipped_no_entities,
"per_turn_ratings": per_turn_ratings,
"per_turn_has_entities": per_turn_has_entities,
"per_turn_explanations": per_turn_explanations,
"judge_prompt": prompt,
"judge_raw_response": response_text,
}

return MetricScore(
name=self.name,
score=round(avg_rating, 3),
normalized_score=round(aggregated_score, 3) if aggregated_score is not None else 0,
details=details,
error="Aggregation failed" if aggregated_score is None else None,
)

except Exception as e:
return self._handle_error(e, context)

@staticmethod
def _build_redacted_trace(context: MetricContext) -> list[dict]:
"""Build a redacted conversation trace for entity fidelity evaluation.

Keeps user entries and tool responses as-is (entity sources).
Replaces assistant entries with a single placeholder per turn_id
(a turn can have multiple assistant entries, e.g. before/after tool calls).
Drops tool_call entries (parameters, not entity sources).

Note: conversation trace entries use different schemas by type:
- user/assistant entries have ``role`` + ``content``
- tool entries have ``type`` (tool_call/tool_response) + ``tool_name`` + data fields
"""
redacted = []
seen_assistant_turns: set[int] = set()
for entry in context.conversation_trace or []:
role = entry.get("role")
entry_type = entry.get("type")

if role == "assistant":
turn_id = entry.get("turn_id")
if turn_id not in seen_assistant_turns:
seen_assistant_turns.add(turn_id)
redacted.append(
{
"role": "assistant",
"turn_id": turn_id,
"redacted": True,
}
)
elif role == "user":
redacted.append(
{
"role": "user",
"content": entry.get("content", ""),
"turn_id": entry.get("turn_id"),
}
)
elif entry_type == "tool_response":
redacted.append(
{
"role": "tool_response",
"tool_name": entry.get("tool_name", "unknown"),
"content": entry.get("tool_response", {}),
"turn_id": entry.get("turn_id"),
}
)
# Skip tool_call entries — parameters are not entity sources

return redacted

@staticmethod
def _get_assistant_turn_ids(redacted_trace: list[dict]) -> list[int]:
"""Extract sorted unique assistant turn IDs from the redacted trace."""
turn_ids = set()
for entry in redacted_trace:
if entry.get("role") == "assistant" and entry.get("turn_id") is not None:
turn_ids.add(entry["turn_id"])
return sorted(turn_ids)

@staticmethod
def _format_redacted_trace(redacted_trace: list[dict]) -> str:
"""Format the redacted trace as text for the prompt."""
lines = []
for entry in redacted_trace:
turn_id = entry.get("turn_id", "?")
role = entry["role"]

if role == "user":
lines.append(f"Turn {turn_id} - User: {entry['content']}")
elif role == "assistant":
lines.append(f"Turn {turn_id} - [Assistant speaks]")
elif role == "tool_response":
tool_name = entry.get("tool_name", "unknown")
content = entry.get("content", {})
if isinstance(content, (dict, list)):
content_str = json.dumps(content, indent=None)
else:
content_str = str(content)
lines.append(f"Turn {turn_id} - Tool Response ({tool_name}): {content_str}")

return "\n".join(lines)
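The redaction and formatting above can be exercised on a toy trace. A standalone sketch mirroring the behavior of `_build_redacted_trace` and `_format_redacted_trace` (simplified, not the eva classes themselves): user and tool_response entries keep their content, assistant entries collapse to one placeholder per turn, and tool_call entries are dropped.

```python
import json


def format_redacted(trace: list[dict]) -> str:
    """Collapse assistant turns to placeholders; keep entity sources."""
    lines, seen_assistant_turns = [], set()
    for entry in trace:
        turn_id = entry.get("turn_id", "?")
        if entry.get("role") == "user":
            lines.append(f"Turn {turn_id} - User: {entry['content']}")
        elif entry.get("role") == "assistant":
            # One placeholder per turn, even across multiple assistant entries
            if turn_id not in seen_assistant_turns:
                seen_assistant_turns.add(turn_id)
                lines.append(f"Turn {turn_id} - [Assistant speaks]")
        elif entry.get("type") == "tool_response":
            content = json.dumps(entry.get("tool_response", {}))
            name = entry.get("tool_name", "unknown")
            lines.append(f"Turn {turn_id} - Tool Response ({name}): {content}")
        # tool_call entries fall through and are dropped
    return "\n".join(lines)


trace = [
    {"role": "user", "turn_id": 1, "content": "My code is YTM924."},
    {"type": "tool_call", "turn_id": 1, "tool_name": "lookup"},
    {"type": "tool_response", "turn_id": 1, "tool_name": "lookup",
     "tool_response": {"fare": "$300"}},
    {"role": "assistant", "turn_id": 1, "content": "first reply"},
    {"role": "assistant", "turn_id": 1, "content": "second reply"},
]
print(format_redacted(trace))
```

Feeding this trace through produces three lines: the user utterance, the tool response, and a single `[Assistant speaks]` placeholder, so the judge sees the entities to listen for without seeing the agent's text.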
8 changes: 8 additions & 0 deletions src/eva/metrics/runner.py
@@ -9,6 +9,7 @@

import yaml

from eva.metrics.accuracy.agent_speech_fidelity_s2s import AgentSpeechFidelityS2SMetric
from eva.metrics.aggregation import compute_record_aggregates, compute_run_level_aggregates
from eva.metrics.base import BaseMetric, MetricContext
from eva.metrics.processor import MetricsContextProcessor
@@ -118,6 +119,13 @@ def __init__(
else:
logger.warning(f"Metric '{name}' not found, skipping")

# For S2S pipelines, swap agent_speech_fidelity with entity-focused variant
if self._pipeline_type == PipelineType.S2S:
self.metrics = [
AgentSpeechFidelityS2SMetric(config=m.config) if m.name == "agent_speech_fidelity" else m
for m in self.metrics
]

logger.info(f"Metrics runner initialized with {len(self.metrics)} metrics")

def _load_agent_config(self) -> dict[str, Any]: