diff --git a/.env.example b/.env.example index d11adaaa..ec0dcf06 100644 --- a/.env.example +++ b/.env.example @@ -109,16 +109,16 @@ EVA_MODEL_LIST='[ # Uncomment and add to EVA_MODEL_LIST above as needed. # # Azure OpenAI (alternative to direct OpenAI): -# { -# "model_name": "gpt-5.2", -# "litellm_params": { -# "model": "azure/gpt-5.2", -# "api_key": "os.environ/AZURE_OPENAI_API_KEY", -# "api_base": "https://your-resource.openai.azure.com", -# "max_parallel_requests": 5 -# }, -# "model_info": {"base_model": "gpt-5.2"} -# } +# { +# "model_name": "gpt-5.2", +# "litellm_params": { +# "model": "azure/gpt-5.2", +# "api_key": "os.environ/AZURE_OPENAI_API_KEY", +# "api_base": "https://your-resource.openai.azure.com", +# "max_parallel_requests": 5 +# }, +# "model_info": {"base_model": "gpt-5.2"} +# } # # Self-hosted model (e.g., vLLM, NVIDIA NIM): # { diff --git a/.gitignore b/.gitignore index 4420978f..e9ef7669 100644 --- a/.gitignore +++ b/.gitignore @@ -84,6 +84,7 @@ creds/ # Claude Code .claude/ claude_only_docs/ +CLAUDE.md # Local folders local/ diff --git a/Dockerfile b/Dockerfile index 12f1527b..3566c4af 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,7 +4,7 @@ # ============================================ # Stage 1: Builder # ============================================ -FROM python:3.11-slim as builder +FROM python:3.11-slim AS builder WORKDIR /app @@ -27,7 +27,7 @@ RUN pip install --no-cache-dir --upgrade pip && \ # ============================================ # Stage 2: Runtime # ============================================ -FROM python:3.11-slim as runtime +FROM python:3.11-slim AS runtime WORKDIR /app diff --git a/configs/prompts/simulation.yaml b/configs/prompts/simulation.yaml index 9dc36cb6..6914346a 100644 --- a/configs/prompts/simulation.yaml +++ b/configs/prompts/simulation.yaml @@ -125,7 +125,7 @@ audio_llm_agent: realtime_agent: system_prompt: | - You are a friendly voice assistant. + You are an AI voice assistant on a live phone call. 
Call the appropriate function to process the user's input. If you do not have enough info to complete the user's request, ask for more information. Call the tool as many times as you need until the user's task is complete. Call the tool as quickly as possible. diff --git a/docs/metric_context.md b/docs/metric_context.md index dbcabb06..62778402 100644 --- a/docs/metric_context.md +++ b/docs/metric_context.md @@ -282,9 +282,9 @@ Minor discrepancies are still possible (turn IDs off by one, audio timestamps no Benchmark Execution: ├─ EvaluationRecord (dataset.jsonl) │ ├─ user_goal, user_persona, scenario_db → MetricContext - │ └─ Feeds to AssistantServer + UserSimulator + │ └─ Feeds to PipecatAssistantServer + UserSimulator │ - ├─ AssistantServer writes: + ├─ PipecatAssistantServer writes: │ ├─ audit_log.json (tool calls, user/assistant turns) │ ├─ pipecat_events.jsonl (TTS text, turn boundaries) │ ├─ response_latencies.json (response speed data) diff --git a/src/eva/assistant/audio_bridge.py b/src/eva/assistant/audio_bridge.py new file mode 100644 index 00000000..496d4b04 --- /dev/null +++ b/src/eva/assistant/audio_bridge.py @@ -0,0 +1,280 @@ +"""Shared audio bridge utilities for framework-specific assistant servers. + +All framework servers need to: +1. Accept Twilio-framed WebSocket connections from the user simulator +2. Convert audio between Twilio's mulaw 8kHz and the framework's native format +3. Write framework_logs.jsonl with timestamped events + +This module provides the common infrastructure. 
+""" + +import audioop +import base64 +import json +import struct +import time +from pathlib import Path + +import numpy as np +import soxr + +from eva.utils.logging import get_logger + +logger = get_logger(__name__) + + +# ── Audio format conversion ────────────────────────────────────────── + + +def mulaw_8k_to_pcm16_16k(mulaw_bytes: bytes) -> bytes: + """Convert 8kHz mu-law audio to 16kHz 16-bit PCM.""" + # Decode mu-law to 16-bit PCM at 8kHz + pcm_8k = audioop.ulaw2lin(mulaw_bytes, 2) + # Upsample from 8kHz to 16kHz + pcm_16k, _ = audioop.ratecv(pcm_8k, 2, 1, 8000, 16000, None) + return pcm_16k + + +def mulaw_8k_to_pcm16_24k(mulaw_bytes: bytes) -> bytes: + """Convert 8kHz mu-law audio to 24kHz 16-bit PCM.""" + # Decode mu-law to 16-bit PCM at 8kHz + pcm_8k = audioop.ulaw2lin(mulaw_bytes, 2) + # Upsample from 8kHz to 24kHz + pcm_24k, _ = audioop.ratecv(pcm_8k, 2, 1, 8000, 24000, None) + # audioop.ratecv can produce ±2 samples; clamp to exact 3× input length + # so that the inverse conversion recovers the original sample count. + expected_bytes = len(pcm_8k) * 3 + if len(pcm_24k) < expected_bytes: + pcm_24k = pcm_24k + b"\x00" * (expected_bytes - len(pcm_24k)) + elif len(pcm_24k) > expected_bytes: + pcm_24k = pcm_24k[:expected_bytes] + return pcm_24k + + +def pcm16_16k_to_mulaw_8k(pcm_bytes: bytes) -> bytes: + """Convert 16kHz 16-bit PCM to 8kHz mu-law.""" + # Downsample from 16kHz to 8kHz + pcm_8k, _ = audioop.ratecv(pcm_bytes, 2, 1, 16000, 8000, None) + # Encode to mu-law + return audioop.lin2ulaw(pcm_8k, 2) + + +def pcm16_24k_to_mulaw_8k(pcm_bytes: bytes) -> bytes: + """Convert 24kHz 16-bit PCM to 8kHz mu-law. + + Uses soxr VHQ resampling (same as Pipecat) for proper anti-aliasing during the 3:1 downsampling. + audioop.ratecv produces muffled audio because it lacks an anti-aliasing filter. 
+ """ + # Downsample from 24kHz to 8kHz using high-quality resampler + audio_data = np.frombuffer(pcm_bytes, dtype=np.int16) + resampled = soxr.resample(audio_data, 24000, 8000, quality="VHQ") + # Both audioop.ratecv (upstream) and soxr can produce ±1 sample due to filter rounding. + # Use round() so that e.g. 2399 input samples → round(2399/3) = 800, not 799. + expected_samples = round(len(audio_data) * 8000 / 24000) + if len(resampled) < expected_samples: + resampled = np.pad(resampled, (0, expected_samples - len(resampled))) + elif len(resampled) > expected_samples: + resampled = resampled[:expected_samples] + pcm_8k = resampled.astype(np.int16).tobytes() + # Encode to mu-law + return audioop.lin2ulaw(pcm_8k, 2) + + +def sync_buffer_to_position(buffer: bytearray, target_position: int) -> None: + """Pad *buffer* with silence bytes so it reaches *target_position*. + + Mirrors pipecat's ``AudioBufferProcessor._sync_buffer_to_position``. + Call this **before** extending the *other* track so both tracks stay + positionally aligned. + """ + current_len = len(buffer) + if current_len < target_position: + buffer.extend(b"\x00" * (target_position - current_len)) + + +def pcm16_mix(track_a: bytes, track_b: bytes) -> bytes: + """Mix two 16-bit PCM tracks by sample-wise addition with clipping. + + Both tracks must be the same sample rate. If lengths differ, + the shorter track is zero-padded. 
+ """ + len_a, len_b = len(track_a), len(track_b) + max_len = max(len_a, len_b) + + # Zero-pad shorter track + if len_a < max_len: + track_a = track_a + b"\x00" * (max_len - len_a) + if len_b < max_len: + track_b = track_b + b"\x00" * (max_len - len_b) + + # Mix with clipping + n_samples = max_len // 2 + fmt = f"<{n_samples}h" + samples_a = struct.unpack(fmt, track_a) + samples_b = struct.unpack(fmt, track_b) + mixed = struct.pack(fmt, *(max(-32768, min(32767, a + b)) for a, b in zip(samples_a, samples_b))) + return mixed + + +# ── Twilio WebSocket Protocol ──────────────────────────────────────── + + +def parse_twilio_media_message(message: str) -> bytes | None: + """Parse a Twilio media WebSocket message and extract raw audio bytes. + + Returns None if the message is not a media message. + """ + try: + data = json.loads(message) + if data.get("event") == "media": + payload = data["media"]["payload"] + return base64.b64decode(payload) + except (json.JSONDecodeError, KeyError): + pass + return None + + +def create_twilio_media_message(stream_sid: str, audio_bytes: bytes) -> str: + """Create a Twilio media WebSocket message with the given audio bytes.""" + payload = base64.b64encode(audio_bytes).decode("ascii") + return json.dumps( + { + "event": "media", + "streamSid": stream_sid, + "media": { + "payload": payload, + }, + } + ) + + +def create_twilio_start_response(stream_sid: str) -> str: + """Create a Twilio 'start' event response.""" + return json.dumps( + { + "event": "start", + "streamSid": stream_sid, + "start": { + "streamSid": stream_sid, + "mediaFormat": { + "encoding": "audio/x-mulaw", + "sampleRate": 8000, + "channels": 1, + }, + }, + } + ) + + +# ── Framework Logs Writer ──────────────────────────────────────────── + + +class FrameworkLogWriter: + """Writes framework_logs.jsonl (replacement for pipecat_logs.jsonl). + + Captures turn boundaries, TTS text, and LLM responses with accurate + wall-clock timestamps. 
+ """ + + def __init__(self, output_dir: Path): + self.log_file = output_dir / "framework_logs.jsonl" + output_dir.mkdir(parents=True, exist_ok=True) + + def write(self, event_type: str, data: dict, timestamp_ms: int | None = None) -> None: + """Write a single log entry. + + Args: + event_type: One of 'turn_start', 'turn_end', 'tts_text', 'llm_response' + data: Event data dict. Must contain a 'frame' key for tts_text/llm_response. + timestamp_ms: Wall-clock timestamp in milliseconds. Defaults to now. + """ + if timestamp_ms is None: + timestamp_ms = int(time.time() * 1000) + + entry = { + "timestamp": timestamp_ms, + "type": event_type, + "data": data, + } + try: + with open(self.log_file, "a", encoding="utf-8") as f: + f.write(json.dumps(entry, ensure_ascii=False) + "\n") + except Exception as e: + logger.error(f"Error writing framework log: {e}") + + def turn_start(self, timestamp_ms: int | None = None) -> None: + """Log a turn start event.""" + self.write("turn_start", {"frame": "turn_start"}, timestamp_ms) + + def turn_end(self, was_interrupted: bool = False, timestamp_ms: int | None = None) -> None: + """Log a turn end event.""" + self.write("turn_end", {"frame": "turn_end", "was_interrupted": was_interrupted}, timestamp_ms) + + def tts_text(self, text: str, timestamp_ms: int | None = None) -> None: + """Log TTS text (what was actually spoken).""" + self.write("tts_text", {"frame": text}, timestamp_ms) + + def llm_response(self, text: str, timestamp_ms: int | None = None) -> None: + """Log LLM response text (full intended response).""" + self.write("llm_response", {"frame": text}, timestamp_ms) + + +# ── Metrics Log Writer ─────────────────────────────────────────────── + + +class MetricsLogWriter: + """Writes pipecat_metrics.jsonl equivalent for non-pipecat frameworks.""" + + def __init__(self, output_dir: Path): + self.log_file = output_dir / "pipecat_metrics.jsonl" + output_dir.mkdir(parents=True, exist_ok=True) + + def write_processing_metric(self, 
processor: str, value_seconds: float, model: str = "") -> None: + """Write a ProcessingMetricsData entry (e.g., for STT latency).""" + entry = { + "timestamp": int(time.time() * 1000), + "type": "ProcessingMetricsData", + "processor": processor, + "model": model, + "value": value_seconds, + } + self._append(entry) + + def write_ttfb_metric(self, processor: str, value_seconds: float, model: str = "") -> None: + """Write a TTFBMetricsData entry (e.g., for TTS time-to-first-byte).""" + entry = { + "timestamp": int(time.time() * 1000), + "type": "TTFBMetricsData", + "processor": processor, + "model": model, + "value": value_seconds, + } + self._append(entry) + + def write_token_usage( + self, + processor: str, + model: str, + prompt_tokens: int, + completion_tokens: int, + ) -> None: + """Write an LLMTokenUsageMetricsData entry.""" + entry = { + "timestamp": int(time.time() * 1000), + "type": "LLMTokenUsageMetricsData", + "processor": processor, + "model": model, + "value": { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": prompt_tokens + completion_tokens, + }, + } + self._append(entry) + + def _append(self, entry: dict) -> None: + try: + with open(self.log_file, "a", encoding="utf-8") as f: + f.write(json.dumps(entry) + "\n") + except Exception as e: + logger.error(f"Error writing metrics log: {e}") diff --git a/src/eva/assistant/base_server.py b/src/eva/assistant/base_server.py new file mode 100644 index 00000000..fd7227f6 --- /dev/null +++ b/src/eva/assistant/base_server.py @@ -0,0 +1,211 @@ +"""Abstract base class for assistant server implementations. + +All framework-specific assistant servers (Pipecat, OpenAI Realtime, Gemini Live, etc.) +must inherit from AbstractAssistantServer and implement the required interface. + +See docs/assistant_server_contract.md for the full specification. 
+""" + +import json +import wave +from abc import ABC, abstractmethod +from pathlib import Path +from typing import Any + +from eva.assistant.agentic.audit_log import AuditLog +from eva.assistant.tools.tool_executor import ToolExecutor +from eva.models.agents import AgentConfig +from eva.models.config import AudioLLMConfig, PipelineConfig, SpeechToSpeechConfig +from eva.utils.logging import get_logger + +logger = get_logger(__name__) + +INITIAL_MESSAGE = "Hello! How can I help you today?" + + +class AbstractAssistantServer(ABC): + """Base class for all assistant server implementations. + + Each implementation must: + 1. Expose a WebSocket endpoint at ws://localhost:{port}/ws with Twilio frame format + 2. Bridge audio between the user simulator and the framework's native format + 3. Execute tool calls via the local ToolExecutor + 4. Produce all required output files (audit_log.json, framework_logs.jsonl, audio, etc.) + 5. Populate the AuditLog with conversation events + """ + + def __init__( + self, + current_date_time: str, + pipeline_config: PipelineConfig | SpeechToSpeechConfig | AudioLLMConfig, + agent: AgentConfig, + agent_config_path: str, + scenario_db_path: str, + output_dir: Path, + port: int, + conversation_id: str, + ): + """Initialize the assistant server. 
+ + Args: + current_date_time: Current date/time string from the evaluation record + pipeline_config: Configuration for the model/pipeline + agent: Single agent configuration to use + agent_config_path: Path to agent YAML configuration + scenario_db_path: Path to scenario database JSON + output_dir: Directory for output files + port: Port to listen on + conversation_id: Unique ID for this conversation + """ + self.current_date_time = current_date_time + self.pipeline_config = pipeline_config + self.agent: AgentConfig = agent + self.agent_config_path = agent_config_path + self.scenario_db_path = scenario_db_path + self.output_dir = Path(output_dir) + self.port = port + self.conversation_id = conversation_id + + # Core components - all implementations must use these + self.audit_log = AuditLog() + self.tool_handler = ToolExecutor( + tool_config_path=agent_config_path, + scenario_db_path=scenario_db_path, + tool_module_path=self.agent.tool_module_path, + current_date_time=self.current_date_time, + ) + + # Audio buffers for recording + self._audio_buffer = bytearray() + self.user_audio_buffer = bytearray() + self.assistant_audio_buffer = bytearray() + self._audio_sample_rate: int = 24000 # Subclasses can override + + @abstractmethod + async def start(self) -> None: + """Start the server. + + Must be non-blocking (return after the server is ready to accept connections). + Must expose a WebSocket endpoint at ws://localhost:{port}/ws using FastAPI+uvicorn + with TwilioFrameSerializer for compatibility with the user simulator. + + The implementation must: + 1. Create a FastAPI app with /ws and / WebSocket endpoints + 2. Start a uvicorn server on the configured port + 3. Return once the server is accepting connections + """ + ... + + @abstractmethod + async def stop(self) -> None: + """Stop the server and save all outputs. + + Must: + 1. Gracefully shut down the server + 2. Call save_outputs() to persist all data + """ + ... 
+ + def get_conversation_stats(self) -> dict[str, Any]: + """Get statistics about the conversation. + + Returns dict with: num_turns, num_tool_calls, tools_called, etc. + """ + return self.audit_log.get_stats() + + def get_initial_scenario_db(self) -> dict[str, Any]: + """Get initial (pristine) scenario database state.""" + return self.tool_handler.original_db + + def get_final_scenario_db(self) -> dict[str, Any]: + """Get final (mutated) scenario database state.""" + return self.tool_handler.db + + # ── Shared output helpers ────────────────────────────────────────── + + async def save_outputs(self) -> None: + """Save all required output files. Called by stop(). + + Subclasses can override to add framework-specific outputs, + but must call super().save_outputs(). + """ + # Save audit log + self.audit_log.save(self.output_dir / "audit_log.json") + + # Save simplified transcript + transcript_path = self.output_dir / "transcript.jsonl" + self.audit_log.save_transcript_jsonl(transcript_path) + + # Save audio recordings + self._save_audio() + + # Save scenario database states (REQUIRED for deterministic metrics) + self._save_scenario_dbs() + + logger.info(f"Outputs saved to {self.output_dir}") + + def _save_audio(self) -> None: + """Save accumulated audio buffers to WAV files. + + If _audio_buffer (mixed) is empty but user and assistant buffers are + available, compute mixed audio automatically via sample-wise addition. 
+ """ + # Auto-compute mixed audio from user + assistant tracks when not populated + if not self._audio_buffer and self.user_audio_buffer and self.assistant_audio_buffer: + from eva.assistant.audio_bridge import pcm16_mix + + self._audio_buffer = bytearray(pcm16_mix(bytes(self.user_audio_buffer), bytes(self.assistant_audio_buffer))) + elif not self._audio_buffer and self.user_audio_buffer: + self._audio_buffer = bytearray(self.user_audio_buffer) + elif not self._audio_buffer and self.assistant_audio_buffer: + self._audio_buffer = bytearray(self.assistant_audio_buffer) + + if self._audio_buffer: + self._save_wav_file( + bytes(self._audio_buffer), + self.output_dir / "audio_mixed.wav", + self._audio_sample_rate, + 1, + ) + if self.user_audio_buffer: + self._save_wav_file( + bytes(self.user_audio_buffer), + self.output_dir / "audio_user.wav", + self._audio_sample_rate, + 1, + ) + if self.assistant_audio_buffer: + self._save_wav_file( + bytes(self.assistant_audio_buffer), + self.output_dir / "audio_assistant.wav", + self._audio_sample_rate, + 1, + ) + + def _save_wav_file(self, audio_data: bytes, file_path: Path, sample_rate: int, num_channels: int) -> None: + """Save raw 16-bit PCM audio data to a WAV file.""" + try: + with wave.open(str(file_path), "wb") as wav_file: + wav_file.setnchannels(num_channels) + wav_file.setsampwidth(2) # 16-bit PCM + wav_file.setframerate(sample_rate) + wav_file.writeframes(audio_data) + logger.debug(f"Audio saved to {file_path} ({len(audio_data)} bytes)") + except Exception as e: + logger.error(f"Error saving audio to {file_path}: {e}") + + def _save_scenario_dbs(self) -> None: + """Save initial and final scenario database states.""" + try: + initial_db_path = self.output_dir / "initial_scenario_db.json" + with open(initial_db_path, "w") as f: + json.dump(self.get_initial_scenario_db(), f, indent=2, sort_keys=True, default=str) + + final_db_path = self.output_dir / "final_scenario_db.json" + with open(final_db_path, "w") as f: + 
json.dump(self.get_final_scenario_db(), f, indent=2, sort_keys=True, default=str) + + logger.info(f"Saved scenario database states to {self.output_dir}") + except Exception as e: + logger.error(f"Error saving scenario database states: {e}", exc_info=True) + raise diff --git a/src/eva/assistant/gemini_live_server.py b/src/eva/assistant/gemini_live_server.py new file mode 100644 index 00000000..9622ea65 --- /dev/null +++ b/src/eva/assistant/gemini_live_server.py @@ -0,0 +1,660 @@ +"""Gemini Live AssistantServer for EVA-Bench. + +Bridges between Twilio-framed WebSocket (user simulator) and Google's Gemini Live +API via the google-genai Python SDK. Audio flows: + + User simulator (8 kHz mulaw) + -> 16 kHz PCM16 -> Gemini Live input + Gemini Live output (24 kHz PCM16) + -> 8 kHz mulaw -> User simulator + +All tool calls are executed locally via ToolExecutor; transcription events +from Gemini populate the audit log. +""" + +from __future__ import annotations + +import asyncio +import json +import os +import time +from pathlib import Path +from typing import Any + +import uvicorn +from fastapi import FastAPI, WebSocket, WebSocketDisconnect +from google import genai +from google.genai import types + +from eva.assistant.audio_bridge import ( + FrameworkLogWriter, + MetricsLogWriter, + create_twilio_media_message, + mulaw_8k_to_pcm16_16k, + mulaw_8k_to_pcm16_24k, + parse_twilio_media_message, + pcm16_24k_to_mulaw_8k, + sync_buffer_to_position, +) +from eva.assistant.base_server import INITIAL_MESSAGE, AbstractAssistantServer +from eva.models.agents import AgentConfig +from eva.models.config import AudioLLMConfig, PipelineConfig, SpeechToSpeechConfig +from eva.utils.logging import get_logger +from eva.utils.prompt_manager import PromptManager + +logger = get_logger(__name__) + +# Default recording sample rate (Gemini outputs 24 kHz PCM) +_RECORDING_SAMPLE_RATE = 24000 + +# Audio output pacing: send 160-byte mulaw chunks (20ms at 8kHz) at real-time rate +# so the user 
simulator's silence detection works correctly. +MULAW_CHUNK_SIZE = 160 # bytes per chunk (20ms at 8kHz, 1 byte per sample) +MULAW_CHUNK_DURATION_S = 0.02 # 20ms per chunk + + +# --------------------------------------------------------------------------- +# Tool schema helpers +# --------------------------------------------------------------------------- + + +def _json_schema_type(python_type: str) -> str: + """Map Python/EVA type names to JSON Schema / Gemini type strings.""" + mapping = { + "string": "STRING", + "str": "STRING", + "integer": "INTEGER", + "int": "INTEGER", + "number": "NUMBER", + "float": "NUMBER", + "boolean": "BOOLEAN", + "bool": "BOOLEAN", + "array": "ARRAY", + "list": "ARRAY", + "object": "OBJECT", + "dict": "OBJECT", + } + return mapping.get(python_type.lower(), "STRING") + + +def _convert_schema_properties(props: dict[str, Any]) -> dict[str, types.Schema]: + """Recursively convert JSON Schema property dicts to Gemini Schema objects.""" + result: dict[str, types.Schema] = {} + for name, defn in props.items(): + if not isinstance(defn, dict): + result[name] = types.Schema(type="STRING") + continue + + schema_type = _json_schema_type(defn.get("type", "string")) + kwargs: dict[str, Any] = {"type": schema_type} + + if "description" in defn: + kwargs["description"] = defn["description"] + if "enum" in defn: + kwargs["enum"] = defn["enum"] + + # Nested object + if schema_type == "OBJECT" and "properties" in defn: + kwargs["properties"] = _convert_schema_properties(defn["properties"]) + + # Array items + if schema_type == "ARRAY" and "items" in defn: + items = defn["items"] + if isinstance(items, dict): + item_type = _json_schema_type(items.get("type", "string")) + item_kwargs: dict[str, Any] = {"type": item_type} + if "properties" in items: + item_kwargs["properties"] = _convert_schema_properties(items["properties"]) + kwargs["items"] = types.Schema(**item_kwargs) + else: + kwargs["items"] = types.Schema(type="STRING") + + result[name] = 
types.Schema(**kwargs) + return result + + +def _agent_tools_to_gemini(agent: AgentConfig) -> list[types.Tool] | None: + """Convert EVA AgentConfig tools to Gemini FunctionDeclaration list.""" + if not agent.tools: + return None + + declarations: list[types.FunctionDeclaration] = [] + for tool in agent.tools: + properties = _convert_schema_properties(tool.get_parameter_properties()) + required = tool.get_required_param_names() + + params_schema = types.Schema( + type="OBJECT", + properties=properties, + required=required or None, + ) + + declarations.append( + types.FunctionDeclaration( + name=tool.function_name, + description=f"{tool.name}: {tool.description}", + parameters=params_schema, + behavior=types.Behavior.BLOCKING, + ) + ) + + if not declarations: + return None + return [types.Tool(function_declarations=declarations)] + + +# --------------------------------------------------------------------------- +# Gemini Live AssistantServer +# --------------------------------------------------------------------------- + + +class GeminiLiveAssistantServer(AbstractAssistantServer): + """Bridges Twilio WebSocket <-> Gemini Live API for EVA-Bench evaluation.""" + + def __init__( + self, + current_date_time: str, + pipeline_config: PipelineConfig | SpeechToSpeechConfig | AudioLLMConfig, + agent: AgentConfig, + agent_config_path: str, + scenario_db_path: str, + output_dir: Path, + port: int, + conversation_id: str, + ): + super().__init__( + current_date_time=current_date_time, + pipeline_config=pipeline_config, + agent=agent, + agent_config_path=agent_config_path, + scenario_db_path=scenario_db_path, + output_dir=output_dir, + port=port, + conversation_id=conversation_id, + ) + + # Recording sample rate (Gemini outputs 24 kHz) + self._audio_sample_rate = _RECORDING_SAMPLE_RATE + + # Server state + self._app: FastAPI | None = None + self._server: uvicorn.Server | None = None + self._server_task: asyncio.Task | None = None + self._running = False + + # Gemini model name 
from s2s_params or default + s2s_params: dict[str, Any] = {} + if isinstance(self.pipeline_config, SpeechToSpeechConfig): + s2s_params = self.pipeline_config.s2s_params or {} + self._model = s2s_params.get("model", "gemini-3.1-flash-live-preview") + self._voice = s2s_params.get("voice", "Kore") + self._language_code = s2s_params.get("language_code", "en-US") + self._api_key = s2s_params.get("api_key", "") + + # Build system prompt (same pattern as pipecat realtime) + prompt_manager = PromptManager() + self._system_prompt = prompt_manager.get_prompt( + "realtime_agent.system_prompt", + agent_personality=agent.description, + agent_instructions=agent.instructions, + datetime=self.current_date_time, + ) + + # Build Gemini tools + self._gemini_tools = _agent_tools_to_gemini(agent) + + # Framework log writers + self._fw_log: FrameworkLogWriter | None = None + self._metrics_log: MetricsLogWriter | None = None + + # ------------------------------------------------------------------ + # Server lifecycle + # ------------------------------------------------------------------ + + async def start(self) -> None: + """Start the FastAPI WebSocket server (non-blocking).""" + if self._running: + logger.warning("Server already running") + return + + self.output_dir.mkdir(parents=True, exist_ok=True) + self._fw_log = FrameworkLogWriter(self.output_dir) + self._metrics_log = MetricsLogWriter(self.output_dir) + + self._app = FastAPI() + + @self._app.websocket("/ws") + async def websocket_endpoint(websocket: WebSocket): + await websocket.accept() + await self._handle_session(websocket) + + @self._app.websocket("/") + async def websocket_root(websocket: WebSocket): + await websocket.accept() + await self._handle_session(websocket) + + config = uvicorn.Config( + self._app, + host="0.0.0.0", + port=self.port, + log_level="warning", + lifespan="off", + ) + self._server = uvicorn.Server(config) + self._running = True + self._server_task = asyncio.create_task(self._server.serve()) + + while 
not self._server.started: + await asyncio.sleep(0.01) + + logger.info(f"GeminiLive server started on ws://localhost:{self.port}") + + async def stop(self) -> None: + """Stop the server, save outputs.""" + if not self._running: + return + self._running = False + + if self._server: + self._server.should_exit = True + if self._server_task: + try: + await asyncio.wait_for(self._server_task, timeout=5.0) + except TimeoutError: + self._server_task.cancel() + try: + await self._server_task + except asyncio.CancelledError: + pass + except (asyncio.CancelledError, KeyboardInterrupt): + pass + self._server = None + self._server_task = None + + await self.save_outputs() + logger.info(f"GeminiLive server stopped on port {self.port}") + + # ------------------------------------------------------------------ + # Gemini client factory + # ------------------------------------------------------------------ + + def _create_genai_client(self) -> genai.Client: + """Create a google-genai Client using Vertex AI or API key.""" + if self._api_key: + logger.info("Using Gemini API key for authentication") + return genai.Client(api_key=self._api_key) + + project = os.environ.get("GOOGLE_CLOUD_PROJECT") + location = os.environ.get("GOOGLE_CLOUD_LOCATION", "us-central1") + if project: + logger.info(f"Using Vertex AI (project={project}, location={location})") + return genai.Client(vertexai=True, project=project, location=location) + + # Fallback: let the SDK resolve credentials (e.g. 
ADC) + logger.info("No explicit credentials; relying on google-genai default resolution") + return genai.Client() + + # ------------------------------------------------------------------ + # Live session configuration + # ------------------------------------------------------------------ + + def _build_live_config(self) -> types.LiveConnectConfig: + """Build the LiveConnectConfig for the Gemini session.""" + config_kwargs: dict[str, Any] = { + "response_modalities": [types.Modality.AUDIO], + "system_instruction": self._system_prompt, + "speech_config": types.SpeechConfig( + voice_config=types.VoiceConfig( + prebuilt_voice_config=types.PrebuiltVoiceConfig( + voice_name=self._voice, + ) + ), + language_code=self._language_code, + ), + "realtime_input_config": types.RealtimeInputConfig( + automatic_activity_detection=types.AutomaticActivityDetection( + disabled=False, + start_of_speech_sensitivity=types.StartSensitivity.START_SENSITIVITY_LOW, + end_of_speech_sensitivity=types.EndSensitivity.END_SENSITIVITY_LOW, + silence_duration_ms=200, + ), + activity_handling=types.ActivityHandling.START_OF_ACTIVITY_INTERRUPTS, + ), + "input_audio_transcription": types.AudioTranscriptionConfig(), + "output_audio_transcription": types.AudioTranscriptionConfig(), + } + if self._gemini_tools: + config_kwargs["tools"] = self._gemini_tools + + return types.LiveConnectConfig(**config_kwargs) + + # ------------------------------------------------------------------ + # Session handler + # ------------------------------------------------------------------ + + async def _handle_session(self, websocket: WebSocket) -> None: + """Bridge a single Twilio WebSocket session with Gemini Live.""" + logger.info("Client connected to GeminiLive server") + + stream_sid: str = self.conversation_id + client = self._create_genai_client() + live_config = self._build_live_config() + + # Track Twilio stream state + twilio_connected = True + + # Accumulate assistant speech text per turn + _assistant_turn_text: 
list[str] = [] + _user_turn_text: list[str] = [] + + _in_model_turn = False + _user_speaking = False + _user_speech_start_ts: str | None = None # Timestamp from audio_interface + _assistant_turn_start_ts: str | None = None # Wall-clock ms when first audio chunk arrives + + # Queue for outbound mulaw chunks; the pacer task drains it at real-time rate + # so _process_gemini_events never sleeps and keeps reading Gemini events promptly. + audio_output_queue: asyncio.Queue[bytes] = asyncio.Queue() + + try: + async with client.aio.live.connect(model=self._model, config=live_config) as session: + logger.info(f"Gemini Live session connected (model={self._model})") + + # Trigger the initial greeting using realtime text input. + # send_client_content with Content turns is not supported by + # some Live models (e.g. gemini-3.1-flash-live-preview), but + # send_realtime_input(text=...) works universally. + await session.send_realtime_input(text=f"Please greet with: {INITIAL_MESSAGE}") + self._fw_log.turn_start() + + # ----- Concurrent tasks ----- + async def _forward_user_audio() -> None: + """Read Twilio WS messages, convert audio, send to Gemini.""" + nonlocal stream_sid, twilio_connected + try: + while twilio_connected and self._running: + try: + raw = await asyncio.wait_for(websocket.receive_text(), timeout=1.0) + except TimeoutError: + continue + + # Parse Twilio JSON envelope + try: + msg = json.loads(raw) + except json.JSONDecodeError: + continue + + event = msg.get("event") + if event == "start": + stream_sid = msg.get("start", {}).get("streamSid", stream_sid) + logger.info(f"Twilio stream started: {stream_sid}") + continue + elif event == "stop": + logger.info("Twilio stream stopped") + twilio_connected = False + break + elif event == "user_speech_start": + nonlocal _user_speech_start_ts + _user_speech_start_ts = msg.get("timestamp_ms") + logger.info(f"User speech start timestamp received: {_user_speech_start_ts}") + continue + elif event == "media": + # Extract raw 
mulaw bytes + mulaw_bytes = parse_twilio_media_message(raw) + if mulaw_bytes is None: + continue + + # Convert 8 kHz mulaw -> 16 kHz PCM for Gemini + pcm_16k = mulaw_8k_to_pcm16_16k(mulaw_bytes) + + pcm_24k = mulaw_8k_to_pcm16_24k(mulaw_bytes) + if not _in_model_turn: + sync_buffer_to_position(self.assistant_audio_buffer, len(self.user_audio_buffer)) + self.user_audio_buffer.extend(pcm_24k) + + # Send to Gemini + await session.send_realtime_input( + audio=types.Blob( + data=pcm_16k, + mime_type="audio/pcm;rate=16000", + ) + ) + except WebSocketDisconnect: + logger.info("Twilio WebSocket disconnected") + twilio_connected = False + except asyncio.CancelledError: + pass + except Exception as e: + logger.error(f"Error in user audio forwarder: {e}", exc_info=True) + finally: + twilio_connected = False + + async def _pace_audio_output() -> None: + """Drain audio_output_queue and forward chunks at real-time rate. + + Runs as its own task so _process_gemini_events never blocks on + sleep and can read the next Gemini event immediately. 
+ """ + nonlocal twilio_connected + next_send_time = time.monotonic() + try: + while self._running: + try: + chunk = await asyncio.wait_for(audio_output_queue.get(), timeout=1.0) + except TimeoutError: + continue + + twilio_msg = create_twilio_media_message(stream_sid, chunk) + try: + await websocket.send_text(twilio_msg) + except Exception: + twilio_connected = False + return + + now = time.monotonic() + if next_send_time <= now: + next_send_time = now + next_send_time += MULAW_CHUNK_DURATION_S + sleep_duration = next_send_time - time.monotonic() + if sleep_duration > 0: + await asyncio.sleep(sleep_duration) + except asyncio.CancelledError: + pass + + async def _process_gemini_events() -> None: + """Consume events from the Gemini Live session.""" + nonlocal _assistant_turn_text, _user_turn_text + nonlocal _in_model_turn, _user_speaking, _user_speech_start_ts, _assistant_turn_start_ts + nonlocal twilio_connected + + logger.info("Gemini event processor started") + event_count = 0 + try: + # Use manual receive loop instead of `async for ... in session.receive()` + # because the iterator exits after turn_complete (returns None), + # closing the session prematurely. The manual loop keeps the session + # alive between model turns. 
+ while self._running: + try: + response = await asyncio.wait_for(session._receive(), timeout=2.0) + except TimeoutError: + continue + if response is None: + continue + if not self._running: + break + + event_count += 1 + + # --- Server content (audio, transcriptions, turn signals) --- + if response.server_content: + sc = response.server_content + + # Model audio output + if sc.model_turn: + if not _in_model_turn: + _in_model_turn = True + _assistant_turn_text = [] + _assistant_turn_start_ts = str(int(round(time.time() * 1000))) + self._fw_log.turn_start() + + for part in sc.model_turn.parts: + if part.inline_data and part.inline_data.data: + pcm_24k = bytes(part.inline_data.data) + + # Skip tiny chunks that can't be resampled + if len(pcm_24k) < 6: + continue + + if not _user_speaking: + sync_buffer_to_position( + self.user_audio_buffer, len(self.assistant_audio_buffer) + ) + self.assistant_audio_buffer.extend(pcm_24k) + + # Convert to 8 kHz mulaw and send in + # small chunks so the user simulator's + # silence-detection timing works correctly. 
+ if twilio_connected: + try: + mulaw = pcm16_24k_to_mulaw_8k(pcm_24k) + except Exception as conv_err: + logger.warning( + f"Audio conversion error ({len(pcm_24k)} bytes): {conv_err}" + ) + continue + + offset = 0 + while offset < len(mulaw): + chunk = mulaw[offset : offset + MULAW_CHUNK_SIZE] + offset += MULAW_CHUNK_SIZE + await audio_output_queue.put(chunk) + + # Turn complete + if sc.turn_complete: + logger.debug("Gemini turn complete") + full_text = " ".join(_assistant_turn_text).strip() + if full_text: + self.audit_log.append_assistant_output( + full_text, timestamp_ms=_assistant_turn_start_ts + ) + self._fw_log.llm_response(full_text) + self._fw_log.turn_end(was_interrupted=False) + _in_model_turn = False + _assistant_turn_text = [] + _assistant_turn_start_ts = None + + # Barge-in / interruption + if sc.interrupted: + _user_speaking = True + logger.debug("Gemini turn interrupted (barge-in)") + full_text = " ".join(_assistant_turn_text).strip() + if full_text: + self.audit_log.append_assistant_output( + full_text + " [interrupted]", timestamp_ms=_assistant_turn_start_ts + ) + self._fw_log.tts_text(full_text) + self._fw_log.turn_end(was_interrupted=True) + _in_model_turn = False + _assistant_turn_text = [] + _assistant_turn_start_ts = None + + # Input transcription (user speech) + if sc.input_transcription: + _user_speaking = False + text = sc.input_transcription.text or "" + if text.strip(): + logger.info(f"User transcription: {text.strip()}") + self.audit_log.append_user_input( + text.strip(), timestamp_ms=_user_speech_start_ts + ) + _user_speech_start_ts = None # Reset for next turn + + # Output transcription (model speech) + if sc.output_transcription: + text = sc.output_transcription.text or "" + if text.strip(): + _assistant_turn_text.append(text.strip()) + logger.debug(f"Assistant transcription chunk: {text.strip()}") + + # --- Tool calls --- + if response.tool_call: + for fc in response.tool_call.function_calls: + tool_name = fc.name + tool_args = 
dict(fc.args) if fc.args else {} + logger.info(f"Tool call: {tool_name}({json.dumps(tool_args)})") + + # Record in audit log + self.audit_log.append_realtime_tool_call(tool_name, tool_args) + + # Execute tool + result = await self.tool_handler.execute(tool_name, tool_args) + logger.debug(f"Tool result: {tool_name} -> {json.dumps(result)}") + self.audit_log.append_tool_response(tool_name, result) + + # Send result back to Gemini + await session.send_tool_response( + function_responses=[ + types.FunctionResponse( + id=fc.id, + name=fc.name, + response=result, + ) + ] + ) + + # --- Usage metadata --- + if response.usage_metadata: + um = response.usage_metadata + prompt_tokens = getattr(um, "prompt_token_count", 0) or 0 + completion_tokens = getattr(um, "candidates_token_count", 0) or 0 + if prompt_tokens or completion_tokens: + self._metrics_log.write_token_usage( + processor="gemini_live", + model=self._model, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + ) + + except asyncio.CancelledError: + pass + except Exception as e: + logger.error(f"Error in Gemini event processor: {e}", exc_info=True) + + # Run all three tasks; when any exits, cancel the others + user_task = asyncio.create_task(_forward_user_audio()) + gemini_task = asyncio.create_task(_process_gemini_events()) + pacer_task = asyncio.create_task(_pace_audio_output()) + + done, pending = await asyncio.wait( + [user_task, gemini_task, pacer_task], + return_when=asyncio.FIRST_COMPLETED, + ) + + def _task_name(t: asyncio.Task) -> str: + if t is user_task: + return "user_audio" + if t is gemini_task: + return "gemini_events" + return "audio_pacer" + + # Log which task finished first + for task in done: + exc = task.exception() + if exc: + logger.error(f"Task '{_task_name(task)}' failed: {exc}", exc_info=exc) + else: + logger.info(f"Task '{_task_name(task)}' completed normally") + + for task in pending: + logger.info(f"Cancelling pending task '{_task_name(task)}'") + task.cancel() + try: + 
await task + except asyncio.CancelledError: + pass + + except Exception as e: + logger.error(f"Gemini Live session error: {e}", exc_info=True) + finally: + logger.info("Client disconnected from GeminiLive server") diff --git a/src/eva/assistant/openai_realtime_server.py b/src/eva/assistant/openai_realtime_server.py new file mode 100644 index 00000000..1a423fed --- /dev/null +++ b/src/eva/assistant/openai_realtime_server.py @@ -0,0 +1,798 @@ +"""OpenAI Realtime API assistant server implementation. + +Uses the OpenAI Python SDK's Realtime API (client.beta.realtime.connect()) +to bridge audio between a Twilio-framed WebSocket (user simulator) and the +OpenAI Realtime model. Handles tool calls via the local ToolExecutor and +records all conversation events in the audit log. +""" + +import asyncio +import base64 +import json +import time +from dataclasses import dataclass, field +from typing import Any + +import uvicorn +from fastapi import FastAPI, WebSocket, WebSocketDisconnect +from openai import AsyncOpenAI + +from eva.assistant.audio_bridge import ( + FrameworkLogWriter, + MetricsLogWriter, + create_twilio_media_message, + mulaw_8k_to_pcm16_24k, + parse_twilio_media_message, + pcm16_24k_to_mulaw_8k, + sync_buffer_to_position, +) +from eva.assistant.base_server import INITIAL_MESSAGE, AbstractAssistantServer +from eva.utils.logging import get_logger +from eva.utils.prompt_manager import PromptManager + +logger = get_logger(__name__) + +# OpenAI Realtime operates at 24 kHz 16-bit mono PCM +OPENAI_SAMPLE_RATE = 24000 + +# Audio output pacing: send 160-byte mulaw chunks (20ms at 8kHz) at real-time rate +# so the user simulator's silence detection works correctly. 
+MULAW_CHUNK_SIZE = 160 # bytes per chunk (20ms at 8kHz, 1 byte per sample) +MULAW_CHUNK_DURATION_S = 0.02 # 20ms per chunk + + +def _wall_ms() -> str: + """Return current wall-clock time as epoch-milliseconds string.""" + return str(int(round(time.time() * 1000))) + + +@dataclass +class _UserTurnRecord: + """Tracks state for a single user speech turn.""" + + speech_started_wall_ms: str = "" + speech_stopped_wall_ms: str = "" + transcript: str = "" + flushed: bool = False + + +@dataclass +class _AssistantResponseState: + """Accumulates state for the current assistant response.""" + + transcript_parts: list[str] = field(default_factory=list) + transcript_done_text: str = "" # Final text from response.audio_transcript.done + first_audio_wall_ms: str | None = None + responding: bool = False + has_function_calls: bool = False + + +class OpenAIRealtimeAssistantServer(AbstractAssistantServer): + """Assistant server backed by the OpenAI Realtime API. + + Exposes a local WebSocket at ``ws://localhost:{port}/ws`` using the Twilio + frame format so the user simulator can connect as if talking to Twilio. + Internally bridges audio between Twilio (8 kHz mulaw) and OpenAI Realtime + (24 kHz PCM16 base64). 
+ """ + + def __init__(self, **kwargs: Any) -> None: + super().__init__(**kwargs) + + self._audio_sample_rate = OPENAI_SAMPLE_RATE + + self._app: FastAPI | None = None + self._server: uvicorn.Server | None = None + self._server_task: asyncio.Task | None = None + self._running: bool = False + + self._fw_log: FrameworkLogWriter | None = None + self._metrics_log: MetricsLogWriter | None = None + + prompt_manager = PromptManager() + self._system_prompt: str = prompt_manager.get_prompt( + "realtime_agent.system_prompt", + agent_personality=self.agent.description, + agent_instructions=self.agent.instructions, + datetime=self.current_date_time, + ) + + self._realtime_tools: list[dict] = self._build_realtime_tools() + + self._user_turn: _UserTurnRecord | None = None + self._assistant_state = _AssistantResponseState() + self._stream_sid: str = "" + + self._user_speaking: bool = False + self._bot_speaking: bool = False + self._user_frame_count: int = 0 + self._delta_count: int = 0 + + # User speech start timestamp from audio_interface (source of truth) + self._audio_interface_speech_start_ts: str | None = None + + self._model: str = self.pipeline_config.s2s + + async def start(self) -> None: + """Start the FastAPI WebSocket server.""" + if self._running: + logger.warning("Server already running") + return + + self.output_dir.mkdir(parents=True, exist_ok=True) + self._fw_log = FrameworkLogWriter(self.output_dir) + self._metrics_log = MetricsLogWriter(self.output_dir) + + self._app = FastAPI() + + @self._app.websocket("/ws") + async def websocket_endpoint(websocket: WebSocket): + await websocket.accept() + await self._handle_session(websocket) + + @self._app.websocket("/") + async def websocket_root(websocket: WebSocket): + await websocket.accept() + await self._handle_session(websocket) + + config = uvicorn.Config( + self._app, + host="0.0.0.0", + port=self.port, + log_level="warning", + lifespan="off", + ) + self._server = uvicorn.Server(config) + self._running = True + 
self._server_task = asyncio.create_task(self._server.serve()) + + while not self._server.started: + await asyncio.sleep(0.01) + + logger.info(f"OpenAI Realtime server started on ws://localhost:{self.port}") + + async def stop(self) -> None: + """Stop the server and save all outputs.""" + if not self._running: + return + + self._running = False + + if self._server: + self._server.should_exit = True + if self._server_task: + try: + await asyncio.wait_for(self._server_task, timeout=5.0) + except TimeoutError: + self._server_task.cancel() + try: + await self._server_task + except asyncio.CancelledError: + pass + except (asyncio.CancelledError, KeyboardInterrupt): + pass + self._server = None + self._server_task = None + + await self.save_outputs() + logger.info(f"OpenAI Realtime server stopped on port {self.port}") + + async def save_outputs(self) -> None: + """Save all outputs including mixed audio.""" + await super().save_outputs() + + async def _handle_session(self, websocket: WebSocket) -> None: + """Handle a single WebSocket session. + + 1. Accept Twilio WS connection + 2. Connect to OpenAI Realtime API + 3. Configure session (instructions, tools, voice, VAD) + 4. Run two concurrent tasks: + a. Forward user audio: Twilio WS -> decode mulaw -> PCM16 24kHz base64 -> OpenAI + b. Process OpenAI events: async for event in conn -> handle each type + 5. On tool call: execute via self.tool_handler, send result back + 6. 
On audio: decode base64 PCM16 -> record -> encode mulaw -> send to Twilio WS + """ + logger.info("Client connected to OpenAI Realtime server") + + # Reset per-session state + self._user_turn = None + self._assistant_state = _AssistantResponseState() + self._stream_sid = self.conversation_id + self._user_speaking = False + self._bot_speaking = False + + api_key = self.pipeline_config.s2s_params.get("api_key") + if not api_key: + raise ValueError("API key required for openai realtime") + client = AsyncOpenAI(api_key=api_key) + + try: + async with client.realtime.connect(model=self._model) as conn: + # Configure the session + await conn.session.update( + session={ + "type": "realtime", + "output_modalities": ["audio"], + "instructions": self._system_prompt, + "audio": { + "output": { + "voice": self.pipeline_config.s2s_params.get("voice", "marin"), + "format": {"type": "audio/pcm", "rate": 24000}, + }, + "input": { + "format": {"type": "audio/pcm", "rate": 24000}, + "turn_detection": { + "type": self.pipeline_config.s2s_params.get("vad_settings", {}).get( + "type", "server_vad" + ), + "threshold": self.pipeline_config.s2s_params.get("vad_settings", {}).get( + "threshold", 0.5 + ), + "prefix_padding_ms": self.pipeline_config.s2s_params.get("vad_settings", {}).get( + "prefix_padding_ms", 300 + ), + "silence_duration_ms": self.pipeline_config.s2s_params.get("vad_settings", {}).get( + "silence_duration_ms", 200 + ), + }, + "transcription": { + "model": self.pipeline_config.s2s_params.get("transcription_model", "whisper-1") + }, + }, + }, + "tools": self._realtime_tools, + } + ) + + # Trigger the initial greeting + await conn.conversation.item.create( + item={ + "type": "message", + "role": "user", + "content": [ + { + "type": "input_text", + "text": f"Say: '{INITIAL_MESSAGE}'", + } + ], + } + ) + await conn.response.create() + + # Run all three tasks; when any exits, cancel the others + audio_output_queue: asyncio.Queue[bytes] = asyncio.Queue() + forward_task = 
asyncio.create_task(self._forward_user_audio(websocket, conn)) + receive_task = asyncio.create_task(self._process_openai_events(conn, websocket, audio_output_queue)) + pacer_task = asyncio.create_task(self._pace_audio_output(websocket, audio_output_queue)) + + done, pending = await asyncio.wait( + [forward_task, receive_task, pacer_task], + return_when=asyncio.FIRST_COMPLETED, + ) + for task in pending: + task.cancel() + try: + await task + except asyncio.CancelledError: + pass + + # Check for exceptions in completed tasks + for task in done: + if task.exception(): + logger.error(f"Session task failed: {task.exception()}") + + except Exception as e: + logger.error(f"OpenAI Realtime session error: {e}", exc_info=True) + finally: + logger.info("Client disconnected from OpenAI Realtime server") + + # ── Audio output pacer (OpenAI -> Twilio WS at real-time rate) ─── + + async def _pace_audio_output(self, websocket: WebSocket, audio_output_queue: asyncio.Queue[bytes]) -> None: + """Drain audio_output_queue and forward chunks at real-time rate. + + Runs as its own task so _process_openai_events never blocks on sleep + and can read the next OpenAI event immediately. 
+ """ + next_send_time = time.monotonic() + try: + while True: + try: + chunk = await asyncio.wait_for(audio_output_queue.get(), timeout=1.0) + except TimeoutError: + continue + + twilio_msg = create_twilio_media_message(self._stream_sid, chunk) + try: + await websocket.send_text(twilio_msg) + except Exception as e: + logger.error(f"Error sending audio to Twilio WS: {e}") + return + + now = time.monotonic() + if next_send_time <= now: + next_send_time = now + next_send_time += MULAW_CHUNK_DURATION_S + sleep_duration = next_send_time - time.monotonic() + if sleep_duration > 0: + await asyncio.sleep(sleep_duration) + except asyncio.CancelledError: + pass + + # ── User audio forwarding (Twilio WS -> OpenAI) ────────────────── + + async def _forward_user_audio(self, websocket: WebSocket, conn: Any) -> None: + """Read Twilio media frames and forward audio to OpenAI Realtime.""" + try: + while True: + raw = await websocket.receive_text() + data = json.loads(raw) + event_type = data.get("event") + + if event_type == "start": + # Twilio stream start - extract streamSid + self._stream_sid = data.get("start", {}).get("streamSid", self.conversation_id) + logger.debug(f"Twilio stream started: streamSid={self._stream_sid}") + continue + + if event_type == "stop": + logger.debug("Twilio stream stopped") + break + + if event_type == "user_speech_start": + # Timestamp from audio_interface when user audio actually started + self._audio_interface_speech_start_ts = data.get("timestamp_ms") + logger.debug(f"User speech start timestamp received: {self._audio_interface_speech_start_ts}") + continue + + if event_type != "media": + continue + + # Extract raw mulaw audio bytes + mulaw_bytes = parse_twilio_media_message(raw) + if mulaw_bytes is None: + continue + + # Convert 8kHz mulaw -> 24kHz PCM16 + pcm16_24k = mulaw_8k_to_pcm16_24k(mulaw_bytes) + + asst_before = len(self.assistant_audio_buffer) + synced = 0 + if not self._bot_speaking: + sync_target = len(self.user_audio_buffer) + 
sync_buffer_to_position(self.assistant_audio_buffer, sync_target) + synced = len(self.assistant_audio_buffer) - asst_before + self.user_audio_buffer.extend(pcm16_24k) + self._user_frame_count += 1 + if self._user_frame_count % 50 == 0: + diff = len(self.user_audio_buffer) - len(self.assistant_audio_buffer) + diff_ms = diff / (OPENAI_SAMPLE_RATE * 2) * 1000 + logger.debug( + f"[ALIGN DEBUG] user_frame #{self._user_frame_count}: " + f"user={len(self.user_audio_buffer)} asst={len(self.assistant_audio_buffer)} " + f"diff={diff}({diff_ms:.0f}ms) bot_spk={self._bot_speaking} " + f"usr_spk={self._user_speaking} added={len(pcm16_24k)} synced={synced}" + ) + + # Encode as base64 and send to OpenAI + audio_b64 = base64.b64encode(pcm16_24k).decode("ascii") + await conn.input_audio_buffer.append(audio=audio_b64) + + except WebSocketDisconnect: + logger.debug("Twilio WebSocket disconnected") + except asyncio.CancelledError: + pass + except Exception as e: + logger.error(f"Error forwarding user audio: {e}", exc_info=True) + + # ── OpenAI event processing ─────────────────────────────────────── + + async def _process_openai_events( + self, conn: Any, websocket: WebSocket, audio_output_queue: asyncio.Queue[bytes] + ) -> None: + """Process events from the OpenAI Realtime connection.""" + try: + async for event in conn: + try: + await self._handle_openai_event(event, conn, websocket, audio_output_queue) + except Exception as e: + logger.error(f"Error handling event {getattr(event, 'type', '?')}: {e}", exc_info=True) + except asyncio.CancelledError: + pass + except Exception as e: + logger.error(f"Error in OpenAI event loop: {e}", exc_info=True) + + async def _handle_openai_event( + self, event: Any, conn: Any, websocket: WebSocket, audio_output_queue: asyncio.Queue[bytes] + ) -> None: + """Dispatch a single OpenAI Realtime event.""" + event_type = getattr(event, "type", "") + + match event_type: + case "session.created": + logger.info("OpenAI Realtime session created") + + case 
"session.updated": + logger.debug("OpenAI Realtime session updated") + + case "input_audio_buffer.speech_started": + await self._on_speech_started(event) + + case "input_audio_buffer.speech_stopped": + await self._on_speech_stopped(event) + + case "conversation.item.input_audio_transcription.completed": + await self._on_transcription_completed(event) + + case "conversation.item.input_audio_transcription.delta": + logger.debug(f"Transcription delta: {getattr(event, 'delta', '')}") + + case "conversation.item.input_audio_transcription.failed": + error_info = getattr(event, "error", "") + logger.warning(f"Transcription failed: {error_info}") + # Gracefully handle transcription failure (e.g. API key lacks + # whisper-1 access). If a user turn was active but has no + # transcript yet, record a placeholder so the turn is not lost. + if self._user_turn and not self._user_turn.flushed: + timestamp_ms = self._user_turn.speech_started_wall_ms or None + self.audit_log.append_user_input( + "[user speech - transcription unavailable]", + timestamp_ms=timestamp_ms, + ) + self._user_turn.flushed = True + + case "response.output_audio.delta": + await self._on_audio_delta(event, audio_output_queue) + + case "response.output_audio_transcript.delta": + self._on_transcript_delta(event) + + case "response.output_audio_transcript.done": + self._on_transcript_done(event) + + case "response.function_call_arguments.done": + await self._on_function_call_done(event, conn) + + case "response.done": + await self._on_response_done(event) + + case "error": + error_data = getattr(event, "error", None) + logger.error(f"OpenAI Realtime error: {error_data}") + + case _: + logger.debug(f"Unhandled OpenAI event: {event_type}") + + # ── Event handlers ──────────────────────────────────────────────── + + async def _on_speech_started(self, event: Any) -> None: + """Handle input_audio_buffer.speech_started.""" + self._user_speaking = True + diff = len(self.user_audio_buffer) - 
len(self.assistant_audio_buffer) + diff_ms = diff / (OPENAI_SAMPLE_RATE * 2) * 1000 + logger.debug( + f"[ALIGN DEBUG] speech_started: user={len(self.user_audio_buffer)} " + f"asst={len(self.assistant_audio_buffer)} diff={diff}({diff_ms:.0f}ms) " + f"bot_spk={self._bot_speaking}" + ) + wall = _wall_ms() + + # If assistant was responding, flush interrupted response + if self._assistant_state.responding and self._assistant_state.transcript_parts: + partial_text = "".join(self._assistant_state.transcript_parts) + " [interrupted]" + self.audit_log.append_assistant_output( + partial_text, + timestamp_ms=self._assistant_state.first_audio_wall_ms, + ) + if self._fw_log: + self._fw_log.tts_text(partial_text) + self._fw_log.turn_end(was_interrupted=True) + logger.debug(f"Flushed interrupted assistant response: {partial_text[:60]}...") + self._assistant_state = _AssistantResponseState() + + # Start new user turn only if previous one was flushed (or doesn't exist) + # This preserves the original timestamp when VAD fires multiple speech_started + # events during a single logical user utterance (due to brief pauses) + if not self._user_turn or self._user_turn.flushed: + # Use timestamp from audio_interface if available (source of truth) + start_ts = self._audio_interface_speech_start_ts or wall + self._user_turn = _UserTurnRecord(speech_started_wall_ms=start_ts) + if self._fw_log: + self._fw_log.turn_start(timestamp_ms=int(start_ts)) + logger.debug( + f"Speech started at {start_ts} (new turn, from_audio_interface={self._audio_interface_speech_start_ts is not None})" + ) + self._audio_interface_speech_start_ts = None # Reset for next turn + else: + logger.debug(f"Speech started at {wall} (continuing existing turn)") + + async def _on_speech_stopped(self, event: Any) -> None: + """Handle input_audio_buffer.speech_stopped.""" + self._user_speaking = False + diff = len(self.user_audio_buffer) - len(self.assistant_audio_buffer) + diff_ms = diff / (OPENAI_SAMPLE_RATE * 2) * 1000 + 
logger.info( + f"[ALIGN DEBUG] speech_stopped: user={len(self.user_audio_buffer)} " + f"asst={len(self.assistant_audio_buffer)} diff={diff}({diff_ms:.0f}ms) " + f"bot_spk={self._bot_speaking}" + ) + wall = _wall_ms() + if self._user_turn: + self._user_turn.speech_stopped_wall_ms = wall + else: + self._user_turn = _UserTurnRecord(speech_stopped_wall_ms=wall) + + logger.debug(f"Speech stopped at {wall}") + + async def _on_transcription_completed(self, event: Any) -> None: + """Handle conversation.item.input_audio_transcription.completed.""" + transcript = getattr(event, "transcript", "") or "" + transcript = transcript.strip() + + if not transcript: + logger.debug("Empty transcription, skipping") + return + + timestamp_ms = None + if self._user_turn: + timestamp_ms = self._user_turn.speech_started_wall_ms or None + self._user_turn.transcript = transcript + self._user_turn.flushed = True + + self.audit_log.append_user_input(transcript, timestamp_ms=timestamp_ms) + logger.debug(f"User transcription: {transcript}...") + + async def _on_audio_delta(self, event: Any, audio_output_queue: asyncio.Queue[bytes]) -> None: + """Handle response.audio.delta - assistant audio chunk.""" + delta_b64 = getattr(event, "delta", "") or "" + if not delta_b64: + return + + pcm16_bytes = base64.b64decode(delta_b64) + + if self._assistant_state.first_audio_wall_ms is None: + self._assistant_state.first_audio_wall_ms = _wall_ms() + self._assistant_state.responding = True + self._bot_speaking = True + + user_before = len(self.user_audio_buffer) + synced = 0 + if not self._user_speaking: + sync_buffer_to_position(self.user_audio_buffer, len(self.assistant_audio_buffer)) + synced = len(self.user_audio_buffer) - user_before + self.assistant_audio_buffer.extend(pcm16_bytes) + self._delta_count += 1 + if self._delta_count % 10 == 0: + diff = len(self.user_audio_buffer) - len(self.assistant_audio_buffer) + diff_ms = diff / (OPENAI_SAMPLE_RATE * 2) * 1000 + logger.debug( + f"[ALIGN DEBUG] 
audio_delta #{self._delta_count}: " + f"user={len(self.user_audio_buffer)} asst={len(self.assistant_audio_buffer)} " + f"diff={diff}({diff_ms:.0f}ms) bot_spk={self._bot_speaking} " + f"usr_spk={self._user_speaking} added={len(pcm16_bytes)} synced_user={synced}" + ) + + # Convert 24kHz PCM16 -> 8kHz mulaw and enqueue for real-time pacing. + # _pace_audio_output owns the timing loop so this method returns immediately, + # allowing the OpenAI event loop to process the next event without delay. + try: + mulaw_bytes = pcm16_24k_to_mulaw_8k(pcm16_bytes) + offset = 0 + while offset < len(mulaw_bytes): + chunk = mulaw_bytes[offset : offset + MULAW_CHUNK_SIZE] + offset += MULAW_CHUNK_SIZE + await audio_output_queue.put(chunk) + except Exception as e: + logger.error(f"Error converting audio for output queue: {e}") + + def _on_transcript_delta(self, event: Any) -> None: + """Handle response.audio_transcript.delta - incremental assistant text.""" + delta = getattr(event, "delta", "") or "" + if delta: + self._assistant_state.transcript_parts.append(delta) + + def _on_transcript_done(self, event: Any) -> None: + """Handle response.audio_transcript.done - full assistant transcript. + + This is the most reliable source of what the model actually said. + Store it so _on_response_done can use it if delta accumulation failed. 
+ """ + transcript = getattr(event, "transcript", "") or "" + if transcript: + self._assistant_state.transcript_done_text = transcript.strip() + logger.debug(f"Assistant transcript done: {transcript}...") + + async def _on_function_call_done(self, event: Any, conn: Any) -> None: + """Handle response.function_call_arguments.done - execute tool call.""" + call_id = getattr(event, "call_id", "") + func_name = getattr(event, "name", "") + arguments_str = getattr(event, "arguments", "{}") or "{}" + + try: + arguments = json.loads(arguments_str) + except json.JSONDecodeError: + arguments = {} + + logger.info(f"Tool call: {func_name}({json.dumps(arguments)})") + self._assistant_state.has_function_calls = True + + # Record in audit log + self.audit_log.append_realtime_tool_call(func_name, arguments) + + # Execute tool + result = await self.tool_handler.execute(func_name, arguments) + + # Record tool response + self.audit_log.append_tool_response(func_name, result) + + if self._fw_log: + self._fw_log.write( + "tool_call", + { + "frame": "tool_call", + "tool_name": func_name, + "arguments": arguments, + "result": result, + }, + ) + + # Send function call output back to OpenAI + await conn.conversation.item.create( + item={ + "type": "function_call_output", + "call_id": call_id, + "output": json.dumps(result), + } + ) + + # Trigger next response after tool result + await conn.response.create() + + async def _on_response_done(self, event: Any) -> None: + """Handle response.done - assistant response complete. 
+ + Following the pipecat InstrumentedRealtimeLLMService pattern: + - Only call append_assistant_output() (no append_llm_call) + - Token usage goes to pipecat_metrics.jsonl only + """ + # Extract usage metrics + response = getattr(event, "response", None) + if response: + usage = getattr(response, "usage", None) + if usage and self._metrics_log: + input_tokens = getattr(usage, "input_tokens", 0) or 0 + output_tokens = getattr(usage, "output_tokens", 0) or 0 + self._metrics_log.write_token_usage( + processor="openai_realtime", + model=self._model, + prompt_tokens=input_tokens, + completion_tokens=output_tokens, + ) + + # Skip cancelled responses - these were interrupted and not fully spoken + if response and getattr(response, "status", None) == "cancelled": + logger.debug("response_done: cancelled response, skipping transcript entry") + self._reset_assistant_state() + return + + has_function_calls = self._response_has_function_calls(event) + + # Build transcript text from best available source: + # 1. response.audio_transcript.done text (most reliable) + # 2. Accumulated response.audio_transcript.delta parts + # 3. 
Text extracted from response.done output items + content = self._assistant_state.transcript_done_text + if not content: + content = "".join(self._assistant_state.transcript_parts).strip() + if not content: + content = self._extract_response_text(event) + + audio_was_streamed = self._assistant_state.first_audio_wall_ms is not None + + # Skip tool-call-only responses (nothing spoken) + if not content and has_function_calls: + logger.debug("response_done: tool-call-only response, skipping assistant entry") + self._reset_assistant_state() + return + + # Skip mixed responses where audio was not streamed + if content and not audio_was_streamed and has_function_calls: + logger.debug(f"response_done: mixed response with no audio, skipping: '{content[:60]}...'") + self._reset_assistant_state() + return + + # If audio was streamed but we have no transcript at all, skip rather + # than pollute the audit log with a placeholder. The audio recording + # still captures what was said. + if not content and audio_was_streamed: + logger.debug("response_done: audio streamed but no transcript available, skipping text entry") + self._reset_assistant_state() + return + + if not content: + # No audio, no text, no function calls — nothing to log + self._reset_assistant_state() + return + + # Log assistant output (single entry — no append_llm_call) + timestamp = self._assistant_state.first_audio_wall_ms or _wall_ms() + self.audit_log.append_assistant_output(content, timestamp_ms=timestamp) + + if self._fw_log: + self._fw_log.llm_response(content) + self._fw_log.turn_end(was_interrupted=False) + + logger.debug(f"response_done: '{content[:60]}...'") + self._reset_assistant_state() + + # ── Helpers ─────────────────────────────────────────────────────── + + def _reset_assistant_state(self) -> None: + """Clear accumulated assistant response state.""" + audio_was_streamed = self._assistant_state.first_audio_wall_ms is not None + diff = len(self.user_audio_buffer) - 
len(self.assistant_audio_buffer) + diff_ms = diff / (OPENAI_SAMPLE_RATE * 2) * 1000 + logger.debug( + f"[ALIGN DEBUG] reset_state: user={len(self.user_audio_buffer)} " + f"asst={len(self.assistant_audio_buffer)} diff={diff}({diff_ms:.0f}ms) " + f"audio_streamed={audio_was_streamed} bot_spk={self._bot_speaking}" + ) + if audio_was_streamed: + self._bot_speaking = False + self._assistant_state = _AssistantResponseState() + + def _build_realtime_tools(self) -> list[dict]: + """Convert agent tools to OpenAI Realtime session tool format. + + The Realtime API session.tools expects a flat structure: + {type, name, description, parameters: {type, properties, required}} + """ + tools: list[dict] = [] + if not self.agent.tools: + return tools + + for tool in self.agent.tools: + tools.append( + { + "type": "function", + "name": tool.function_name, + "description": f"{tool.name}: {tool.description}", + "parameters": { + "type": "object", + "properties": tool.get_parameter_properties(), + "required": tool.get_required_param_names(), + }, + } + ) + return tools + + @staticmethod + def _response_has_function_calls(event: Any) -> bool: + """Return True if the response.done event contains function_call outputs.""" + response = getattr(event, "response", None) + if not response: + return False + output_items = getattr(response, "output", None) or [] + return any(getattr(item, "type", "") == "function_call" for item in output_items) + + @staticmethod + def _extract_response_text(event: Any) -> str: + """Extract text content from response.done output items.""" + response = getattr(event, "response", None) + if not response: + return "" + + output_items = getattr(response, "output", None) or [] + text_parts: list[str] = [] + + for item in output_items: + content_list = getattr(item, "content", None) or [] + for part in content_list: + part_type = getattr(part, "type", "") + if part_type in ("audio", "text"): + transcript = getattr(part, "transcript", None) or getattr(part, "text", 
None) or "" + if transcript: + text_parts.append(transcript) + + return "".join(text_parts).strip() diff --git a/src/eva/assistant/server.py b/src/eva/assistant/pipecat_server.py similarity index 79% rename from src/eva/assistant/server.py rename to src/eva/assistant/pipecat_server.py index 2de9661a..42d8700c 100644 --- a/src/eva/assistant/server.py +++ b/src/eva/assistant/pipecat_server.py @@ -6,9 +6,7 @@ import asyncio import json -import wave from pathlib import Path -from typing import Any import uvicorn from fastapi import FastAPI, WebSocket @@ -22,7 +20,6 @@ TTSSpeakFrame, ) from pipecat.observers.loggers.metrics_log_observer import MetricsLogObserver -from pipecat.observers.user_bot_latency_observer import UserBotLatencyObserver from pipecat.pipeline.parallel_pipeline import ParallelPipeline from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner @@ -45,7 +42,8 @@ from pipecat.turns.user_turn_strategies import ExternalUserTurnStrategies, UserTurnStrategies from pipecat.utils.time import time_now_iso8601 -from eva.assistant.agentic.audit_log import AuditLog, convert_to_epoch_ms, current_timestamp_ms +from eva.assistant.agentic.audit_log import convert_to_epoch_ms, current_timestamp_ms +from eva.assistant.base_server import INITIAL_MESSAGE, AbstractAssistantServer from eva.assistant.pipeline.agent_processor import BenchmarkAgentProcessor, UserAudioCollector, UserObserver from eva.assistant.pipeline.audio_llm_processor import ( AudioLLMProcessor, @@ -62,7 +60,6 @@ create_tts_service, ) from eva.assistant.services.llm import LiteLLMClient -from eva.assistant.tools.tool_executor import ToolExecutor from eva.models.agents import AgentConfig from eva.models.config import AudioLLMConfig, PipelineConfig, SpeechToSpeechConfig from eva.utils.logging import get_logger @@ -77,10 +74,8 @@ # Should be larger than pipecat's VAD start_secs (0.2s) to account for VAD latency. VAD_PRE_SPEECH_BUFFER_SECS = 0.5 -INITIAL_MESSAGE = "Hello! 
How can I help you today?" - -class AssistantServer: +class PipecatAssistantServer(AbstractAssistantServer): """Pipecat-based WebSocket server for the assistant in voice conversations. This server: @@ -113,35 +108,24 @@ def __init__( port: Port to listen on conversation_id: Unique ID for this conversation """ - self.pipeline_config = pipeline_config - self.agent: AgentConfig = agent - self.agent_config_path = agent_config_path - self.scenario_db_path = scenario_db_path - self.output_dir = Path(output_dir) - self.port = port - self.conversation_id = conversation_id - self.current_date_time = current_date_time - - # Components (initialized on start) - self.audit_log = AuditLog() - self.agentic_system = None # Will be set in _handle_session - - # Initialize Python-based tool executor - self.tool_handler = ToolExecutor( - tool_config_path=agent_config_path, + super().__init__( + current_date_time=current_date_time, + pipeline_config=pipeline_config, + agent=agent, + agent_config_path=agent_config_path, scenario_db_path=scenario_db_path, - tool_module_path=self.agent.tool_module_path, - current_date_time=self.current_date_time, + output_dir=output_dir, + port=port, + conversation_id=conversation_id, ) + self.agentic_system = None # Will be set in _handle_session + # Wall-clock captured at on_user_turn_started for non-instrumented S2S models self._user_turn_started_wall_ms: str | None = None - # Audio buffer for accumulating audio data - self._audio_buffer = bytearray() + # Override audio sample rate for pipecat self._audio_sample_rate = SAMPLE_RATE - self.user_audio_buffer = bytearray() - self.assistant_audio_buffer = bytearray() # Server state self._app = None @@ -151,7 +135,6 @@ def __init__( self._task: PipelineTask | None = None self._running = False self.num_seconds = 0 - self._latency_measurements: list[float] = [] self._metrics_observer: MetricsFileObserver | None = None self.non_instrumented_realtime_llm = False @@ -230,7 +213,7 @@ async def stop(self) -> None: 
self._server_task = None # Save outputs - await self._save_outputs() + await self.save_outputs() logger.info(f"Assistant server stopped on port {self.port}") @@ -444,35 +427,6 @@ async def on_user_transcription(text: str, timestamp: str, turn_id: int | None) ) metrics_log_path = self.output_dir / "pipecat_metrics.jsonl" - self._latency_measurements = [] - - async def on_latency_measured(observer, latency_seconds: float): - """Event handler for UserBotLatencyObserver - stores latency measurements. - - For realtime LLM, adds VAD delay to get full user-perceived latency. - For pipecat VAD (non-realtime), uses the latency as-is. - """ - adjusted_latency = latency_seconds - - # Add VAD delay for realtime LLM to get full user-perceived latency - if isinstance(realtime_llm, InstrumentedRealtimeLLMService): - vad_delay_ms = realtime_llm.last_vad_delay_ms - if vad_delay_ms is not None: - vad_delay_s = vad_delay_ms / 1000.0 - adjusted_latency = latency_seconds + vad_delay_s - logger.debug( - f"Response latency captured: {adjusted_latency:.3f}s " - f"(VAD delay: {vad_delay_s:.3f}s + pipecat: {latency_seconds:.3f}s)" - ) - else: - logger.debug(f"Response latency captured: {latency_seconds:.3f}s (no VAD delay available)") - else: - logger.debug(f"Response latency captured: {latency_seconds:.3f}s") - - self._latency_measurements.append(adjusted_latency) - - user_bot_observer = UserBotLatencyObserver() - user_bot_observer.add_event_handler("on_latency_measured", on_latency_measured) # Create wall clock for consistent timestamps across log sources wall_clock = WallClock() @@ -481,7 +435,6 @@ async def on_latency_measured(observer, latency_seconds: float): observers = [ BenchmarkLogObserver(str(self.output_dir), self.conversation_id, clock=wall_clock), - user_bot_observer, # Track end-to-end response latency MetricsLogObserver(), # Log all metrics to console self._metrics_observer, # Write metrics to JSONL file ] @@ -530,28 +483,6 @@ async def on_latency_measured(observer, 
latency_seconds: float): self._metrics_observer.close() self._metrics_observer = None - # Save response latencies from UserBotLatencyObserver - try: - latencies = self._latency_measurements - latencies_file = self.output_dir / "response_latencies.json" - mean_latency = sum(latencies) / len(latencies) if latencies else 0.0 - max_latency = max(latencies) if latencies else 0.0 - - with open(latencies_file, "w") as f: - json.dump( - { - "latencies": latencies, - "mean": mean_latency, - "max": max_latency, - "count": len(latencies), - }, - f, - indent=2, - ) - logger.info(f"Saved {len(latencies)} response latencies to {latencies_file}") - except Exception as e: - logger.error(f"Error saving response latencies: {e}", exc_info=True) - logger.info("Client disconnected from assistant server") def _create_transport(self, websocket) -> FastAPIWebsocketTransport: @@ -786,112 +717,18 @@ def _current_iso_timestamp() -> str: """Return the current time as an ISO 8601 string with timezone.""" return time_now_iso8601() - def _save_wav_file(self, audio_data: bytes, file_path: Path, sample_rate: int, num_channels: int) -> None: - """Save audio data to a WAV file. 
- - Args: - audio_data: Raw audio bytes (16-bit PCM) - file_path: Path to save the WAV file - sample_rate: Sample rate in Hz - num_channels: Number of channels (1=mono, 2=stereo) - """ - try: - with wave.open(str(file_path), "wb") as wav_file: - wav_file.setnchannels(num_channels) - wav_file.setsampwidth(2) # 16-bit PCM - wav_file.setframerate(sample_rate) - wav_file.writeframes(audio_data) - logger.debug(f"Audio saved to {file_path} ({len(audio_data)} bytes)") - except Exception as e: - logger.error(f"Error saving audio to {file_path}: {e}") - - def _save_audio(self) -> None: - """Save accumulated audio to WAV file.""" - if not self._audio_buffer: - logger.warning("No audio data to save") - return - - audio_path = self.output_dir / "audio_mixed.wav" - self._save_wav_file( - bytes(self._audio_buffer), - audio_path, - self._audio_sample_rate, - 1, # Mono - ) - user_audio_path = self.output_dir / "audio_user.wav" - self._save_wav_file( - bytes(self.user_audio_buffer), - user_audio_path, - self._audio_sample_rate, - 1, # Mono - ) - assistant_audio_path = self.output_dir / "audio_assistant.wav" - self._save_wav_file( - bytes(self.assistant_audio_buffer), - assistant_audio_path, - self._audio_sample_rate, - 1, # Mono - ) - logger.info(f"Saved {len(self._audio_buffer)} bytes of audio to {audio_path}") - - async def _save_outputs(self) -> None: - """Save all outputs (audit log, audio files, etc.).""" - # Save audit log - audit_path = self.output_dir / "audit_log.json" - self.audit_log.save(audit_path) - - # Save transcript from audit log. - # When using the instrumented realtime pipeline, always overwrite the - # eagerly-written transcript.jsonl with a version derived from the - # (correctly ordered) audit log. 
- transcript_path = self.output_dir / "transcript.jsonl" - if isinstance(self.pipeline_config, SpeechToSpeechConfig): - self.audit_log.save_transcript_jsonl(transcript_path) - elif not transcript_path.exists(): - self.audit_log.save_transcript_jsonl(transcript_path) - - # Save agent performance stats + async def save_outputs(self) -> None: + """Save all outputs, with pipecat-specific additions.""" + # Save agent performance stats (pipecat-specific: AgenticSystem tracking) if self.agentic_system: try: - logger.info("Saving agent performance stats from _save_outputs()...") + logger.info("Saving agent performance stats from save_outputs()...") self.agentic_system.save_agent_perf_stats() except Exception as e: logger.error(f"Error saving agent perf stats: {e}", exc_info=True) - # Save accumulated audio files - self._save_audio() - - # Save initial and final scenario database states (REQUIRED for deterministic metrics) - try: - initial_db = self.get_initial_scenario_db() - final_db = self.get_final_scenario_db() - - initial_db_path = self.output_dir / "initial_scenario_db.json" - with open(initial_db_path, "w") as f: - json.dump(initial_db, f, indent=2, sort_keys=True, default=str) - - final_db_path = self.output_dir / "final_scenario_db.json" - with open(final_db_path, "w") as f: - json.dump(final_db, f, indent=2, sort_keys=True, default=str) - - logger.info(f"Saved scenario database states to {self.output_dir}") - except Exception as e: - logger.error(f"Error saving scenario database states: {e}", exc_info=True) - raise # Re-raise since this is now required for deterministic metrics - - logger.info(f"Outputs saved to {self.output_dir}") - - def get_conversation_stats(self) -> dict[str, Any]: - """Get statistics about the conversation.""" - return self.audit_log.get_stats() - - def get_initial_scenario_db(self) -> dict[str, Any]: - """Get initial scenario database state.""" - return self.tool_handler.original_db - - def get_final_scenario_db(self) -> dict[str, Any]: - 
"""Get final scenario database state.""" - return self.tool_handler.db + # Call base class to save audit_log, audio, scenario DBs, latencies + await super().save_outputs() async def override__maybe_trigger_user_turn_stopped(self): diff --git a/src/eva/assistant/pipeline/audio_llm_processor.py b/src/eva/assistant/pipeline/audio_llm_processor.py index 25d16a7a..296fe4c4 100644 --- a/src/eva/assistant/pipeline/audio_llm_processor.py +++ b/src/eva/assistant/pipeline/audio_llm_processor.py @@ -51,7 +51,7 @@ logger = get_logger(__name__) -# Pipeline sample rate (matches server.py SAMPLE_RATE) +# Pipeline sample rate (matches pipecat_server.py SAMPLE_RATE) PIPELINE_SAMPLE_RATE = 24000 # Minimum audio size to process (< 10ms of 24kHz 16-bit mono is noise/empty) @@ -198,7 +198,7 @@ def __init__( self._current_query_task: asyncio.Task | None = None self._interrupted = asyncio.Event() - # Optional callback for transcript saving (set by server.py) + # Optional callback for transcript saving (set by pipecat_server.py) self.on_assistant_response: Awaitable | None = None async def process_frame(self, frame: Frame, direction: FrameDirection) -> None: @@ -233,7 +233,7 @@ async def _start_interruption(self): async def process_complete_user_turn(self, text_from_aggregator: str) -> None: """Process a complete user turn with audio. - Called by the on_user_turn_stopped event handler in server.py. + Called by the on_user_turn_stopped event handler in pipecat_server.py. 
The text_from_aggregator is typically empty since there is no STT; Args: @@ -425,7 +425,7 @@ def __init__( base_url, _transcription_url_counter = _resolve_url(params, _transcription_url_counter) self._client: AsyncOpenAI = AsyncOpenAI(api_key=self._api_key, base_url=base_url) - # Callback for when transcription is ready (set by server.py) + # Callback for when transcription is ready (set by pipecat_server.py) self.on_transcription: Any | None = None # Track background transcription tasks so they can complete even during interruptions diff --git a/src/eva/assistant/pipeline/nvidia_baseten.py b/src/eva/assistant/pipeline/nvidia_baseten.py index 3a38e23b..61bdd954 100644 --- a/src/eva/assistant/pipeline/nvidia_baseten.py +++ b/src/eva/assistant/pipeline/nvidia_baseten.py @@ -24,6 +24,26 @@ def _check_nvidia_available(): ) +try: + import riva.client as riva_client # type: ignore[import-untyped] + from pipecat.services.nvidia.stt import NvidiaSTTService + from pipecat.services.nvidia.tts import NvidiaTTSService + + _NVIDIA_AVAILABLE = True +except ImportError: + riva_client = None # type: ignore[assignment] + _NVIDIA_AVAILABLE = False + NvidiaSTTService = object # type: ignore[misc] + NvidiaTTSService = object # type: ignore[misc] + + +def _check_nvidia_available(): + if not _NVIDIA_AVAILABLE: + raise ImportError( + "nvidia-riva-client is required for Baseten services. 
Install it with: pip install nvidia-riva-client" + ) + + logger = get_logger(__name__) diff --git a/src/eva/metrics/processor.py b/src/eva/metrics/processor.py index 660e7ce9..4c12fead 100644 --- a/src/eva/metrics/processor.py +++ b/src/eva/metrics/processor.py @@ -26,7 +26,7 @@ # Elevenlabs audio user field → _ProcessorContext attribute name AUDIO_ATTR = { - "pipecat_agent": "audio_timestamps_assistant_turns", + "framework_agent": "audio_timestamps_assistant_turns", "elevenlabs_user": "audio_timestamps_user_turns", } @@ -68,7 +68,7 @@ class _TurnExtractionState: # Track which turn each speaker's audio started at, so late-arriving speech transcripts land at the correct turn. last_assistant_audio_turn: int = 0 last_user_audio_turn: int = 0 - # True when pipecat_agent audio started after user audio ended, meaning any subsequent user_speech (while + # True when framework_agent audio started after user audio ended, meaning any subsequent user_speech (while # user_audio_open is False) belongs to a new speaking session and should be buffered until the next # audio_start(elevenlabs_user) sets the correct turn. assistant_responded_since_user_ended: bool = False @@ -348,7 +348,7 @@ def _handle_audio_start( _process_user_speech(buffered, state, context, conversation_trace, is_audio_native) state.buffered_user_speech.clear() - elif role == "pipecat_agent": + elif role == "framework_agent": state.assistant_audio_open = True state.last_assistant_audio_turn = state.turn_num if not state.user_audio_open: @@ -385,7 +385,7 @@ def _handle_audio_end(event: dict, state: "_TurnExtractionState") -> None: # assistant_spoke_in_turn — this prevents late audit_log/user STT chunks from advancing # (they naturally stay at the current turn). 
state.pending_advance_after_rollback = True - elif role == "pipecat_agent": + elif role == "framework_agent": state.assistant_audio_open = False @@ -501,7 +501,7 @@ def _validate_conversation_trace( def _fix_interruption_labels(context: "_ProcessorContext", state: "_TurnExtractionState") -> None: """Fix interruption labels that may have been missed during the event loop. - The audit_log/assistant entry can arrive before the interruption is detected at audio_start(pipecat_agent), + The audit_log/assistant entry can arrive before the interruption is detected at audio_start(framework_agent), so the prefix wasn't applied during the loop. Only fix the first assistant entry per interrupted turn. """ # Clean up per-entry interrupted keys (used during event loop only) diff --git a/src/eva/models/config.py b/src/eva/models/config.py index e08783bd..f626a337 100644 --- a/src/eva/models/config.py +++ b/src/eva/models/config.py @@ -331,6 +331,19 @@ class ModelDeployment(DeploymentTypedDict): description="Pipeline (STT + LLM + TTS), speech-to-speech, or audio-LLM model configuration", ) + # Framework selection + framework: Literal["pipecat", "openai_realtime", "gemini_live", "elevenlabs", "deepgram"] = Field( + "pipecat", + description=( + "Agent framework to use for the assistant server. " + "'pipecat' (default): Pipecat pipeline. " + "'openai_realtime': OpenAI Realtime API directly. " + "'gemini_live': Gemini Live API via google-genai. " + "'elevenlabs': ElevenLabs Conversational AI. " + "'deepgram': Deepgram Voice Agent API." 
+ ), + ) + # Run identifier run_id: str = Field( "timestamp and model name(s)", # Overwritten by _set_default_run_id() diff --git a/src/eva/orchestrator/worker.py b/src/eva/orchestrator/worker.py index 9f96ba2d..1f0451d2 100644 --- a/src/eva/orchestrator/worker.py +++ b/src/eva/orchestrator/worker.py @@ -7,7 +7,7 @@ from pathlib import Path from typing import Any -from eva.assistant.server import AssistantServer +from eva.assistant.base_server import AbstractAssistantServer from eva.models.agents import AgentConfig from eva.models.config import RunConfig from eva.models.record import EvaluationRecord @@ -20,6 +20,38 @@ logger = get_logger(__name__) +def _get_server_class(framework: str) -> type[AbstractAssistantServer]: + """Return the server class for the given framework name. + + Uses lazy imports to avoid importing heavy dependencies (pipecat, openai, etc.) + unless the framework is actually selected. + """ + if framework == "pipecat": + from eva.assistant.pipecat_server import PipecatAssistantServer + + return PipecatAssistantServer + elif framework == "openai_realtime": + from eva.assistant.openai_realtime_server import OpenAIRealtimeAssistantServer + + return OpenAIRealtimeAssistantServer + elif framework == "gemini_live": + from eva.assistant.gemini_live_server import GeminiLiveAssistantServer + + return GeminiLiveAssistantServer + elif framework == "elevenlabs": + from eva.assistant.elevenlabs_server import ElevenLabsAssistantServer + + return ElevenLabsAssistantServer + elif framework == "deepgram": + from eva.assistant.deepgram_server import DeepgramAssistantServer + + return DeepgramAssistantServer + else: + raise ValueError( + f"Unknown framework: {framework!r}. Supported: pipecat, openai_realtime, gemini_live, elevenlabs, deepgram" + ) + + def _percentile(sorted_data: list[float], p: float) -> float: """Calculate the p-th percentile using the nearest-rank method. 
@@ -223,7 +255,7 @@ async def run(self) -> ConversationResult: transcript_path=str(self.output_dir / "transcript.jsonl"), audit_log_path=str(self.output_dir / "audit_log.json"), conversation_log_path=str(self.output_dir / "logs.log"), - pipecat_logs_path=str(self.output_dir / "pipecat_logs.jsonl"), + pipecat_logs_path=self._resolve_framework_logs_path(), elevenlabs_logs_path=str(self.output_dir / "elevenlabs_events.jsonl"), num_turns=self._conversation_stats.get("num_turns", 0), num_tool_calls=self._conversation_stats.get("num_tool_calls", 0), @@ -234,8 +266,9 @@ async def run(self) -> ConversationResult: ) async def _start_assistant(self) -> None: - """Start the assistant server.""" - self._assistant_server = AssistantServer( + """Start the assistant server using the configured framework.""" + server_cls = _get_server_class(self.config.framework) + self._assistant_server = server_cls( current_date_time=self.record.current_date_time, pipeline_config=self.config.model, agent=self.agent, @@ -276,6 +309,14 @@ async def _run_conversation(self) -> str: return ended_reason + def _resolve_framework_logs_path(self) -> str: + """Resolve the framework/pipecat logs path, preferring framework_logs.jsonl.""" + framework_path = self.output_dir / "framework_logs.jsonl" + pipecat_path = self.output_dir / "pipecat_logs.jsonl" + if framework_path.exists(): + return str(framework_path) + return str(pipecat_path) + async def _cleanup(self) -> None: """Clean up resources.""" if self._assistant_server: diff --git a/src/eva/user_simulator/audio_interface.py b/src/eva/user_simulator/audio_interface.py index b8e0ec0b..9919e8a3 100644 --- a/src/eva/user_simulator/audio_interface.py +++ b/src/eva/user_simulator/audio_interface.py @@ -10,6 +10,7 @@ import asyncio import base64 import json +import time from collections.abc import Callable import websockets @@ -102,7 +103,7 @@ def __init__( # Track audio timing state self._user_audio_active = False # elevenlabs_user speaking - 
self._assistant_audio_active = False # pipecat_agent speaking + self._assistant_audio_active = False # framework_agent speaking self._user_audio_ended_time = None # Track when user audio ended for silence sending self._assistant_audio_ended_time = None # Track when assistant audio ended for silence sending @@ -110,6 +111,9 @@ def __init__( self._stopping = False self._send_errors_logged = 0 + # Latency tracking + self._latency_measurements: list[float] = [] + async def start_async(self) -> None: """Async initialization - connect to assistant WebSocket.""" self.running = True @@ -362,18 +366,35 @@ async def _send_catchup_silence(self, source: str, num_chunks: int) -> None: if i < num_chunks - 1: await asyncio.sleep(send_interval) - def _on_user_audio_start(self) -> None: + async def _on_user_audio_start(self) -> None: """Handle user audio starting.""" self._user_audio_active = True self._user_audio_ended_time = None + timestamp_ms = time.time() + if self._assistant_audio_ended_time is not None: silence_duration = asyncio.get_event_loop().time() - self._assistant_audio_ended_time logger.info(f"🎤 User audio START - stopping user silence after {silence_duration:.2f}s") self._assistant_audio_ended_time = None if self.event_logger: - self.event_logger.log_audio_start("elevenlabs_user") + self.event_logger.log_audio_start("elevenlabs_user", timestamp_ms) logger.info("🎤 User audio START") + # Send user_speech_start event to assistant server with timestamp + if self.websocket and self.websocket.state == WebSocketState.OPEN: + try: + await self.websocket.send( + json.dumps( + { + "event": "user_speech_start", + "conversation_id": self.conversation_id, + "timestamp_ms": str(int(round(timestamp_ms * 1000))), + } + ) + ) + except Exception as e: + logger.warning(f"Error sending user_speech_start event: {e}") + async def _on_user_audio_end(self, current_time: float) -> None: """Handle user audio ending.""" self._user_audio_ended_time = current_time @@ -390,14 +411,15 @@ async 
def _on_user_audio_end(self, current_time: float) -> None: def _on_assistant_audio_start(self) -> None: """Handle assistant audio starting.""" if self._user_audio_ended_time is not None: - silence_duration = asyncio.get_event_loop().time() - self._user_audio_ended_time - logger.info(f"✅ Assistant responded after {silence_duration:.2f}s - stopping assistant silence") + latency = asyncio.get_event_loop().time() - self._user_audio_ended_time + self._latency_measurements.append(latency) + logger.info(f"✅ Assistant responded after {latency:.2f}s - stopping assistant silence") self._user_audio_ended_time = None if self._assistant_audio_ended_time is not None: self._assistant_audio_ended_time = None self._assistant_audio_active = True if self.event_logger: - self.event_logger.log_audio_start("pipecat_agent") + self.event_logger.log_audio_start("framework_agent") logger.info("🔊 Assistant audio START") async def _on_assistant_audio_end(self) -> None: @@ -405,7 +427,7 @@ async def _on_assistant_audio_end(self) -> None: self._assistant_audio_active = False self._assistant_audio_ended_time = asyncio.get_event_loop().time() if self.event_logger: - self.event_logger.log_audio_end("pipecat_agent") + self.event_logger.log_audio_end("framework_agent") logger.info("🔊 Assistant audio END (silence detected)") # Send catch-up silence to cover the detection delay for ElevenLabs if ASSISTANT_CATCHUP_SILENCE_CHUNKS > 0: @@ -542,7 +564,7 @@ async def _receive_from_assistant(self) -> None: # Mark end of assistant audio if still active if self._assistant_audio_active and self.event_logger: self._assistant_audio_active = False - self.event_logger.log_audio_end("pipecat_agent") + self.event_logger.log_audio_end("framework_agent") logger.info("🔊 Assistant audio END (connection closed)") # Signal conversation end if disconnected while still running @@ -612,7 +634,7 @@ async def _send_to_assistant(self) -> None: if self.websocket: # Mark start of user audio on first chunk if not 
self._user_audio_active: - self._on_user_audio_start() + await self._on_user_audio_start() # Convert to μ-law and send mulaw_audio = self._convert_pcm_to_mulaw(chunk) @@ -705,3 +727,14 @@ async def _send_to_assistant(self) -> None: logger.info(f"Sent final {len(pending_audio)} bytes on shutdown") except Exception as e: logger.warning(f"Error sending final audio: {e}") + + def get_latencies(self) -> list[float]: + """Return accumulated response latency measurements. + + Latency is measured as the time between when the user stops speaking + and when the assistant's audio response begins. + + Returns: + List of latency measurements in seconds. + """ + return self._latency_measurements.copy() diff --git a/src/eva/user_simulator/client.py b/src/eva/user_simulator/client.py index ff8aa29f..6eba444d 100644 --- a/src/eva/user_simulator/client.py +++ b/src/eva/user_simulator/client.py @@ -230,6 +230,11 @@ async def _run_elevenlabs_conversation(self, api_key: str) -> str: except Exception as e: logger.warning(f"Failed to check conversation history for end_call: {e}") + try: + await self._fetch_elevenlabs_audio(conversation_id) + except Exception as e: + logger.warning(f"Failed to fetch ElevenLabs server audio: {e}") + self.event_logger.log_connection_state("session_ended", {"reason": self._end_reason}) except Exception as e: @@ -237,6 +242,24 @@ async def _run_elevenlabs_conversation(self, api_key: str) -> str: self._end_reason = "error" raise finally: + # Save response latencies from audio interface before cleanup + if self._audio_interface: + latencies = self._audio_interface.get_latencies() + if latencies: + latency_file = self.output_dir / "response_latencies.json" + with open(latency_file, "w") as f: + json.dump( + { + "latencies": latencies, + "mean": sum(latencies) / len(latencies), + "max": max(latencies), + "count": len(latencies), + }, + f, + indent=2, + ) + logger.info(f"Saved {len(latencies)} response latencies to {latency_file}") + # Grace period: keep the 
WebSocket open so the assistant pipeline # (Pipecat STT) can finish processing the last user utterance. # Observed delay from "User audio END" to "UserStoppedSpeaking" @@ -294,6 +317,26 @@ async def _check_end_call_via_api(self, conversation_id: str) -> bool: logger.warning(f"Conversation transcript still empty after {max_attempts} attempts") return False + async def _fetch_elevenlabs_audio(self, conversation_id: str) -> None: + max_attempts = 5 + delay = 2.0 + + for attempt in range(max_attempts): + try: + audio_iter = self._client.conversational_ai.conversations.audio.get(conversation_id) + audio_path = self.output_dir / "elevenlabs_audio_recording.mp3" + with open(audio_path, "wb") as f: + f.writelines(audio_iter) + logger.info(f"Saved ElevenLabs server-side audio to {audio_path}") + return + except Exception as e: + if attempt < max_attempts - 1: + logger.debug(f"Audio not yet available (attempt {attempt + 1}/{max_attempts}): {e}") + await asyncio.sleep(delay) + delay = min(delay * 2, 10.0) + else: + logger.warning(f"Failed to fetch ElevenLabs server audio after {max_attempts} attempts: {e}") + def _reset_keepalive_counter(self) -> None: """Reset the consecutive keep-alive counter on user/agent activity.""" self._consecutive_keepalive_count = 0 @@ -387,7 +430,7 @@ def _on_assistant_speaks(self, transcript: str) -> None: transcript: The text that the assistant said """ self._reset_keepalive_counter() - logger.info(f"🤖 Assistant (Pipecat): {transcript}") + logger.info(f"🤖 Assistant: {transcript}") self.event_logger.log_event( "assistant_speech", diff --git a/src/eva/user_simulator/event_logger.py b/src/eva/user_simulator/event_logger.py index c1183e2c..81a9418f 100644 --- a/src/eva/user_simulator/event_logger.py +++ b/src/eva/user_simulator/event_logger.py @@ -100,14 +100,15 @@ def log_error(self, error: str, details: dict[str, Any] | None = None) -> None: }, ) - def log_audio_start(self, role: str) -> None: + def log_audio_start(self, role: str, timestamp: float 
| None = None) -> None: """Log when audio starts for a given role. Args: - role: Either 'elevenlabs_user' or 'pipecat_agent' + role: Either 'elevenlabs_user' or 'framework_agent' + timestamp: Timestamp in milliseconds when audio started """ # Use Unix timestamp in seconds (as float) - audio_timestamp = time.time() + audio_timestamp = timestamp or time.time() # Note: For audio events, we need to store event_type and user at top level # not nested in data self._sequence += 1 @@ -125,7 +126,7 @@ def log_audio_end(self, role: str) -> None: """Log when audio ends for a given role. Args: - role: Either 'elevenlabs_user' or 'pipecat_agent' + role: Either 'elevenlabs_user' or 'framework_agent' """ # Use Unix timestamp in seconds (as float) audio_timestamp = time.time() diff --git a/src/eva/utils/conversation_checks.py b/src/eva/utils/conversation_checks.py index d284549e..d77993de 100644 --- a/src/eva/utils/conversation_checks.py +++ b/src/eva/utils/conversation_checks.py @@ -63,7 +63,9 @@ def find_records_with_llm_generic_error(output_dir: Path, record_ids: set[str] | """Find records that have the LLM generic error message in pipecat_logs.jsonl.""" affected = [] for record_id in record_ids: - pipecat_logs_path = output_dir / "records" / record_id / "pipecat_logs.jsonl" + pipecat_logs_path = output_dir / "records" / record_id / "framework_logs.jsonl" + if not pipecat_logs_path.exists(): + pipecat_logs_path = output_dir / "records" / record_id / "pipecat_logs.jsonl" if not pipecat_logs_path.exists(): continue with open(pipecat_logs_path) as f: diff --git a/tests/artifacts/records/1.1.2/elevenlabs_events.jsonl b/tests/artifacts/records/1.1.2/elevenlabs_events.jsonl index bc2c1b7e..e64b5520 100644 --- a/tests/artifacts/records/1.1.2/elevenlabs_events.jsonl +++ b/tests/artifacts/records/1.1.2/elevenlabs_events.jsonl @@ -1,61 +1,61 @@ {"timestamp": 1772403723723, "sequence": 1, "type": "connection_state", "data": {"state": "connected", "details": {"server_url": 
"ws://localhost:10000/ws"}}} {"timestamp": 1772403724132, "sequence": 2, "type": "connection_state", "data": {"state": "session_started", "details": {}}} -{"timestamp": 1772403724647, "sequence": 3, "event_type": "audio_start", "user": "pipecat_agent", "audio_timestamp": 1772403724.647605} -{"timestamp": 1772403727245, "sequence": 4, "event_type": "audio_end", "user": "pipecat_agent", "audio_timestamp": 1772403727.245576} +{"timestamp": 1772403724647, "sequence": 3, "event_type": "audio_start", "user": "framework_agent", "audio_timestamp": 1772403724.647605} +{"timestamp": 1772403727245, "sequence": 4, "event_type": "audio_end", "user": "framework_agent", "audio_timestamp": 1772403727.245576} {"timestamp": 1772403727939, "sequence": 5, "type": "assistant_speech", "data": {"text": "Hello, how can I help you today?", "source": "pipecat_assistant"}} {"timestamp": 1772403727947, "sequence": 6, "event_type": "audio_start", "user": "elevenlabs_user", "audio_timestamp": 1772403727.94754} {"timestamp": 1772403728189, "sequence": 7, "type": "user_speech", "data": {"text": "Hi, I need to change my flight to March twenty fifth.", "source": "elevenlabs_agent"}} {"timestamp": 1772403731368, "sequence": 8, "event_type": "audio_end", "user": "elevenlabs_user", "audio_timestamp": 1772403731.368855} -{"timestamp": 1772403733334, "sequence": 9, "event_type": "audio_start", "user": "pipecat_agent", "audio_timestamp": 1772403733.3349462} -{"timestamp": 1772403739691, "sequence": 10, "event_type": "audio_end", "user": "pipecat_agent", "audio_timestamp": 1772403739.691956} +{"timestamp": 1772403733334, "sequence": 9, "event_type": "audio_start", "user": "framework_agent", "audio_timestamp": 1772403733.3349462} +{"timestamp": 1772403739691, "sequence": 10, "event_type": "audio_end", "user": "framework_agent", "audio_timestamp": 1772403739.691956} {"timestamp": 1772403742595, "sequence": 11, "type": "assistant_speech", "data": {"text": "Sure. First, I'll need to pull up your reservation. 
What's your six-character confirmation number and the last name on the booking?", "source": "pipecat_assistant"}} {"timestamp": 1772403742613, "sequence": 12, "event_type": "audio_start", "user": "elevenlabs_user", "audio_timestamp": 1772403742.613776} {"timestamp": 1772403743656, "sequence": 13, "type": "user_speech", "data": {"text": "The confirmation is Z K three F F W, and the last name is Rodriguez.", "source": "elevenlabs_agent"}} {"timestamp": 1772403749434, "sequence": 14, "event_type": "audio_end", "user": "elevenlabs_user", "audio_timestamp": 1772403749.434989} -{"timestamp": 1772403751789, "sequence": 15, "event_type": "audio_start", "user": "pipecat_agent", "audio_timestamp": 1772403751.7891579} -{"timestamp": 1772403761402, "sequence": 16, "event_type": "audio_end", "user": "pipecat_agent", "audio_timestamp": 1772403761.4023108} +{"timestamp": 1772403751789, "sequence": 15, "event_type": "audio_start", "user": "framework_agent", "audio_timestamp": 1772403751.7891579} +{"timestamp": 1772403761402, "sequence": 16, "event_type": "audio_end", "user": "framework_agent", "audio_timestamp": 1772403761.4023108} {"timestamp": 1772403763780, "sequence": 17, "type": "assistant_speech", "data": {"text": "Thanks, Rodriguez. I want to make sure I have the confirmation number exactly right. Could you please say the full code again, one character at a time? It should be six letters and numbers.", "source": "pipecat_assistant"}} {"timestamp": 1772403763783, "sequence": 18, "event_type": "audio_start", "user": "elevenlabs_user", "audio_timestamp": 1772403763.783529} {"timestamp": 1772403764544, "sequence": 19, "type": "user_speech", "data": {"text": "[slow] Zulu. Kilo. Three. Foxtrot. Foxtrot. Whiskey. 
\nLast name Rodriguez.", "source": "elevenlabs_agent"}} -{"timestamp": 1772403776159, "sequence": 20, "event_type": "audio_start", "user": "pipecat_agent", "audio_timestamp": 1772403776.159976} -{"timestamp": 1772403794620, "sequence": 21, "event_type": "audio_end", "user": "pipecat_agent", "audio_timestamp": 1772403794.620748} +{"timestamp": 1772403776159, "sequence": 20, "event_type": "audio_start", "user": "framework_agent", "audio_timestamp": 1772403776.159976} +{"timestamp": 1772403794620, "sequence": 21, "event_type": "audio_end", "user": "framework_agent", "audio_timestamp": 1772403794.620748} {"timestamp": 1772403797398, "sequence": 22, "type": "assistant_speech", "data": {"text": "I've pulled up Samantha Rodriguez. Confirmation code Zulu Kilo three Foxtrot Foxtrot Whiskey. You're confirmed on Skyway Flight Sierra Kilo six twenty-one on March twentieth with C twenty-two A. To move you to March twenty-fifth, is your origin and destination staying the same as this current booking? And what time of day do you prefer? Morning, afternoon, or evening?", "source": "pipecat_assistant"}} {"timestamp": 1772403797447, "sequence": 23, "event_type": "audio_end", "user": "elevenlabs_user", "audio_timestamp": 1772403797.4478238} {"timestamp": 1772403797504, "sequence": 24, "event_type": "audio_start", "user": "elevenlabs_user", "audio_timestamp": 1772403797.504352} {"timestamp": 1772403799084, "sequence": 25, "type": "user_speech", "data": {"text": "Yeah, same route, Austin to L A X only. 
\nAfternoon, as long as I arrive by four p m Pacific.", "source": "elevenlabs_agent"}} {"timestamp": 1772403804704, "sequence": 26, "event_type": "audio_end", "user": "elevenlabs_user", "audio_timestamp": 1772403804.7048872} -{"timestamp": 1772403810133, "sequence": 27, "event_type": "audio_start", "user": "pipecat_agent", "audio_timestamp": 1772403810.133575} -{"timestamp": 1772403845347, "sequence": 28, "event_type": "audio_end", "user": "pipecat_agent", "audio_timestamp": 1772403845.347048} +{"timestamp": 1772403810133, "sequence": 27, "event_type": "audio_start", "user": "framework_agent", "audio_timestamp": 1772403810.133575} +{"timestamp": 1772403845347, "sequence": 28, "event_type": "audio_end", "user": "framework_agent", "audio_timestamp": 1772403845.347048} {"timestamp": 1772403848829, "sequence": 29, "type": "assistant_speech", "data": {"text": "For Wednesday, March twenty-fifth, Austin to Los Angeles, arriving by four PM Pacific, I have two good nonstop afternoon options. Option one is Skyway Flight Sierra Kilo seven thirty-nine. Departs one forty-five PM Austin time, arrives three PM Los Angeles time. Main cabin fare is three hundred forty dollars. Option two is Skyway Flight Sierra Kilo seven eighty-one. Departs two thirty-five PM Austin time, arrives three fifty-five PM Los Angeles time. Main cabin fare is five hundred twenty dollars. Since this is a voluntary change in main cabin, there's a seventy-five dollar change fee, plus any fare difference. 
Which option would you like me to book?", "source": "pipecat_assistant"}} {"timestamp": 1772403848829, "sequence": 30, "event_type": "audio_start", "user": "elevenlabs_user", "audio_timestamp": 1772403848.829487} {"timestamp": 1772403850219, "sequence": 31, "type": "user_speech", "data": {"text": "Before I pick, I need the total cost to change each option, all in, including the change fee and fare difference.", "source": "elevenlabs_agent"}} {"timestamp": 1772403855549, "sequence": 32, "event_type": "audio_end", "user": "elevenlabs_user", "audio_timestamp": 1772403855.5498629} -{"timestamp": 1772403860055, "sequence": 33, "event_type": "audio_start", "user": "pipecat_agent", "audio_timestamp": 1772403860.0551932} -{"timestamp": 1772403894073, "sequence": 34, "event_type": "audio_end", "user": "pipecat_agent", "audio_timestamp": 1772403894.0739589} +{"timestamp": 1772403860055, "sequence": 33, "event_type": "audio_start", "user": "framework_agent", "audio_timestamp": 1772403860.0551932} +{"timestamp": 1772403894073, "sequence": 34, "event_type": "audio_end", "user": "framework_agent", "audio_timestamp": 1772403894.0739589} {"timestamp": 1772403897950, "sequence": 35, "type": "assistant_speech", "data": {"text": "For your current main cabin ticket, you paid two hundred and sixty dollars for flight Sierra Kilo seven thirty-nine at one forty-five PM. The new fare is three hundred and forty dollars. That's an eighty dollar fare difference, plus the seventy-five dollar change fee. So the total to change is one hundred and fifty-five dollars. For flight Sierra Kilo seven eighty-one at two thirty-five PM, the new fare is five hundred and twenty dollars. That's a two hundred and sixty dollar fare difference, plus the seventy-five dollar change fee. So the total to change is three hundred and thirty-five dollars. Those totals are based on the current available fares right now. 
Which one should I switch you to?[background noise]", "source": "pipecat_assistant"}} {"timestamp": 1772403897955, "sequence": 36, "event_type": "audio_start", "user": "elevenlabs_user", "audio_timestamp": 1772403897.955505} {"timestamp": 1772403901979, "sequence": 37, "type": "user_speech", "data": {"text": "Neither of those works; my max total to change is one hundred twenty dollars. \nCan you look for any other Austin to L A X flight on March twenty-fifth that arrives by four p m Pacific, keeps it under one hundred twenty total, and has a window seat available?", "source": "elevenlabs_agent"}} {"timestamp": 1772403913317, "sequence": 38, "event_type": "audio_end", "user": "elevenlabs_user", "audio_timestamp": 1772403913.3170938} -{"timestamp": 1772403930889, "sequence": 39, "event_type": "audio_start", "user": "pipecat_agent", "audio_timestamp": 1772403930.889625} -{"timestamp": 1772403962091, "sequence": 40, "event_type": "audio_end", "user": "pipecat_agent", "audio_timestamp": 1772403962.091439} +{"timestamp": 1772403930889, "sequence": 39, "event_type": "audio_start", "user": "framework_agent", "audio_timestamp": 1772403930.889625} +{"timestamp": 1772403962091, "sequence": 40, "event_type": "audio_end", "user": "framework_agent", "audio_timestamp": 1772403962.091439} {"timestamp": 1772403964874, "sequence": 41, "type": "assistant_speech", "data": {"text": "On March twenty-fifth, to keep your total change cost at one hundred twenty dollars or less. The only option I can see is an earlier non-stop. Skyway flight Sierra Kilo seven zero three. Departs eight fifteen AM Austin time, arrives nine twenty-five AM Los Angeles time. In main cabin, the fare is three hundred dollars, so your total to change would be one hundred fifteen dollars, which is the seventy-five dollar change, fee plus a forty dollar fare difference. Window seats are showing as available. If that morning arrival works for you, I can switch you to that flight and request a window seat. 
Do you want me to book it?", "source": "pipecat_assistant"}} {"timestamp": 1772403964892, "sequence": 42, "event_type": "audio_start", "user": "elevenlabs_user", "audio_timestamp": 1772403964.8924348} {"timestamp": 1772403966378, "sequence": 43, "type": "user_speech", "data": {"text": "That works. \nPlease move me to Sierra Kilo seven zero three on March twenty-fifth, and I need a confirmed window seat, not just a request.", "source": "elevenlabs_agent"}} {"timestamp": 1772403974693, "sequence": 44, "event_type": "audio_end", "user": "elevenlabs_user", "audio_timestamp": 1772403974.69313} -{"timestamp": 1772403978431, "sequence": 45, "event_type": "audio_start", "user": "pipecat_agent", "audio_timestamp": 1772403978.431253} -{"timestamp": 1772403992311, "sequence": 46, "event_type": "audio_end", "user": "pipecat_agent", "audio_timestamp": 1772403992.311546} +{"timestamp": 1772403978431, "sequence": 45, "event_type": "audio_start", "user": "framework_agent", "audio_timestamp": 1772403978.431253} +{"timestamp": 1772403992311, "sequence": 46, "event_type": "audio_end", "user": "framework_agent", "audio_timestamp": 1772403992.311546} {"timestamp": 1772403994920, "sequence": 47, "type": "assistant_speech", "data": {"text": "I can do that. The total to change to Skyway flight Sierra Kilo seven zero three on Wednesday, March twenty-fifth is one hundred fifteen dollars, which is the seventy-five dollar change fee, plus a forty dollar fare difference. 
Do I have your okay to charge that one hundred fifteen dollars and rebook you now?", "source": "pipecat_assistant"}} {"timestamp": 1772403994942, "sequence": 48, "event_type": "audio_start", "user": "elevenlabs_user", "audio_timestamp": 1772403994.942067} {"timestamp": 1772403996248, "sequence": 49, "type": "user_speech", "data": {"text": "Yes, you have my okay to charge one hundred fifteen and rebook me on that flight, as long as you assign a specific window seat.", "source": "elevenlabs_agent"}} {"timestamp": 1772404001442, "sequence": 50, "event_type": "audio_end", "user": "elevenlabs_user", "audio_timestamp": 1772404001.442026} -{"timestamp": 1772404008294, "sequence": 51, "event_type": "audio_start", "user": "pipecat_agent", "audio_timestamp": 1772404008.294615} -{"timestamp": 1772404031788, "sequence": 52, "event_type": "audio_end", "user": "pipecat_agent", "audio_timestamp": 1772404031.788522} +{"timestamp": 1772404008294, "sequence": 51, "event_type": "audio_start", "user": "framework_agent", "audio_timestamp": 1772404008.294615} +{"timestamp": 1772404031788, "sequence": 52, "event_type": "audio_end", "user": "framework_agent", "audio_timestamp": 1772404031.788522} {"timestamp": 1772404034248, "sequence": 53, "type": "assistant_speech", "data": {"text": "Mm-hmm ... I rebooked you to Skyway Flight Sierra Kilo seven zero three on Wednesday, March twenty-fifth, Austin to Los Angeles. It departs at eight fifteen AM Austin time and arrives at nine twenty-five AM Los Angeles time. I charged a total of one hundred fifteen dollars, made up of the seventy-five dollar change fee plus a forty dollar fare difference. And you now have a confirmed window seat, seat twenty-one A. 
Your confirmation number remains Zulu Kilo three Foxtrot Foxtrot Whiskey.", "source": "pipecat_assistant"}} {"timestamp": 1772404034250, "sequence": 54, "event_type": "audio_start", "user": "elevenlabs_user", "audio_timestamp": 1772404034.250272} {"timestamp": 1772404035171, "sequence": 55, "type": "user_speech", "data": {"text": "Perfect, thanks for getting that done. \n\nCan you help me with anything else on this reservation today?", "source": "elevenlabs_agent"}} {"timestamp": 1772404039691, "sequence": 56, "event_type": "audio_end", "user": "elevenlabs_user", "audio_timestamp": 1772404039.691183} -{"timestamp": 1772404042197, "sequence": 57, "event_type": "audio_start", "user": "pipecat_agent", "audio_timestamp": 1772404042.1977642} -{"timestamp": 1772404052742, "sequence": 58, "event_type": "audio_end", "user": "pipecat_agent", "audio_timestamp": 1772404052.7429702} +{"timestamp": 1772404042197, "sequence": 57, "event_type": "audio_start", "user": "framework_agent", "audio_timestamp": 1772404042.1977642} +{"timestamp": 1772404052742, "sequence": 58, "event_type": "audio_end", "user": "framework_agent", "audio_timestamp": 1772404052.7429702} {"timestamp": 1772404056031, "sequence": 59, "type": "assistant_speech", "data": {"text": "You're welcome. Yes, I can help. Right now, you have seat twenty-one A and no checked bags or special meal on the reservation. Do you want to add a checked bag or request a special meal?", "source": "pipecat_assistant"}} {"timestamp": 1772404056046, "sequence": 60, "event_type": "audio_start", "user": "elevenlabs_user", "audio_timestamp": 1772404056.046836} {"timestamp": 1772404056898, "sequence": 61, "type": "user_speech", "data": {"text": "No, I do not want to add anything else. 
Thanks for your help, bye.", "source": "elevenlabs_agent"}} diff --git a/tests/fixtures/processor_histories.json b/tests/fixtures/processor_histories.json index 21133a90..7ab8b4c0 100644 --- a/tests/fixtures/processor_histories.json +++ b/tests/fixtures/processor_histories.json @@ -5,9 +5,9 @@ "history": [ {"timestamp_ms": 1000, "source": "pipecat", "event_type": "turn_start", "data": {}}, {"timestamp_ms": 1100, "source": "pipecat", "event_type": "tts_text", "data": {"frame": "Hello, how can I help you?"}}, - {"timestamp_ms": 1200, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 1.0}}, + {"timestamp_ms": 1200, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "framework_agent", "audio_timestamp": 1.0}}, {"timestamp_ms": 1500, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "Hello how can I help you?"}}}, - {"timestamp_ms": 2000, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 2.0}}, + {"timestamp_ms": 2000, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "framework_agent", "audio_timestamp": 2.0}}, {"timestamp_ms": 2100, "source": "pipecat", "event_type": "turn_end", "data": {}}, {"timestamp_ms": 3000, "source": "pipecat", "event_type": "turn_start", "data": {}}, {"timestamp_ms": 3100, "source": "audit_log", "event_type": "user", "data": "I need to change my flight"}, @@ -16,9 +16,9 @@ {"timestamp_ms": 4000, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "elevenlabs_user", "audio_timestamp": 4.0}}, {"timestamp_ms": 4500, "source": "pipecat", "event_type": "tts_text", "data": {"frame": "Sure let me help with that."}}, {"timestamp_ms": 4550, "source": "audit_log", "event_type": "assistant", "data": "Sure let me help with that."}, - {"timestamp_ms": 4600, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 
4.5}}, + {"timestamp_ms": 4600, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "framework_agent", "audio_timestamp": 4.5}}, {"timestamp_ms": 5000, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "Sure, let me help with that."}}}, - {"timestamp_ms": 5500, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 5.5}}, + {"timestamp_ms": 5500, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "framework_agent", "audio_timestamp": 5.5}}, {"timestamp_ms": 5600, "source": "pipecat", "event_type": "turn_end", "data": {}}, {"timestamp_ms": 6000, "source": "pipecat", "event_type": "turn_start", "data": {}}, {"timestamp_ms": 6100, "source": "audit_log", "event_type": "user", "data": "Thanks, goodbye"}, @@ -53,20 +53,20 @@ "history": [ {"timestamp_ms": 1000, "source": "pipecat", "event_type": "turn_start", "data": {}}, {"timestamp_ms": 1100, "source": "pipecat", "event_type": "tts_text", "data": {"frame": "Welcome!"}}, - {"timestamp_ms": 1200, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 1.0}}, + {"timestamp_ms": 1200, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "framework_agent", "audio_timestamp": 1.0}}, {"timestamp_ms": 1500, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "Welcome!"}}}, - {"timestamp_ms": 2000, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 2.0}}, + {"timestamp_ms": 2000, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "framework_agent", "audio_timestamp": 2.0}}, {"timestamp_ms": 2100, "source": "pipecat", "event_type": "turn_end", "data": {}}, {"timestamp_ms": 3000, "source": "pipecat", "event_type": "turn_start", "data": {}}, {"timestamp_ms": 3100, "source": "audit_log", "event_type": "user", "data": "I want to rebook"}, 
{"timestamp_ms": 3200, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "elevenlabs_user", "audio_timestamp": 3.0}}, {"timestamp_ms": 3500, "source": "elevenlabs", "event_type": "user_speech", "data": {"data": {"text": "I want to rebook my flight"}}}, - {"timestamp_ms": 3700, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 3.7}}, + {"timestamp_ms": 3700, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "framework_agent", "audio_timestamp": 3.7}}, {"timestamp_ms": 3800, "source": "pipecat", "event_type": "tts_text", "data": {"frame": "Let me look that up."}}, {"timestamp_ms": 3850, "source": "audit_log", "event_type": "assistant", "data": "Let me look that up."}, {"timestamp_ms": 3900, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "Let me look that up."}}}, {"timestamp_ms": 4000, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "elevenlabs_user", "audio_timestamp": 4.0}}, - {"timestamp_ms": 4500, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 4.5}}, + {"timestamp_ms": 4500, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "framework_agent", "audio_timestamp": 4.5}}, {"timestamp_ms": 4600, "source": "pipecat", "event_type": "turn_end", "data": {}}, {"timestamp_ms": 5000, "source": "elevenlabs", "event_type": "connection_state", "data": {"data": {"state": "session_ended"}}} ], @@ -95,9 +95,9 @@ "history": [ {"timestamp_ms": 1000, "source": "pipecat", "event_type": "turn_start", "data": {}}, {"timestamp_ms": 1100, "source": "pipecat", "event_type": "tts_text", "data": {"frame": "Hello!"}}, - {"timestamp_ms": 1200, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 1.0}}, + {"timestamp_ms": 1200, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "framework_agent", 
"audio_timestamp": 1.0}}, {"timestamp_ms": 1500, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "Hello!"}}}, - {"timestamp_ms": 2000, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 2.0}}, + {"timestamp_ms": 2000, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "framework_agent", "audio_timestamp": 2.0}}, {"timestamp_ms": 2100, "source": "pipecat", "event_type": "turn_end", "data": {}}, {"timestamp_ms": 3000, "source": "elevenlabs", "event_type": "connection_state", "data": {"data": {"state": "session_ended"}}} ], @@ -124,9 +124,9 @@ "history": [ {"timestamp_ms": 1000, "source": "pipecat", "event_type": "turn_start", "data": {}}, {"timestamp_ms": 1100, "source": "pipecat", "event_type": "tts_text", "data": {"frame": "Welcome!"}}, - {"timestamp_ms": 1200, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 1.0}}, + {"timestamp_ms": 1200, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "framework_agent", "audio_timestamp": 1.0}}, {"timestamp_ms": 1500, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "Welcome!"}}}, - {"timestamp_ms": 2000, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 2.0}}, + {"timestamp_ms": 2000, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "framework_agent", "audio_timestamp": 2.0}}, {"timestamp_ms": 2100, "source": "pipecat", "event_type": "turn_end", "data": {}}, {"timestamp_ms": 3000, "source": "pipecat", "event_type": "turn_start", "data": {}}, {"timestamp_ms": 3100, "source": "audit_log", "event_type": "user", "data": "Look up reservation ABC123"}, @@ -137,9 +137,9 @@ {"timestamp_ms": 4100, "source": "audit_log", "event_type": "tool_response", "data": {"tool": "get_reservation", "response": {"status": "success", "reservation": 
{"confirmation_number": "ABC123"}}}}, {"timestamp_ms": 4500, "source": "pipecat", "event_type": "tts_text", "data": {"frame": "I found your reservation."}}, {"timestamp_ms": 4550, "source": "audit_log", "event_type": "assistant", "data": "I found your reservation."}, - {"timestamp_ms": 4600, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 4.5}}, + {"timestamp_ms": 4600, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "framework_agent", "audio_timestamp": 4.5}}, {"timestamp_ms": 5000, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "I found your reservation."}}}, - {"timestamp_ms": 5500, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 5.5}}, + {"timestamp_ms": 5500, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "framework_agent", "audio_timestamp": 5.5}}, {"timestamp_ms": 5600, "source": "pipecat", "event_type": "turn_end", "data": {}}, {"timestamp_ms": 6000, "source": "elevenlabs", "event_type": "connection_state", "data": {"data": {"state": "session_ended"}}} ], @@ -175,10 +175,10 @@ {"timestamp_ms": 1200, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "elevenlabs_user", "audio_timestamp": 1.2}}, {"timestamp_ms": 1500, "source": "audit_log", "event_type": "tool_call", "data": {"tool": "get_reservation", "parameters": {"confirmation_number": "KOLTSF", "last_name": "Johansson"}}}, {"timestamp_ms": 1600, "source": "audit_log", "event_type": "tool_response", "data": {"tool": "get_reservation", "response": {"status": "error", "message": "Last name does not match"}}}, - {"timestamp_ms": 2000, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 2.0}}, + {"timestamp_ms": 2000, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "framework_agent", "audio_timestamp": 2.0}}, {"timestamp_ms": 
2100, "source": "pipecat", "event_type": "tts_text", "data": {"frame": "Thanks. I tried Kilo India Lima Tango Sierra Foxtrot with Johansson, and I still cannot find a match. Could you please double-check the confirmation code?"}}, {"timestamp_ms": 2150, "source": "audit_log", "event_type": "assistant", "data": "Thanks. I tried Kilo India Lima Tango Sierra Foxtrot with Johansson, and I still cannot find a match. Could you please double-check the confirmation code?"}, - {"timestamp_ms": 3500, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 3.5}}, + {"timestamp_ms": 3500, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "framework_agent", "audio_timestamp": 3.5}}, {"timestamp_ms": 3600, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "Tried Kilo India Lima Tango Sierra Foxtrot with Johansson and I still cannot find a match. Could you please double-check the confirmation code?"}}}, {"timestamp_ms": 3700, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "elevenlabs_user", "audio_timestamp": 3.7}}, {"timestamp_ms": 3800, "source": "pipecat", "event_type": "turn_end", "data": {}}, @@ -189,10 +189,10 @@ {"timestamp_ms": 4700, "source": "audit_log", "event_type": "user", "data": "And the code is... K as in kilo. as in Oscar. L as in Lima. T as in tango. S as in Sierra. 
F as in foxtrot."}, {"timestamp_ms": 4800, "source": "audit_log", "event_type": "tool_call", "data": {"tool": "get_reservation", "parameters": {"confirmation_number": "KOLTSF", "last_name": "A"}}}, {"timestamp_ms": 4900, "source": "audit_log", "event_type": "tool_response", "data": {"tool": "get_reservation", "response": {"status": "error", "message": "Last name does not match reservation KOLTSF"}}}, - {"timestamp_ms": 5200, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 5.2}}, + {"timestamp_ms": 5200, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "framework_agent", "audio_timestamp": 5.2}}, {"timestamp_ms": 5300, "source": "pipecat", "event_type": "tts_text", "data": {"frame": "Okay, I found a reservation for confirmation code Kilo Oscar Lima Tango Sierra Foxtrot, but the last name is not matching. What is the full last name on the ticket, spelled out letter by letter?"}}, {"timestamp_ms": 5350, "source": "audit_log", "event_type": "assistant", "data": "Okay, I found a reservation for confirmation code Kilo Oscar Lima Tango Sierra Foxtrot, but the last name is not matching. What is the full last name on the ticket, spelled out letter by letter?"}, - {"timestamp_ms": 6300, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 6.3}}, + {"timestamp_ms": 6300, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "framework_agent", "audio_timestamp": 6.3}}, {"timestamp_ms": 6400, "source": "pipecat", "event_type": "turn_end", "data": {}}, {"timestamp_ms": 6700, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "I found a reservation for confirmation code kilo Oscar Lima tango Sierra Foxtrot, but the last name is not matching. 
What is the full last name on the ticket spelled out letter by letter?"}}}, {"timestamp_ms": 6800, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "elevenlabs_user", "audio_timestamp": 6.8}}, @@ -201,11 +201,11 @@ {"timestamp_ms": 7200, "source": "audit_log", "event_type": "user", "data": "The last name is Johansson. J. H"}, {"timestamp_ms": 7300, "source": "audit_log", "event_type": "tool_call", "data": {"tool": "get_reservation", "parameters": {"confirmation_number": "KOLTSF", "last_name": "Johansson"}}}, {"timestamp_ms": 7400, "source": "audit_log", "event_type": "tool_response", "data": {"tool": "get_reservation", "response": {"status": "success"}}}, - {"timestamp_ms": 7600, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 7.6}}, + {"timestamp_ms": 7600, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "framework_agent", "audio_timestamp": 7.6}}, {"timestamp_ms": 7700, "source": "pipecat", "event_type": "tts_text", "data": {"frame": "Perfect, I have your reservation. You are booked Boston to Seattle on SkyWay flight eight ninety on Wednesday, August twentieth. What date would you like to move that return to, and do you want to keep an aisle seat if available?"}}, {"timestamp_ms": 7750, "source": "audit_log", "event_type": "assistant", "data": "Perfect, I have your reservation. You are booked Boston to Seattle on SkyWay flight eight ninety on Wednesday, August twentieth. 
What date would you like to move that return to, and do you want to keep an aisle seat if available?"}, {"timestamp_ms": 7800, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "elevenlabs_user", "audio_timestamp": 7.8}}, - {"timestamp_ms": 8500, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 8.5}}, + {"timestamp_ms": 8500, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "framework_agent", "audio_timestamp": 8.5}}, {"timestamp_ms": 8800, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "I have your reservation. You are booked Boston to Seattle on Skyway flight 890 on Wednesday, August 20th. What date would you like to move that return to? And do you want to keep an aisle seat if available?"}}}, {"timestamp_ms": 8900, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "elevenlabs_user", "audio_timestamp": 8.9}}, {"timestamp_ms": 9000, "source": "elevenlabs", "event_type": "user_speech", "data": {"data": {"text": "I want to move the return to August twenty third. \nAnd yes, aisle seat if you have it."}}}, @@ -215,10 +215,10 @@ {"timestamp_ms": 9400, "source": "audit_log", "event_type": "user", "data": "I want to move the return to August 23rd. 
And yes, I'll see it if you have it."}, {"timestamp_ms": 9500, "source": "audit_log", "event_type": "tool_call", "data": {"tool": "search_rebooking_options", "parameters": {"origin": "BOS", "destination": "SEA", "date": "2026-08-23", "passenger_count": 1, "fare_class": "main_cabin"}}}, {"timestamp_ms": 9600, "source": "audit_log", "event_type": "tool_response", "data": {"tool": "search_rebooking_options", "response": {"status": "success", "count": 3}}}, - {"timestamp_ms": 9800, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 9.8}}, + {"timestamp_ms": 9800, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "framework_agent", "audio_timestamp": 9.8}}, {"timestamp_ms": 9900, "source": "pipecat", "event_type": "tts_text", "data": {"frame": "On Sunday, August twenty-third, I have three nonstop options from Boston to Seattle: flight nine zero four departing twelve thirty PM, flight nine hundred departing two fifty PM, or flight nine ten departing four twenty PM. Because this is a voluntary change in Main Cabin, there is a seventy-five dollar change fee, plus any fare difference. The twelve thirty PM option is three hundred dollars, which matches what you paid for the return, so it would be seventy-five dollars total. Which flight would you like?"}}, {"timestamp_ms": 9950, "source": "audit_log", "event_type": "assistant", "data": "On Sunday, August twenty-third, I have three nonstop options from Boston to Seattle: flight nine zero four departing twelve thirty PM, flight nine hundred departing two fifty PM, or flight nine ten departing four twenty PM. Because this is a voluntary change in Main Cabin, there is a seventy-five dollar change fee, plus any fare difference. The twelve thirty PM option is three hundred dollars, which matches what you paid for the return, so it would be seventy-five dollars total. 
Which flight would you like?"}, - {"timestamp_ms": 11000, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 11.0}}, + {"timestamp_ms": 11000, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "framework_agent", "audio_timestamp": 11.0}}, {"timestamp_ms": 11200, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "On Sunday, August 23rd, I have three nonstop options from Boston to Seattle. Flight 904 departing 1230 p.m., flight 900 departing 250 p.m., or flight 910 departing 420 p.m. Because this is a voluntary change in main cabin. There is a $75 change fee plus any fare difference. The 1230 p.m. option is $300, which matches what you paid for the return, so it would be $75 total. Which flight would you like?"}}}, {"timestamp_ms": 11500, "source": "pipecat", "event_type": "turn_end", "data": {}}, {"timestamp_ms": 11700, "source": "elevenlabs", "event_type": "connection_state", "data": {"data": {"state": "session_ended"}}} @@ -262,8 +262,8 @@ "history": [ {"timestamp_ms": 1000, "source": "pipecat", "event_type": "turn_start", "data": {}}, {"timestamp_ms": 1100, "source": "pipecat", "event_type": "tts_text", "data": {"frame": "Hello, how can I help you today?"}}, - {"timestamp_ms": 1200, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 1.2}}, - {"timestamp_ms": 1500, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 1.5}}, + {"timestamp_ms": 1200, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "framework_agent", "audio_timestamp": 1.2}}, + {"timestamp_ms": 1500, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "framework_agent", "audio_timestamp": 1.5}}, {"timestamp_ms": 1600, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "Hello, how can I help you today?"}}}, {"timestamp_ms": 
1700, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "elevenlabs_user", "audio_timestamp": 1.7}}, {"timestamp_ms": 1800, "source": "elevenlabs", "event_type": "user_speech", "data": {"data": {"text": "I need to change my flight to an earlier one today."}}}, @@ -271,10 +271,10 @@ {"timestamp_ms": 2000, "source": "pipecat", "event_type": "turn_start", "data": {}}, {"timestamp_ms": 2100, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "elevenlabs_user", "audio_timestamp": 2.1}}, {"timestamp_ms": 2200, "source": "audit_log", "event_type": "user", "data": "I need to change my flight to an earlier one today."}, - {"timestamp_ms": 2500, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 2.5}}, + {"timestamp_ms": 2500, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "framework_agent", "audio_timestamp": 2.5}}, {"timestamp_ms": 2600, "source": "pipecat", "event_type": "tts_text", "data": {"frame": "Sure, I can help with that. What is your six-character confirmation code and last name?"}}, {"timestamp_ms": 2650, "source": "audit_log", "event_type": "assistant", "data": "Sure, I can help with that. What is your six-character confirmation code and last name?"}, - {"timestamp_ms": 3200, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 3.2}}, + {"timestamp_ms": 3200, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "framework_agent", "audio_timestamp": 3.2}}, {"timestamp_ms": 3400, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "Sure, I can help with that. 
What is your six-character confirmation code and last name?"}}}, {"timestamp_ms": 3500, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "elevenlabs_user", "audio_timestamp": 3.5}}, {"timestamp_ms": 3600, "source": "elevenlabs", "event_type": "user_speech", "data": {"data": {"text": "Code is six V O R J U. Last name Thompson."}}}, @@ -282,23 +282,23 @@ {"timestamp_ms": 3800, "source": "pipecat", "event_type": "turn_start", "data": {}}, {"timestamp_ms": 3900, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "elevenlabs_user", "audio_timestamp": 3.9}}, {"timestamp_ms": 4000, "source": "audit_log", "event_type": "user", "data": "Code is 6. V-O-R-J-U. Last name Thompson."}, - {"timestamp_ms": 4200, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 4.2}}, + {"timestamp_ms": 4200, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "framework_agent", "audio_timestamp": 4.2}}, {"timestamp_ms": 4300, "source": "pipecat", "event_type": "tts_text", "data": {"frame": "Thanks. Just to confirm, that is the number six, then Victor Oscar Romeo Juliet Uniform, with last name Thompson, correct?"}}, {"timestamp_ms": 4350, "source": "audit_log", "event_type": "assistant", "data": "Thanks. 
Just to confirm, that is the number six, then Victor Oscar Romeo Juliet Uniform, with last name Thompson, correct?"}, {"timestamp_ms": 4500, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "Thanks."}}}, {"timestamp_ms": 4500, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "elevenlabs_user", "audio_timestamp": 4.5}}, {"timestamp_ms": 4600, "source": "pipecat", "event_type": "turn_end", "data": {}}, {"timestamp_ms": 4700, "source": "pipecat", "event_type": "turn_start", "data": {}}, - {"timestamp_ms": 4800, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 4.8}}, + {"timestamp_ms": 4800, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "framework_agent", "audio_timestamp": 4.8}}, {"timestamp_ms": 5000, "source": "elevenlabs", "event_type": "user_speech", "data": {"data": {"text": "I want to move my LAX to SFO flight today to an earlier direct flight before two PM, if the change fee is under eighty dollars."}}}, {"timestamp_ms": 5500, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "elevenlabs_user", "audio_timestamp": 5.5}}, {"timestamp_ms": 5600, "source": "audit_log", "event_type": "user", "data": "I want to move my LAX to SFO flight today to an earlier direct flight before 2 PM, if the change fee is under $80."}, {"timestamp_ms": 5700, "source": "audit_log", "event_type": "tool_call", "data": {"tool": "get_reservation", "parameters": {"confirmation_number": "6VORJU", "last_name": "UNKNOWN"}}}, {"timestamp_ms": 5800, "source": "audit_log", "event_type": "tool_response", "data": {"tool": "get_reservation", "response": {"status": "error", "message": "Last name does not match"}}}, - {"timestamp_ms": 6000, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 6.0}}, + {"timestamp_ms": 6000, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": 
"framework_agent", "audio_timestamp": 6.0}}, {"timestamp_ms": 6100, "source": "pipecat", "event_type": "tts_text", "data": {"frame": "I can help with that, but I need to verify your identity first. What is the last name on the reservation?"}}, {"timestamp_ms": 6150, "source": "audit_log", "event_type": "assistant", "data": "I can help with that, but I need to verify your identity first. What is the last name on the reservation?"}, - {"timestamp_ms": 6800, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 6.8}}, + {"timestamp_ms": 6800, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "framework_agent", "audio_timestamp": 6.8}}, {"timestamp_ms": 7000, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "I can help with that, but I need to verify your identity first. What is the last name on the reservation?"}}}, {"timestamp_ms": 7100, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "elevenlabs_user", "audio_timestamp": 7.1}}, {"timestamp_ms": 7200, "source": "elevenlabs", "event_type": "user_speech", "data": {"data": {"text": "Last name is Thompson."}}}, @@ -310,10 +310,10 @@ {"timestamp_ms": 7800, "source": "audit_log", "event_type": "tool_response", "data": {"tool": "get_reservation", "response": {"status": "success", "reservation": {"confirmation_number": "6VORJU"}}}}, {"timestamp_ms": 7900, "source": "audit_log", "event_type": "tool_call", "data": {"tool": "search_rebooking_options", "parameters": {"origin": "LAX", "destination": "SFO", "date": "2026-06-18", "passenger_count": 1}}}, {"timestamp_ms": 8000, "source": "audit_log", "event_type": "tool_response", "data": {"tool": "search_rebooking_options", "response": {"status": "success", "count": 2}}}, - {"timestamp_ms": 8200, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 8.2}}, + {"timestamp_ms": 8200, "source": "elevenlabs", 
"event_type": "audio_start", "data": {"user": "framework_agent", "audio_timestamp": 8.2}}, {"timestamp_ms": 8300, "source": "pipecat", "event_type": "tts_text", "data": {"frame": "I found two nonstop options. Flight SK one thirty at one PM for seventy-five dollars total, or flight SK two ten at eleven AM for one hundred twenty-five dollars. Which would you prefer?"}}, {"timestamp_ms": 8350, "source": "audit_log", "event_type": "assistant", "data": "I found two nonstop options. Flight SK one thirty at one PM for seventy-five dollars total, or flight SK two ten at eleven AM for one hundred twenty-five dollars. Which would you prefer?"}, - {"timestamp_ms": 9500, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 9.5}}, + {"timestamp_ms": 9500, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "framework_agent", "audio_timestamp": 9.5}}, {"timestamp_ms": 9600, "source": "pipecat", "event_type": "turn_end", "data": {}}, {"timestamp_ms": 9800, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "I found two nonstop options. Flight SK 130 at 1 PM for seventy-five dollars total, or flight SK 210 at 11 AM for one hundred twenty-five dollars. 
Which would you prefer?"}}}, {"timestamp_ms": 10000, "source": "elevenlabs", "event_type": "connection_state", "data": {"data": {"state": "session_ended"}}} @@ -356,8 +356,8 @@ "history": [ {"timestamp_ms": 1000, "source": "pipecat", "event_type": "turn_start", "data": {}}, {"timestamp_ms": 1100, "source": "pipecat", "event_type": "tts_text", "data": {"frame": "Hello, how can I help you today?"}}, - {"timestamp_ms": 1200, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 1.2}}, - {"timestamp_ms": 1500, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 1.5}}, + {"timestamp_ms": 1200, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "framework_agent", "audio_timestamp": 1.2}}, + {"timestamp_ms": 1500, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "framework_agent", "audio_timestamp": 1.5}}, {"timestamp_ms": 1600, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "Hello, how can I help you today?"}}}, {"timestamp_ms": 1700, "source": "pipecat", "event_type": "turn_end", "data": {}}, {"timestamp_ms": 1700, "source": "pipecat", "event_type": "turn_start", "data": {}}, @@ -365,10 +365,10 @@ {"timestamp_ms": 1800, "source": "elevenlabs", "event_type": "user_speech", "data": {"data": {"text": "I need to change my flight to an earlier departure."}}}, {"timestamp_ms": 2200, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "elevenlabs_user", "audio_timestamp": 2.2}}, {"timestamp_ms": 2300, "source": "audit_log", "event_type": "user", "data": "I need to change my flight to an earlier departure."}, - {"timestamp_ms": 2500, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 2.5}}, + {"timestamp_ms": 2500, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "framework_agent", "audio_timestamp": 2.5}}, 
{"timestamp_ms": 2600, "source": "pipecat", "event_type": "tts_text", "data": {"frame": "Sure, what is your six-character confirmation code and last name on the booking?"}}, {"timestamp_ms": 2650, "source": "audit_log", "event_type": "assistant", "data": "Sure, what is your six-character confirmation code and last name on the booking?"}, - {"timestamp_ms": 3200, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 3.2}}, + {"timestamp_ms": 3200, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "framework_agent", "audio_timestamp": 3.2}}, {"timestamp_ms": 3400, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "Sure, what is your six-character confirmation code and last name on the booking?"}}}, {"timestamp_ms": 3500, "source": "pipecat", "event_type": "turn_end", "data": {}}, {"timestamp_ms": 3500, "source": "pipecat", "event_type": "turn_start", "data": {}}, @@ -378,31 +378,31 @@ {"timestamp_ms": 4300, "source": "audit_log", "event_type": "user", "data": "The confirmation code is XXF60H. And the last name is Kim."}, {"timestamp_ms": 4400, "source": "audit_log", "event_type": "tool_call", "data": {"tool": "get_reservation", "parameters": {"confirmation_number": "XXF60H", "last_name": "Kim"}}}, {"timestamp_ms": 4500, "source": "audit_log", "event_type": "tool_response", "data": {"tool": "get_reservation", "response": {"status": "error", "message": "Reservation XXF60H not found"}}}, - {"timestamp_ms": 4800, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 4.8}}, + {"timestamp_ms": 4800, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "framework_agent", "audio_timestamp": 4.8}}, {"timestamp_ms": 4900, "source": "pipecat", "event_type": "tts_text", "data": {"frame": "I was not able to find that code. 
Could you spell it out one letter at a time?"}}, {"timestamp_ms": 4950, "source": "audit_log", "event_type": "assistant", "data": "I was not able to find that code. Could you spell it out one letter at a time?"}, - {"timestamp_ms": 5800, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 5.8}}, + {"timestamp_ms": 5800, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "framework_agent", "audio_timestamp": 5.8}}, {"timestamp_ms": 6000, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "I was not able to find that code. Could you spell it out one letter at a time?"}}}, {"timestamp_ms": 6010, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "elevenlabs_user", "audio_timestamp": 6.01}}, {"timestamp_ms": 6100, "source": "pipecat", "event_type": "turn_end", "data": {}}, {"timestamp_ms": 6100, "source": "pipecat", "event_type": "turn_start", "data": {}}, {"timestamp_ms": 6200, "source": "elevenlabs", "event_type": "user_speech", "data": {"data": {"text": "[slow] It is X X F six O H, with the letter O not zero."}}}, {"timestamp_ms": 6300, "source": "audit_log", "event_type": "user", "data": "It is..."}, - {"timestamp_ms": 6500, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 6.5}}, + {"timestamp_ms": 6500, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "framework_agent", "audio_timestamp": 6.5}}, {"timestamp_ms": 6600, "source": "pipecat", "event_type": "tts_text", "data": {"frame": "I'm here. Take your time. Please say the six characters one at a time and I will confirm each one."}}, {"timestamp_ms": 6650, "source": "audit_log", "event_type": "assistant", "data": "I'm here. Take your time. 
Please say the six characters one at a time and I will confirm each one."}, - {"timestamp_ms": 7200, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 7.2}}, + {"timestamp_ms": 7200, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "framework_agent", "audio_timestamp": 7.2}}, {"timestamp_ms": 7500, "source": "pipecat", "event_type": "turn_end", "data": {}}, {"timestamp_ms": 7500, "source": "pipecat", "event_type": "turn_start", "data": {}}, {"timestamp_ms": 7800, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "elevenlabs_user", "audio_timestamp": 7.8}}, {"timestamp_ms": 7900, "source": "audit_log", "event_type": "user", "data": "X. F. 6. O. H. with the letter O. not zero."}, {"timestamp_ms": 8000, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "I'm here. Take your time. Please say the six characters one at a time and I will confirm each one."}}}, - {"timestamp_ms": 8200, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 8.2}}, + {"timestamp_ms": 8200, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "framework_agent", "audio_timestamp": 8.2}}, {"timestamp_ms": 8300, "source": "pipecat", "event_type": "tts_text", "data": {"frame": "Got it. So I have X, F, six, O, H. Let me try that with last name Kim."}}, {"timestamp_ms": 8350, "source": "audit_log", "event_type": "assistant", "data": "Got it. So I have X, F, six, O, H. 
Let me try that with last name Kim."}, {"timestamp_ms": 8400, "source": "audit_log", "event_type": "tool_call", "data": {"tool": "get_reservation", "parameters": {"confirmation_number": "XXF6OH", "last_name": "Kim"}}}, {"timestamp_ms": 8500, "source": "audit_log", "event_type": "tool_response", "data": {"tool": "get_reservation", "response": {"status": "error", "message": "Reservation XXF6OH not found"}}}, - {"timestamp_ms": 9000, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 9.0}}, + {"timestamp_ms": 9000, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "framework_agent", "audio_timestamp": 9.0}}, {"timestamp_ms": 9200, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "Got it. So I have X, F, six, O, H. Let me try that with last name Kim."}}}, {"timestamp_ms": 9300, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "elevenlabs_user", "audio_timestamp": 9.3}}, {"timestamp_ms": 9400, "source": "elevenlabs", "event_type": "user_speech", "data": {"data": {"text": "No, that is not right. It is X X F six O H."}}}, @@ -447,8 +447,8 @@ "history": [ {"timestamp_ms": 1000, "source": "pipecat", "event_type": "turn_start", "data": {}}, {"timestamp_ms": 1100, "source": "pipecat", "event_type": "tts_text", "data": {"frame": "Hello! 
How can I help you today?"}}, - {"timestamp_ms": 1200, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 1.2}}, - {"timestamp_ms": 2000, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 2.0}}, + {"timestamp_ms": 1200, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "framework_agent", "audio_timestamp": 1.2}}, + {"timestamp_ms": 2000, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "framework_agent", "audio_timestamp": 2.0}}, {"timestamp_ms": 2500, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "Hello, how can I help you today?"}}}, {"timestamp_ms": 3000, "source": "pipecat", "event_type": "turn_end", "data": {}}, {"timestamp_ms": 3000, "source": "pipecat", "event_type": "turn_start", "data": {}}, @@ -458,8 +458,8 @@ {"timestamp_ms": 3800, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "elevenlabs_user", "audio_timestamp": 3.8}}, {"timestamp_ms": 4000, "source": "pipecat", "event_type": "tts_text", "data": {"frame": "Sure, I can help with that. Could you provide your confirmation number and last name?"}}, {"timestamp_ms": 4050, "source": "audit_log", "event_type": "assistant", "data": "Sure, I can help with that. 
Could you provide your confirmation number and last name?"}, - {"timestamp_ms": 4100, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 4.1}}, - {"timestamp_ms": 5000, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 5.0}}, + {"timestamp_ms": 4100, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "framework_agent", "audio_timestamp": 4.1}}, + {"timestamp_ms": 5000, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "framework_agent", "audio_timestamp": 5.0}}, {"timestamp_ms": 5500, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "Sure, I can help with that. Could you provide your confirmation number and last name?"}}}, {"timestamp_ms": 6000, "source": "pipecat", "event_type": "turn_end", "data": {}}, {"timestamp_ms": 6000, "source": "pipecat", "event_type": "turn_start", "data": {}}, @@ -471,8 +471,8 @@ {"timestamp_ms": 7000, "source": "audit_log", "event_type": "tool_response", "data": {"tool": "get_reservation", "response": {"status": "error", "message": "Reservation VK3FFW not found"}}}, {"timestamp_ms": 7500, "source": "pipecat", "event_type": "tts_text", "data": {"frame": "It seems I couldn't find the reservation. Could you double-check the confirmation number?"}}, {"timestamp_ms": 7550, "source": "audit_log", "event_type": "assistant", "data": "It seems I couldn't find the reservation. 
Could you double-check the confirmation number?"}, - {"timestamp_ms": 7600, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 7.6}}, - {"timestamp_ms": 8500, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 8.5}}, + {"timestamp_ms": 7600, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "framework_agent", "audio_timestamp": 7.6}}, + {"timestamp_ms": 8500, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "framework_agent", "audio_timestamp": 8.5}}, {"timestamp_ms": 9000, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "It seems I couldn't find the reservation. Could you double-check the confirmation number?"}}}, {"timestamp_ms": 9500, "source": "pipecat", "event_type": "turn_end", "data": {}}, {"timestamp_ms": 10000, "source": "elevenlabs", "event_type": "connection_state", "data": {"data": {"state": "session_ended"}}} @@ -508,8 +508,8 @@ "history": [ {"timestamp_ms": 4784, "source": "pipecat", "event_type": "turn_start", "data": {}}, {"timestamp_ms": 7000, "source": "pipecat", "event_type": "llm_response", "data": {"frame": "Hello, thank you for calling SkyWay Airlines. 
How can I help you?"}}, - {"timestamp_ms": 7188, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 7.188}}, - {"timestamp_ms": 13227, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 13.227}}, + {"timestamp_ms": 7188, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "framework_agent", "audio_timestamp": 7.188}}, + {"timestamp_ms": 13227, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "framework_agent", "audio_timestamp": 13.227}}, {"timestamp_ms": 14602, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "Hello, thank you for calling Skyway Airlines. How can I help you?"}}}, {"timestamp_ms": 15013, "source": "audit_log", "event_type": "assistant", "data": "Hello, thank you for calling SkyWay Airlines. How can I help you?"}, {"timestamp_ms": 15412, "source": "pipecat", "event_type": "turn_end", "data": {}}, @@ -519,8 +519,8 @@ {"timestamp_ms": 18028, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "elevenlabs_user", "audio_timestamp": 18.028}}, {"timestamp_ms": 15414, "source": "audit_log", "event_type": "user", "data": "Hi. I need to change my flight to March 25."}, {"timestamp_ms": 19000, "source": "pipecat", "event_type": "llm_response", "data": {"frame": "Sure! 
Could you provide your confirmation number and last name?"}}, - {"timestamp_ms": 19590, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 19.59}}, - {"timestamp_ms": 30175, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 30.175}}, + {"timestamp_ms": 19590, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "framework_agent", "audio_timestamp": 19.59}}, + {"timestamp_ms": 30175, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "framework_agent", "audio_timestamp": 30.175}}, {"timestamp_ms": 32912, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "Sure! Could you provide your confirmation number and last name?"}}}, {"timestamp_ms": 32044, "source": "audit_log", "event_type": "assistant", "data": "Sure! Could you provide your confirmation number and last name?"}, {"timestamp_ms": 34059, "source": "pipecat", "event_type": "turn_end", "data": {}}, @@ -536,8 +536,8 @@ {"timestamp_ms": 66393, "source": "audit_log", "event_type": "tool_response", "data": {"tool": "search_flights", "response": {"status": "success", "count": 3}}}, {"timestamp_ms": 66396, "source": "audit_log", "event_type": "assistant", "data": "I found your reservation. You are booked on flight SK621 from Austin to Chicago."}, {"timestamp_ms": 41000, "source": "pipecat", "event_type": "llm_response", "data": {"frame": "Thank you! Let me pull up your booking.\nI found your reservation. 
You are booked on flight SK621 from Austin to Chicago."}}, - {"timestamp_ms": 41880, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 41.88}}, - {"timestamp_ms": 64809, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 64.809}}, + {"timestamp_ms": 41880, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "framework_agent", "audio_timestamp": 41.88}}, + {"timestamp_ms": 64809, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "framework_agent", "audio_timestamp": 64.809}}, {"timestamp_ms": 69138, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "Thank you. Let me pull up your booking. I found your reservation. You are booked on flight SK 621 from Austin to Chicago."}}}, {"timestamp_ms": 70314, "source": "pipecat", "event_type": "turn_end", "data": {}}, {"timestamp_ms": 70314, "source": "pipecat", "event_type": "turn_start", "data": {}}, @@ -578,26 +578,26 @@ "history": [ {"timestamp_ms": 1000, "source": "pipecat", "event_type": "turn_start", "data": {}}, {"timestamp_ms": 1100, "source": "pipecat", "event_type": "tts_text", "data": {"frame": "Hello, how can I help you?"}}, - {"timestamp_ms": 1200, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 1.0}}, - {"timestamp_ms": 1500, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 1.5}}, + {"timestamp_ms": 1200, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "framework_agent", "audio_timestamp": 1.0}}, + {"timestamp_ms": 1500, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "framework_agent", "audio_timestamp": 1.5}}, {"timestamp_ms": 1600, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "Hello, how can I help you?"}}}, {"timestamp_ms": 1700, "source": 
"pipecat", "event_type": "turn_end", "data": {}}, {"timestamp_ms": 1700, "source": "pipecat", "event_type": "turn_start", "data": {}}, {"timestamp_ms": 1710, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "elevenlabs_user", "audio_timestamp": 1.71}}, {"timestamp_ms": 1800, "source": "elevenlabs", "event_type": "user_speech", "data": {"data": {"text": "I need help with my reservation and I also need to"}}}, - {"timestamp_ms": 2200, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 2.2}}, + {"timestamp_ms": 2200, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "framework_agent", "audio_timestamp": 2.2}}, {"timestamp_ms": 2300, "source": "audit_log", "event_type": "user", "data": "I need help with my reservation and I also need to"}, {"timestamp_ms": 2400, "source": "pipecat", "event_type": "tts_text", "data": {"frame": "I'd be happy to help you with your reservation."}}, {"timestamp_ms": 2450, "source": "audit_log", "event_type": "assistant", "data": "I'd be"}, - {"timestamp_ms": 2500, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 2.5}}, + {"timestamp_ms": 2500, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "framework_agent", "audio_timestamp": 2.5}}, {"timestamp_ms": 2600, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "elevenlabs_user", "audio_timestamp": 2.6}}, {"timestamp_ms": 2700, "source": "elevenlabs", "event_type": "user_speech", "data": {"data": {"text": "change my flight date."}}}, {"timestamp_ms": 2800, "source": "audit_log", "event_type": "user", "data": "change my flight date."}, {"timestamp_ms": 2900, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "I'd be"}}}, - {"timestamp_ms": 3000, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 3.0}}, + {"timestamp_ms": 
3000, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "framework_agent", "audio_timestamp": 3.0}}, {"timestamp_ms": 3100, "source": "pipecat", "event_type": "tts_text", "data": {"frame": "Sure, what is your confirmation code?"}}, {"timestamp_ms": 3150, "source": "audit_log", "event_type": "assistant", "data": "Sure, what is your confirmation code?"}, - {"timestamp_ms": 3500, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 3.5}}, + {"timestamp_ms": 3500, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "framework_agent", "audio_timestamp": 3.5}}, {"timestamp_ms": 3600, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "Sure, what is your confirmation code?"}}}, {"timestamp_ms": 3700, "source": "pipecat", "event_type": "turn_end", "data": {}}, {"timestamp_ms": 3700, "source": "pipecat", "event_type": "turn_start", "data": {}}, @@ -633,8 +633,8 @@ "history": [ {"timestamp_ms": 1000, "source": "pipecat", "event_type": "turn_start", "data": {}}, {"timestamp_ms": 1100, "source": "pipecat", "event_type": "tts_text", "data": {"frame": "Hello, how can I help you?"}}, - {"timestamp_ms": 1200, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 1.2}}, - {"timestamp_ms": 1500, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 1.5}}, + {"timestamp_ms": 1200, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "framework_agent", "audio_timestamp": 1.2}}, + {"timestamp_ms": 1500, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "framework_agent", "audio_timestamp": 1.5}}, {"timestamp_ms": 2000, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "Hello, how can I help you?"}}}, {"timestamp_ms": 2010, "source": "elevenlabs", "event_type": "user_speech", "data": {"data": {"text": 
"I need to rebook my flight."}}}, {"timestamp_ms": 2020, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "elevenlabs_user", "audio_timestamp": 2.0}}, @@ -645,9 +645,9 @@ {"timestamp_ms": 3000, "source": "audit_log", "event_type": "tool_call", "data": {"tool": "get_reservation", "parameters": {"confirmation_number": "ABC123", "last_name": "Smith"}}}, {"timestamp_ms": 3100, "source": "audit_log", "event_type": "tool_response", "data": {"tool": "get_reservation", "response": {"status": "error", "message": "Not found"}}}, {"timestamp_ms": 3500, "source": "audit_log", "event_type": "assistant", "data": "I couldn't find that reservation. Could you double-check?"}, - {"timestamp_ms": 3600, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 3.6}}, + {"timestamp_ms": 3600, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "framework_agent", "audio_timestamp": 3.6}}, {"timestamp_ms": 3700, "source": "pipecat", "event_type": "tts_text", "data": {"frame": "I couldn't find that reservation. Could you double-check?"}}, - {"timestamp_ms": 5000, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 5.0}}, + {"timestamp_ms": 5000, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "framework_agent", "audio_timestamp": 5.0}}, {"timestamp_ms": 5500, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "I couldn't find that reservation. 
Could you double-check?"}}}, {"timestamp_ms": 5510, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "elevenlabs_user", "audio_timestamp": 5.5}}, {"timestamp_ms": 5600, "source": "pipecat", "event_type": "turn_end", "data": {}}, @@ -657,9 +657,9 @@ {"timestamp_ms": 6500, "source": "audit_log", "event_type": "user", "data": "L as in Lima"}, {"timestamp_ms": 7000, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "elevenlabs_user", "audio_timestamp": 7.0}}, {"timestamp_ms": 8000, "source": "audit_log", "event_type": "assistant", "data": "I'm sorry, could you spell that more slowly?"}, - {"timestamp_ms": 8100, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 8.1}}, + {"timestamp_ms": 8100, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "framework_agent", "audio_timestamp": 8.1}}, {"timestamp_ms": 8200, "source": "pipecat", "event_type": "tts_text", "data": {"frame": "I'm sorry, could you spell that more slowly?"}}, - {"timestamp_ms": 9000, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 9.0}}, + {"timestamp_ms": 9000, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "framework_agent", "audio_timestamp": 9.0}}, {"timestamp_ms": 9500, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "elevenlabs_user", "audio_timestamp": 9.5}}, {"timestamp_ms": 9600, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "I'm sorry, could you spell that more slowly?"}}}, {"timestamp_ms": 9700, "source": "pipecat", "event_type": "turn_end", "data": {}}, @@ -700,8 +700,8 @@ "history": [ {"timestamp_ms": 1000, "source": "pipecat", "event_type": "turn_start", "data": {}}, {"timestamp_ms": 1100, "source": "pipecat", "event_type": "tts_text", "data": {"frame": "Hello, how can I help you?"}}, - {"timestamp_ms": 1200, "source": "elevenlabs", "event_type": 
"audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 1.2}}, - {"timestamp_ms": 1500, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 1.5}}, + {"timestamp_ms": 1200, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "framework_agent", "audio_timestamp": 1.2}}, + {"timestamp_ms": 1500, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "framework_agent", "audio_timestamp": 1.5}}, {"timestamp_ms": 1600, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "Hello, how can I help you?"}}}, {"timestamp_ms": 1700, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "elevenlabs_user", "audio_timestamp": 1.7}}, {"timestamp_ms": 1800, "source": "elevenlabs", "event_type": "user_speech", "data": {"data": {"text": "I want to change both flights."}}}, @@ -710,9 +710,9 @@ {"timestamp_ms": 2100, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "elevenlabs_user", "audio_timestamp": 2.1}}, {"timestamp_ms": 2200, "source": "audit_log", "event_type": "user", "data": "I want to change both flights."}, {"timestamp_ms": 2500, "source": "audit_log", "event_type": "assistant", "data": "I found options for both flights. Which would you prefer?"}, - {"timestamp_ms": 2600, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 2.6}}, + {"timestamp_ms": 2600, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "framework_agent", "audio_timestamp": 2.6}}, {"timestamp_ms": 2700, "source": "pipecat", "event_type": "tts_text", "data": {"frame": "I found options for both flights. 
Which would you prefer?"}}, - {"timestamp_ms": 4000, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 4.0}}, + {"timestamp_ms": 4000, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "framework_agent", "audio_timestamp": 4.0}}, {"timestamp_ms": 4200, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "I found options for both flights. Which would you prefer?"}}}, {"timestamp_ms": 4300, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "elevenlabs_user", "audio_timestamp": 4.3}}, {"timestamp_ms": 4400, "source": "elevenlabs", "event_type": "user_speech", "data": {"data": {"text": "That works, main cabin please."}}}, @@ -721,9 +721,9 @@ {"timestamp_ms": 4700, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "elevenlabs_user", "audio_timestamp": 4.7}}, {"timestamp_ms": 4800, "source": "audit_log", "event_type": "user", "data": "That works. Main cabin."}, {"timestamp_ms": 5200, "source": "audit_log", "event_type": "assistant", "data": "Just to confirm before I make the changes: flight one on November third, flight two on November fifth. Shall I proceed?"}, - {"timestamp_ms": 5300, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 5.3}}, + {"timestamp_ms": 5300, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "framework_agent", "audio_timestamp": 5.3}}, {"timestamp_ms": 5400, "source": "pipecat", "event_type": "tts_text", "data": {"frame": "Just to confirm before I make the changes: flight one on November third, flight two on November fifth. 
Shall I proceed?"}}, - {"timestamp_ms": 7000, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 7.0}}, + {"timestamp_ms": 7000, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "framework_agent", "audio_timestamp": 7.0}}, {"timestamp_ms": 7100, "source": "pipecat", "event_type": "turn_end", "data": {}}, {"timestamp_ms": 7500, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "Just to confirm, flight one on November third, flight two on November fifth. Shall I proceed?"}}}, {"timestamp_ms": 7600, "source": "elevenlabs", "event_type": "user_speech", "data": {"data": {"text": "Yes, go ahead and rebook both."}}}, @@ -759,9 +759,9 @@ "history": [ {"timestamp_ms": 1000, "source": "pipecat", "event_type": "turn_start", "data": {}}, {"timestamp_ms": 1100, "source": "pipecat", "event_type": "tts_text", "data": {"frame": "Welcome!"}}, - {"timestamp_ms": 1200, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 1.0}}, + {"timestamp_ms": 1200, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "framework_agent", "audio_timestamp": 1.0}}, {"timestamp_ms": 1500, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "Welcome!"}}}, - {"timestamp_ms": 2000, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 2.0}}, + {"timestamp_ms": 2000, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "framework_agent", "audio_timestamp": 2.0}}, {"timestamp_ms": 2100, "source": "pipecat", "event_type": "turn_end", "data": {}}, {"timestamp_ms": 3000, "source": "pipecat", "event_type": "turn_start", "data": {}}, {"timestamp_ms": 3100, "source": "audit_log", "event_type": "user", "data": "I need to rebook my flight"}, @@ -770,9 +770,9 @@ {"timestamp_ms": 3800, "source": "elevenlabs", "event_type": "audio_end", "data": 
{"user": "elevenlabs_user", "audio_timestamp": 3.8}}, {"timestamp_ms": 4000, "source": "pipecat", "event_type": "tts_text", "data": {"frame": "Sure, I can help you with that."}}, {"timestamp_ms": 4050, "source": "audit_log", "event_type": "assistant", "data": "Sure, I can help you with that. Could you please provide your confirmation number and last name?"}, - {"timestamp_ms": 4100, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 4.0}}, + {"timestamp_ms": 4100, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "framework_agent", "audio_timestamp": 4.0}}, {"timestamp_ms": 4500, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "Sure, I can help you with that."}}}, - {"timestamp_ms": 5000, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 5.0}}, + {"timestamp_ms": 5000, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "framework_agent", "audio_timestamp": 5.0}}, {"timestamp_ms": 5100, "source": "pipecat", "event_type": "turn_end", "data": {}}, {"timestamp_ms": 6000, "source": "elevenlabs", "event_type": "connection_state", "data": {"data": {"state": "session_ended"}}} ], @@ -802,8 +802,8 @@ "history": [ {"timestamp_ms": 1000, "source": "pipecat", "event_type": "turn_start", "data": {}}, {"timestamp_ms": 1100, "source": "pipecat", "event_type": "llm_response", "data": {"frame": "Welcome to SkyWay Airlines!"}}, - {"timestamp_ms": 1200, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 1.0}}, - {"timestamp_ms": 1800, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 1.8}}, + {"timestamp_ms": 1200, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "framework_agent", "audio_timestamp": 1.0}}, + {"timestamp_ms": 1800, "source": "elevenlabs", "event_type": 
"audio_end", "data": {"user": "framework_agent", "audio_timestamp": 1.8}}, {"timestamp_ms": 2000, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "Welcome to SkyWay Airlines!"}}}, {"timestamp_ms": 2100, "source": "audit_log", "event_type": "assistant", "data": "Welcome to SkyWay Airlines!"}, {"timestamp_ms": 2200, "source": "pipecat", "event_type": "turn_end", "data": {}}, @@ -814,9 +814,9 @@ {"timestamp_ms": 3100, "source": "audit_log", "event_type": "user", "data": "I need to change my flight date."}, {"timestamp_ms": 3500, "source": "pipecat", "event_type": "llm_response", "data": {"frame": "Sure, I can help you change your flight."}}, {"timestamp_ms": 3550, "source": "audit_log", "event_type": "assistant", "data": "Sure, I can help you change your flight. What is your confirmation number and last name so I can pull up the reservation?"}, - {"timestamp_ms": 3600, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 3.6}}, + {"timestamp_ms": 3600, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "framework_agent", "audio_timestamp": 3.6}}, {"timestamp_ms": 4200, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "Sure, I can help you change your flight."}}}, - {"timestamp_ms": 4500, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 4.5}}, + {"timestamp_ms": 4500, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "framework_agent", "audio_timestamp": 4.5}}, {"timestamp_ms": 4600, "source": "pipecat", "event_type": "turn_end", "data": {}}, {"timestamp_ms": 5000, "source": "elevenlabs", "event_type": "connection_state", "data": {"data": {"state": "session_ended"}}} ], @@ -845,9 +845,9 @@ "history": [ {"timestamp_ms": 1000, "source": "pipecat", "event_type": "turn_start", "data": {}}, {"timestamp_ms": 1100, "source": "pipecat", "event_type": "tts_text", 
"data": {"frame": "Welcome to SkyWay Airlines!"}}, - {"timestamp_ms": 1200, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 1.0}}, + {"timestamp_ms": 1200, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "framework_agent", "audio_timestamp": 1.0}}, {"timestamp_ms": 1500, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "Welcome to SkyWay Airlines!"}}}, - {"timestamp_ms": 1800, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 1.8}}, + {"timestamp_ms": 1800, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "framework_agent", "audio_timestamp": 1.8}}, {"timestamp_ms": 1900, "source": "pipecat", "event_type": "turn_end", "data": {}}, {"timestamp_ms": 2000, "source": "pipecat", "event_type": "turn_start", "data": {}}, {"timestamp_ms": 2100, "source": "audit_log", "event_type": "user", "data": "Hi, I'm calling"}, @@ -857,9 +857,9 @@ {"timestamp_ms": 3100, "source": "audit_log", "event_type": "user", "data": "Sure, confirmation number is N53W23"}, {"timestamp_ms": 3200, "source": "pipecat", "event_type": "tts_text", "data": {"frame": "Could you please provide your last name?"}}, {"timestamp_ms": 3250, "source": "audit_log", "event_type": "assistant", "data": "Could you please provide your last name?"}, - {"timestamp_ms": 3300, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 3.2}}, + {"timestamp_ms": 3300, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "framework_agent", "audio_timestamp": 3.2}}, {"timestamp_ms": 3600, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "Could you please provide your last name?"}}}, - {"timestamp_ms": 3900, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 3.8}}, + {"timestamp_ms": 3900, 
"source": "elevenlabs", "event_type": "audio_end", "data": {"user": "framework_agent", "audio_timestamp": 3.8}}, {"timestamp_ms": 3950, "source": "pipecat", "event_type": "turn_end", "data": {}}, {"timestamp_ms": 4000, "source": "pipecat", "event_type": "turn_start", "data": {}}, {"timestamp_ms": 4100, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "elevenlabs_user", "audio_timestamp": 4.0}}, @@ -870,9 +870,9 @@ {"timestamp_ms": 5200, "source": "audit_log", "event_type": "tool_response", "data": {"tool": "get_reservation", "response": {"status": "success"}}}, {"timestamp_ms": 5500, "source": "audit_log", "event_type": "assistant", "data": "I found your reservation for Natalie Cruz."}, {"timestamp_ms": 5600, "source": "pipecat", "event_type": "tts_text", "data": {"frame": "I found your reservation for Natalie Cruz."}}, - {"timestamp_ms": 5700, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 5.6}}, + {"timestamp_ms": 5700, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "framework_agent", "audio_timestamp": 5.6}}, {"timestamp_ms": 6000, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "I found your reservation for Natalie Cruz."}}}, - {"timestamp_ms": 6300, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 6.2}}, + {"timestamp_ms": 6300, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "framework_agent", "audio_timestamp": 6.2}}, {"timestamp_ms": 6400, "source": "pipecat", "event_type": "turn_end", "data": {}}, {"timestamp_ms": 6500, "source": "pipecat", "event_type": "turn_start", "data": {}}, {"timestamp_ms": 6600, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "elevenlabs_user", "audio_timestamp": 6.5}}, @@ -881,9 +881,9 @@ {"timestamp_ms": 7300, "source": "audit_log", "event_type": "user", "data": "Yes, that is the right 
reservation."}, {"timestamp_ms": 7500, "source": "pipecat", "event_type": "tts_text", "data": {"frame": "Great, is there anything else I can help with?"}}, {"timestamp_ms": 7550, "source": "audit_log", "event_type": "assistant", "data": "Great, is there anything else I can help with?"}, - {"timestamp_ms": 7600, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 7.5}}, + {"timestamp_ms": 7600, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "framework_agent", "audio_timestamp": 7.5}}, {"timestamp_ms": 7900, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "Great, is there anything else I can help with?"}}}, - {"timestamp_ms": 8200, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 8.1}}, + {"timestamp_ms": 8200, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "framework_agent", "audio_timestamp": 8.1}}, {"timestamp_ms": 8300, "source": "pipecat", "event_type": "turn_end", "data": {}}, {"timestamp_ms": 8500, "source": "elevenlabs", "event_type": "connection_state", "data": {"data": {"state": "session_ended"}}} ], @@ -917,18 +917,18 @@ "description": "Two consecutive empty user audio bursts (ElevenLabs misses speech that Deepgram catches), followed by the user interrupting the assistant's response. The audit_log/user transcript and ElevenLabs user_speech must land at the same turn. 
Based on record 7.3.1.", "history": [ {"timestamp_ms": 1000, "source": "pipecat", "event_type": "tts_text", "data": {"frame": "Hello, how can I help you?"}}, - {"timestamp_ms": 1100, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 1.0}}, + {"timestamp_ms": 1100, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "framework_agent", "audio_timestamp": 1.0}}, {"timestamp_ms": 1500, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "Hello, how can I help you?"}}}, - {"timestamp_ms": 1800, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 1.8}}, + {"timestamp_ms": 1800, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "framework_agent", "audio_timestamp": 1.8}}, {"timestamp_ms": 2000, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "elevenlabs_user", "audio_timestamp": 2.0}}, {"timestamp_ms": 2300, "source": "elevenlabs", "event_type": "user_speech", "data": {"data": {"text": "I need help rebooking my flight."}}}, {"timestamp_ms": 2800, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "elevenlabs_user", "audio_timestamp": 2.8}}, {"timestamp_ms": 2900, "source": "audit_log", "event_type": "user", "data": "I need help rebooking my flight."}, {"timestamp_ms": 3000, "source": "pipecat", "event_type": "tts_text", "data": {"frame": "Sure. Was your flight canceled or delayed?"}}, {"timestamp_ms": 3050, "source": "audit_log", "event_type": "assistant", "data": "Sure. 
Was your flight canceled or delayed?"}, - {"timestamp_ms": 3100, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 3.0}}, + {"timestamp_ms": 3100, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "framework_agent", "audio_timestamp": 3.0}}, {"timestamp_ms": 3400, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "Sure. Was your flight canceled or delayed?"}}}, - {"timestamp_ms": 3700, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 3.7}}, + {"timestamp_ms": 3700, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "framework_agent", "audio_timestamp": 3.7}}, {"timestamp_ms": 4000, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "elevenlabs_user", "audio_timestamp": 4.0}}, {"timestamp_ms": 4300, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "elevenlabs_user", "audio_timestamp": 4.3}}, {"timestamp_ms": 4500, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "elevenlabs_user", "audio_timestamp": 4.5}}, @@ -937,18 +937,18 @@ {"timestamp_ms": 5600, "source": "audit_log", "event_type": "tool_call", "data": {"tool": "get_disruption_info", "parameters": {"flight_number": "SK340"}}}, {"timestamp_ms": 5600, "source": "audit_log", "event_type": "tool_response", "data": {"tool": "get_disruption_info", "response": {"status": "error", "error_type": "not_found"}}}, {"timestamp_ms": 5900, "source": "audit_log", "event_type": "assistant", "data": "I can not find that flight. 
Let me transfer you to an agent."}, - {"timestamp_ms": 6000, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 5.9}}, + {"timestamp_ms": 6000, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "framework_agent", "audio_timestamp": 5.9}}, {"timestamp_ms": 6100, "source": "pipecat", "event_type": "tts_text", "data": {"frame": "I can not find that flight."}}, {"timestamp_ms": 6200, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "elevenlabs_user", "audio_timestamp": 6.2}}, {"timestamp_ms": 6300, "source": "elevenlabs", "event_type": "user_speech", "data": {"data": {"text": "Yeah, my flight was canceled and I need a hotel voucher."}}}, - {"timestamp_ms": 6400, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 6.4}}, + {"timestamp_ms": 6400, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "framework_agent", "audio_timestamp": 6.4}}, {"timestamp_ms": 6500, "source": "audit_log", "event_type": "user", "data": "And I need a hotel voucher."}, {"timestamp_ms": 6700, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "elevenlabs_user", "audio_timestamp": 6.7}}, {"timestamp_ms": 6900, "source": "pipecat", "event_type": "tts_text", "data": {"frame": "I'm sorry, I encountered an error."}}, {"timestamp_ms": 6950, "source": "audit_log", "event_type": "assistant", "data": "I'm sorry, I encountered an error."}, - {"timestamp_ms": 7000, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 6.9}}, + {"timestamp_ms": 7000, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "framework_agent", "audio_timestamp": 6.9}}, {"timestamp_ms": 7300, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "I can not find that flight. 
I'm sorry, I encountered an error."}}}, - {"timestamp_ms": 7600, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 7.5}}, + {"timestamp_ms": 7600, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "framework_agent", "audio_timestamp": 7.5}}, {"timestamp_ms": 7800, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "elevenlabs_user", "audio_timestamp": 7.8}}, {"timestamp_ms": 7900, "source": "elevenlabs", "event_type": "user_speech", "data": {"data": {"text": "Okay, goodbye."}}}, {"timestamp_ms": 8100, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "elevenlabs_user", "audio_timestamp": 8.1}}, @@ -986,18 +986,18 @@ "description": "Based on record 5.1.3: Assistant audio_start overlaps with user audio by ~50ms (timing artifact, not a real barge-in). This triggers assistant_interrupted_turns and hold_turn. The user then speaks again in a NEW audio session. Tests that hold_turn does NOT prevent the new user audio_start from advancing the turn — user speaking again must always start a new turn.", "history": [ {"timestamp_ms": 1000, "source": "pipecat", "event_type": "tts_text", "data": {"frame": "Hello! 
How can I help you today?"}}, - {"timestamp_ms": 1100, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 1.0}}, + {"timestamp_ms": 1100, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "framework_agent", "audio_timestamp": 1.0}}, {"timestamp_ms": 1500, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "Hello, how can I help you today?"}}}, - {"timestamp_ms": 1800, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 1.8}}, + {"timestamp_ms": 1800, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "framework_agent", "audio_timestamp": 1.8}}, {"timestamp_ms": 2000, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "elevenlabs_user", "audio_timestamp": 2.0}}, {"timestamp_ms": 2100, "source": "elevenlabs", "event_type": "user_speech", "data": {"data": {"text": "My flight got canceled and I want a full refund."}}}, {"timestamp_ms": 2200, "source": "audit_log", "event_type": "user", "data": "My flight got cancelled and I want a full refund."}, - {"timestamp_ms": 2350, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 2.35}}, + {"timestamp_ms": 2350, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "framework_agent", "audio_timestamp": 2.35}}, {"timestamp_ms": 2400, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "elevenlabs_user", "audio_timestamp": 2.4}}, {"timestamp_ms": 2500, "source": "pipecat", "event_type": "tts_text", "data": {"frame": "I'm sorry to hear that. Could you provide your confirmation number?"}}, {"timestamp_ms": 2550, "source": "audit_log", "event_type": "assistant", "data": "I'm sorry to hear that. 
Could you provide your confirmation number?"}, - {"timestamp_ms": 3000, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 3.0}}, + {"timestamp_ms": 3000, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "framework_agent", "audio_timestamp": 3.0}}, {"timestamp_ms": 3100, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "I'm sorry to hear that. Could you provide your confirmation number?"}}}, {"timestamp_ms": 4000, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "elevenlabs_user", "audio_timestamp": 4.0}}, @@ -1009,8 +1009,8 @@ {"timestamp_ms": 4700, "source": "audit_log", "event_type": "tool_response", "data": {"tool": "get_reservation", "response": {"status": "success", "reservation": {"confirmation_number": "Z5OROH"}}}}, {"timestamp_ms": 5000, "source": "pipecat", "event_type": "tts_text", "data": {"frame": "I found your reservation. Your flight was canceled, so you are eligible for a full refund. Shall I proceed?"}}, {"timestamp_ms": 5050, "source": "audit_log", "event_type": "assistant", "data": "I found your reservation. Your flight was canceled, so you are eligible for a full refund. Shall I proceed?"}, - {"timestamp_ms": 5100, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 5.1}}, - {"timestamp_ms": 5500, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 5.5}}, + {"timestamp_ms": 5100, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "framework_agent", "audio_timestamp": 5.1}}, + {"timestamp_ms": 5500, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "framework_agent", "audio_timestamp": 5.5}}, {"timestamp_ms": 5600, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "I found your reservation. 
Your flight was canceled, so you are eligible for a full refund. Shall I proceed?"}}}, {"timestamp_ms": 6000, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "elevenlabs_user", "audio_timestamp": 6.0}}, diff --git a/tests/unit/assistant/test_audio_bridge.py b/tests/unit/assistant/test_audio_bridge.py new file mode 100644 index 00000000..117c7a22 --- /dev/null +++ b/tests/unit/assistant/test_audio_bridge.py @@ -0,0 +1,104 @@ +"""Tests for shared audio bridge utilities. + +Covers: PCM↔mulaw round-trip fidelity, PCM16 mixing with clipping, +and Twilio WebSocket protocol message round-trips. +""" + +import audioop +import json +import math +import struct + +import pytest + +from eva.assistant.audio_bridge import ( + create_twilio_media_message, + mulaw_8k_to_pcm16_16k, + mulaw_8k_to_pcm16_24k, + parse_twilio_media_message, + pcm16_16k_to_mulaw_8k, + pcm16_24k_to_mulaw_8k, + pcm16_mix, +) + + +def _generate_mulaw_tone(freq_hz: int = 440, duration_ms: int = 100) -> bytes: + sample_rate = 8000 + n_samples = sample_rate * duration_ms // 1000 + pcm_samples = [int(16000 * math.sin(2 * math.pi * freq_hz * i / sample_rate)) for i in range(n_samples)] + pcm_bytes = struct.pack(f"<{n_samples}h", *pcm_samples) + return audioop.lin2ulaw(pcm_bytes, 2) + + +def _rms(pcm_bytes: bytes) -> float: + n = len(pcm_bytes) // 2 + if n == 0: + return 0.0 + samples = struct.unpack(f"<{n}h", pcm_bytes) + return math.sqrt(sum(s * s for s in samples) / n) + + +class TestAudioConversionRoundTrip: + def test_mulaw_8k_pcm16_24k_round_trip(self): + """Mulaw 8k -> pcm16 24k -> mulaw 8k preserves signal energy.""" + original = _generate_mulaw_tone(440, 100) + + pcm_24k = mulaw_8k_to_pcm16_24k(original) + recovered = pcm16_24k_to_mulaw_8k(pcm_24k) + + assert len(recovered) == len(original) + + orig_pcm = audioop.ulaw2lin(original, 2) + recov_pcm = audioop.ulaw2lin(recovered, 2) + orig_rms = _rms(orig_pcm) + recov_rms = _rms(recov_pcm) + assert orig_rms > 0 + assert recov_rms / 
orig_rms == pytest.approx(1.0, abs=0.15) + + def test_mulaw_8k_pcm16_16k_round_trip(self): + """Mulaw 8k -> pcm16 16k -> mulaw 8k preserves signal energy.""" + original = _generate_mulaw_tone(440, 100) + + pcm_16k = mulaw_8k_to_pcm16_16k(original) + recovered = pcm16_16k_to_mulaw_8k(pcm_16k) + + assert len(recovered) == len(original) + + orig_pcm = audioop.ulaw2lin(original, 2) + recov_pcm = audioop.ulaw2lin(recovered, 2) + orig_rms = _rms(orig_pcm) + recov_rms = _rms(recov_pcm) + assert orig_rms > 0 + assert recov_rms / orig_rms == pytest.approx(1.0, abs=0.15) + + +class TestPcm16Mix: + def test_adds_samples_and_clips_at_int16_boundaries(self): + """Sample-wise addition with clipping; shorter track is zero-padded.""" + track_a = struct.pack("<2h", 30000, -30000) + track_b = struct.pack("<2h", 10000, -10000) + + mixed = pcm16_mix(track_a, track_b) + result = struct.unpack("<2h", mixed) + assert result == (32767, -32768) + + short_track = struct.pack("<1h", 5000) + long_track = struct.pack("<2h", 100, 200) + mixed = pcm16_mix(short_track, long_track) + result = struct.unpack("<2h", mixed) + assert result == (5100, 200) + + +class TestTwilioProtocol: + def test_create_and_parse_round_trip(self): + """create_twilio_media_message -> parse_twilio_media_message recovers bytes.""" + audio = b"\x80\x90\xa0\xb0\xc0" + msg = create_twilio_media_message("stream-1", audio) + recovered = parse_twilio_media_message(msg) + assert recovered == audio + + parsed = json.loads(msg) + assert parsed["streamSid"] == "stream-1" + + assert parse_twilio_media_message(json.dumps({"event": "start"})) is None + assert parse_twilio_media_message("not json at all {{{") is None diff --git a/tests/unit/assistant/test_server.py b/tests/unit/assistant/test_pipecat_server.py similarity index 96% rename from tests/unit/assistant/test_server.py rename to tests/unit/assistant/test_pipecat_server.py index 6b2a9f14..effdae64 100644 --- a/tests/unit/assistant/test_server.py +++ 
b/tests/unit/assistant/test_pipecat_server.py @@ -1,4 +1,4 @@ -"""Tests for AssistantServer.""" +"""Tests for PipecatAssistantServer.""" import asyncio import json @@ -9,12 +9,12 @@ import pytest from eva.assistant.agentic.audit_log import AuditLog -from eva.assistant.server import SAMPLE_RATE, AssistantServer +from eva.assistant.pipecat_server import SAMPLE_RATE, PipecatAssistantServer def _make_server(tmp_path: Path): - """Build a lightweight AssistantServer without invoking __init__ (avoids Pipecat I/O).""" - srv = object.__new__(AssistantServer) + """Build a lightweight PipecatAssistantServer without invoking __init__ (avoids Pipecat I/O).""" + srv = object.__new__(PipecatAssistantServer) srv.output_dir = tmp_path srv.audit_log = AuditLog() srv.agentic_system = None @@ -157,7 +157,7 @@ async def test_saves_audit_log_and_both_scenario_db_snapshots(self, tmp_path): # Add an entry so audit_log is non-trivial srv.audit_log.append_user_input("Hello") - await srv._save_outputs() + await srv.save_outputs() # Audit log contains our entry audit = json.loads((tmp_path / "audit_log.json").read_text()) @@ -181,7 +181,7 @@ async def test_saves_agent_perf_stats_when_agentic_system_present(self, tmp_path mock_system = MagicMock() srv.agentic_system = mock_system - await srv._save_outputs() + await srv.save_outputs() mock_system.save_agent_perf_stats.assert_called_once() diff --git a/tests/unit/user_simulator/test_audio_interface.py b/tests/unit/user_simulator/test_audio_interface.py index 8ae1bb7e..4b882c87 100644 --- a/tests/unit/user_simulator/test_audio_interface.py +++ b/tests/unit/user_simulator/test_audio_interface.py @@ -111,7 +111,8 @@ def test_idle_state_no_silence_sent(self): class TestAudioStateTransitions: """Test that audio start/end callbacks correctly update state and timestamps.""" - def test_user_start_clears_assistant_ended_time(self): + @pytest.mark.asyncio + async def test_user_start_clears_assistant_ended_time(self): """When user starts speaking,
we stop waiting for assistant.""" event_logger = MagicMock() iface = _make_interface(event_logger=event_logger) @@ -119,7 +120,7 @@ def test_user_start_clears_assistant_ended_time(self): with patch("eva.user_simulator.audio_interface.asyncio.get_event_loop") as mock_loop: mock_loop.return_value.time.return_value = 100.0 - iface._on_user_audio_start() + await iface._on_user_audio_start() assert iface._user_audio_active is True assert iface._user_audio_ended_time is None @@ -198,7 +199,7 @@ async def ws_messages(): # _on_assistant_audio_start was called (audio_start logged) # Note: _assistant_audio_active is reset in the finally block on disconnect, # so we verify the start event was logged instead - event_logger.log_audio_start.assert_called_once_with("pipecat_agent") + event_logger.log_audio_start.assert_called_once_with("framework_agent") @pytest.mark.asyncio async def test_empty_payload_ignored(self): @@ -279,7 +280,7 @@ async def ws_messages(): await iface._receive_from_assistant() assert iface._assistant_audio_active is False - event_logger.log_audio_end.assert_called_once_with("pipecat_agent") + event_logger.log_audio_end.assert_called_once_with("framework_agent") class TestSendAudioFrame: diff --git a/tests/unit/user_simulator/test_event_logger.py b/tests/unit/user_simulator/test_event_logger.py index 188911a1..c3ba735b 100644 --- a/tests/unit/user_simulator/test_event_logger.py +++ b/tests/unit/user_simulator/test_event_logger.py @@ -92,10 +92,10 @@ def test_log_audio_start_structure(self, logger): assert "data" not in event def test_log_audio_end_structure(self, logger): - logger.log_audio_end("pipecat_agent") + logger.log_audio_end("framework_agent") event = logger._events[0] assert event["event_type"] == "audio_end" - assert event["user"] == "pipecat_agent" + assert event["user"] == "framework_agent" assert isinstance(event["audio_timestamp"], float) def test_save_creates_jsonl(self, logger):