Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added assets/noise/airport_gate.wav
Binary file not shown.
Binary file added assets/noise/baby_crying.wav
Binary file not shown.
Binary file added assets/noise/background_music.wav
Binary file not shown.
Binary file added assets/noise/coffee_shop.wav
Binary file not shown.
Binary file added assets/noise/loud_construction.wav
Binary file not shown.
Binary file added assets/noise/nyc_street.wav
Binary file not shown.
Binary file added assets/noise/road_noise.wav
Binary file not shown.
24 changes: 24 additions & 0 deletions configs/user_behaviors.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
default:
You're direct and to the point—you don't have time for lengthy explanations or unnecessary back-and-forth.
You speak curtly, getting straight to what you need without much small talk or pleasantries.
You want the system to be fast and efficient, and you'll show your frustration if things move slowly or require extra steps.

elderly_slow:
You are elderly and have difficulty understanding fast speech. You speak extremely slowly, with frequent deliberate pauses.
You occasionally ask the agent to repeat themselves slowly. You do not rush.
You frequently use ellipses (...) in your output to indicate pauses.
Ex. ("Ok yes... my confirmation code is... W... K... 2... E... X... B...")

aggressive_impatient:
You are impatient and easily frustrated when the agent does not resolve your requests immediately.
You speak very quickly and often interrupt the agent mid-sentence when they are talking for too long, to make your frustration clear and ask them to hurry it up.
Express your frustration whenever progress is not being made and remember to interrupt often.
You frequently output words in all caps to indicate your frustration and add emphasis.

forgetful_disorganized:
You are forgetful and prone to disfluencies (um..., uh..., huh..., let me think..., hold on a second..., let me find that piece of information..., etc).
You frequently use ellipses (...) in your output to indicate pauses.
You often forget the information you need and have to search for it mid-conversation.
Simple things like your name and date of birth you remember easily, but for any specific codes and IDs you need a couple of seconds to find it.
You often lose your train of thought and need a moment to remember what you were saying.
You also make mistakes when you speak and have to repeat yourself (ex. "hmm yeah one second... let me find that... ok its A E 2 B oh wait sorry actually its A F 2 B")
79 changes: 79 additions & 0 deletions src/eva/models/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import copy
import logging
from datetime import UTC, datetime
from enum import StrEnum
from pathlib import Path
from typing import Annotated, Any, ClassVar, Literal

Expand Down Expand Up @@ -239,6 +240,75 @@ def _strip_other_mode_fields(data: dict) -> dict:
return {k: v for k, v in data.items() if k in _PIPELINE_FIELDS}


class BackgroundNoiseType(StrEnum):
    """Ambient noise type mixed into user audio (speech and silence).

    Each value (except ``bad_connection_static``) matches a WAV file shipped
    under ``assets/noise/``; ``PerturbationConfig.snr_db`` controls how loudly
    file-based noise is mixed in. ``bad_connection_static`` has no bundled
    WAV — presumably it is synthesized; verify against AudioPerturbator.
    """

    coffee_shop = "coffee_shop"
    airport_gate = "airport_gate"
    bad_connection_static = "bad_connection_static"
    road_noise = "road_noise"
    nyc_street = "nyc_street"
    background_music = "background_music"
    loud_construction = "loud_construction"
    baby_crying = "baby_crying"


class AccentType(StrEnum):
    """Accent variant — selects a different ElevenLabs agent ID for the user simulator.

    The value is upper-cased into the agent-ID env var name
    ``EVA_{ACCENT}_ACCENT_USER_{F|M}`` (e.g. ``EVA_FRENCH_ACCENT_USER_F``).
    """

    french = "french"
    indian = "indian"
    spanish = "spanish"
    chinese = "chinese"


class BehaviorType(StrEnum):
    """User behavior variant — modifies persona prompt and selects a different agent ID.

    Each value keys into ``configs/user_behaviors.yaml`` for the replacement
    persona prompt, and is upper-cased into the agent-ID env var name
    ``EVA_{BEHAVIOR}_USER_{F|M}`` (e.g. ``EVA_ELDERLY_SLOW_USER_M``).
    """

    elderly_slow = "elderly_slow"
    aggressive_impatient = "aggressive_impatient"
    forgetful_disorganized = "forgetful_disorganized"


class PerturbationConfig(BaseModel):
    """Perturbations applied to the simulated user during a benchmark run.

    Four axes (accent and behavior are mutually exclusive — enforced by the
    validator below; the other axes combine freely):
    - background_noise: ambient audio mixed into user speech and silence
    - accent: uses accent-specific ElevenLabs agent IDs (mutually exclusive with behavior)
    - behavior: modifies persona prompt + uses behavior-specific agent IDs (mutually exclusive with accent)
    - connection_degradation: stacks codec artifacts, packet loss, and volume fluctuation on top

    Agent ID env vars: EVA_{BEHAVIOR}_USER_{F|M} for behaviors,
    EVA_{ACCENT}_ACCENT_USER_{F|M} for accents.
    Default (no accent/behavior): EVA_DEFAULT_USER_F and EVA_DEFAULT_USER_M.
    """

    # Reject unknown keys so typos in env vars / CLI options fail loudly.
    model_config = ConfigDict(extra="forbid")

    background_noise: BackgroundNoiseType | None = Field(
        None,
        description="Ambient noise type to mix into user audio",
    )
    # Only meaningful when background_noise selects a file-based type.
    snr_db: float = Field(
        15.0,
        description="Signal-to-noise ratio in dB for file-based background noise (higher = cleaner)",
    )
    accent: AccentType | None = Field(None, description="Accent variant for the user simulator voice")
    behavior: BehaviorType | None = Field(None, description="User behavior variant (modifies persona + agent ID)")
    connection_degradation: bool = Field(
        False,
        description="Apply VoIP degradation (codec artifacts, packet loss, volume fluctuation) on top of other perturbations",
    )

    @model_validator(mode="after")
    def _validate_exclusivity(self) -> "PerturbationConfig":
        # Accent and behavior each claim the single ElevenLabs agent-ID slot,
        # so they cannot be combined.
        if self.accent is not None and self.behavior is not None:
            raise ValueError(
                "accent and behavior cannot both be set — they each require exclusive use of the ElevenLabs agent ID"
            )
        return self


# Discriminated union so Pydantic picks the right config type from env vars / CLI
ModelConfigUnion = Annotated[
Annotated[PipelineConfig, Tag("pipeline")]
Expand Down Expand Up @@ -399,6 +469,15 @@ class ModelDeployment(DeploymentTypedDict):
description="Recompute EVA aggregate scores from existing metrics.json files without re-running judges",
)

perturbation: PerturbationConfig | None = Field(
None,
description=(
"Perturbations applied to the simulated user. "
"Example: EVA_PERTURBATION__BACKGROUND_NOISE=coffee_shop EVA_PERTURBATION__ACCENT=french. "
"See PerturbationConfig for all options."
),
)

# Debug and filtering
debug: bool = Field(
False,
Expand Down
1 change: 1 addition & 0 deletions src/eva/orchestrator/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,7 @@ async def _start_user_simulator(self) -> None:
server_url=f"ws://localhost:{self.port}/ws",
output_dir=self.output_dir,
user_simulator_context=self.agent.user_simulator_context,
perturbation_config=self.config.perturbation,
)

async def _run_conversation(self) -> str:
Expand Down
12 changes: 10 additions & 2 deletions src/eva/user_simulator/audio_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@

from elevenlabs.conversational_ai.conversation import AudioInterface

from eva.user_simulator.perturbation import AudioPerturbator
from eva.utils.logging import get_logger

logger = get_logger(__name__)
Expand Down Expand Up @@ -74,6 +75,7 @@ def __init__(
record_callback: Callable[[str, bytes], None] | None = None,
event_logger=None,
conversation_done_callback: Callable[[str], None] | None = None,
perturbator: AudioPerturbator | None = None,
):
"""Initialize the audio interface.

Expand All @@ -83,12 +85,14 @@ def __init__(
record_callback: Optional callback for recording audio (source, data)
event_logger: Optional ElevenLabsEventLogger for logging audio timing
conversation_done_callback: Optional callback for signaling conversation end
perturbator: Optional perturbator to apply to user audio before sending
"""
self.websocket_uri = websocket_uri
self.conversation_id = conversation_id
self.record_callback = record_callback
self.event_logger = event_logger
self.conversation_done_callback = conversation_done_callback
self._perturbator = perturbator

self.websocket = None
self.running = False
Expand Down Expand Up @@ -231,6 +235,8 @@ def output(self, audio: bytes) -> None:
"""
if self.running:
try:
if self._perturbator is not None:
audio = self._perturbator.apply(audio)
self.send_queue.put_nowait(audio)
# Record user audio
if self.record_callback:
Expand Down Expand Up @@ -337,8 +343,10 @@ async def _send_silence_frame(self, chunk_size: int = SEND_CHUNK_SIZE_PCM) -> bo
Returns:
True if silence was sent, False otherwise
"""
# Create PCM silence and convert to μ-law
silence_pcm = b"\x00" * chunk_size
if self._perturbator is not None and self._perturbator.has_ambient_noise:
silence_pcm = self._perturbator.get_ambient_chunk(chunk_size)
else:
silence_pcm = b"\x00" * chunk_size
silence_mulaw = self._convert_pcm_to_mulaw(silence_pcm)

if not silence_mulaw:
Expand Down
48 changes: 44 additions & 4 deletions src/eva/user_simulator/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,22 +7,35 @@
import asyncio
import json
import os
from functools import lru_cache
from pathlib import Path

import httpx
import yaml
from elevenlabs.client import ElevenLabs
from elevenlabs.conversational_ai.conversation import (
Conversation,
ConversationInitiationData,
)

from eva.models.config import PerturbationConfig
from eva.user_simulator.audio_interface import BotToBotAudioInterface
from eva.user_simulator.event_logger import ElevenLabsEventLogger
from eva.user_simulator.perturbation import AudioPerturbator
from eva.utils.logging import get_logger
from eva.utils.prompt_manager import PromptManager

logger = get_logger(__name__)

_BEHAVIORS_PATH = Path(__file__).parent.parent.parent.parent / "configs" / "user_behaviors.yaml"
_PERSONA_GENDER = {1: "F", 2: "M"}


@lru_cache(maxsize=1)
def _load_behavior_prompts() -> dict[str, str]:
    """Load the behavior-name -> persona-prompt mapping from user_behaviors.yaml.

    Cached for the process lifetime, so the config file is read at most once.

    Returns:
        Mapping from behavior key (e.g. "default", "elderly_slow") to the
        persona prompt text for that behavior.
    """
    # Explicit UTF-8: the prompts contain non-ASCII characters (ellipses,
    # dashes) that would break under a platform-dependent default encoding.
    with open(_BEHAVIORS_PATH, encoding="utf-8") as f:
        return yaml.safe_load(f)


class UserSimulator:
"""ElevenLabs-based user simulator that connects to the assistant.
Expand All @@ -42,6 +55,7 @@ def __init__(
output_dir: Path,
timeout: int = 600,
user_simulator_context: str = "",
perturbation_config: PerturbationConfig | None = None,
):
"""Initialize the user simulator.

Expand All @@ -53,6 +67,7 @@ def __init__(
output_dir: Directory for output files
timeout: Conversation timeout in seconds
user_simulator_context: Domain-specific context line from agent config
perturbation_config: Optional perturbation to apply to user audio
"""
self.persona_config = persona_config
self.goal = goal
Expand All @@ -61,6 +76,13 @@ def __init__(
self.timeout = timeout
self.current_date_time = current_date_time
self.user_simulator_context = user_simulator_context
self._perturbation_config = perturbation_config
self._perturbator = (
AudioPerturbator(perturbation_config)
if perturbation_config is not None
and (perturbation_config.background_noise is not None or perturbation_config.connection_degradation)
else None
)

# State
self._conversation = None
Expand Down Expand Up @@ -132,6 +154,7 @@ async def _run_elevenlabs_conversation(self, api_key: str) -> str:
record_callback=self._record_audio,
event_logger=self.event_logger,
conversation_done_callback=self._on_conversation_end,
perturbator=self._perturbator,
)

# Start the audio interface WebSocket connection
Expand All @@ -147,7 +170,14 @@ async def _run_elevenlabs_conversation(self, api_key: str) -> str:
httpx_client=http_client,
)

# Build the user simulation prompt
# TODO: test and improve behavior prompts to more closely match desired user behavior
behavior_prompts = _load_behavior_prompts()
if self._perturbation_config and self._perturbation_config.behavior:
behavior_key = self._perturbation_config.behavior.value
user_persona = behavior_prompts[behavior_key]
else:
user_persona = behavior_prompts["default"]

prompt = PromptManager().get_prompt(
"user_simulator.system_prompt",
user_simulator_context=self.user_simulator_context,
Expand All @@ -160,7 +190,7 @@ async def _run_elevenlabs_conversation(self, api_key: str) -> str:
failure_condition=self.goal["decision_tree"]["failure_condition"],
edge_cases=self.goal["decision_tree"]["edge_cases"],
information_required=self.goal["information_required"],
user_persona=self.persona_config["user_persona"],
user_persona=user_persona,
starting_utterance=self.goal["starting_utterance"],
current_date_time=self.current_date_time,
)
Expand All @@ -170,11 +200,21 @@ async def _run_elevenlabs_conversation(self, api_key: str) -> str:

# ElevenLabs user simulator agent ID
persona_id = self.persona_config["user_persona_id"]
ELEVENLABS_USER_AGENT_ID = os.getenv(f"ELEVENLABS_USER_AGENT_ID_USER_PERSONA_{persona_id}")
gender = _PERSONA_GENDER[persona_id]
if self._perturbation_config and self._perturbation_config.accent:
key = self._perturbation_config.accent.value.upper()
env_var = f"EVA_{key}_ACCENT_USER_{gender}"
elif self._perturbation_config and self._perturbation_config.behavior:
key = self._perturbation_config.behavior.value.upper()
env_var = f"EVA_{key}_USER_{gender}"
else:
env_var = f"EVA_DEFAULT_USER_{gender}"
ELEVENLABS_USER_AGENT_ID = os.getenv(env_var)
logger.info(f"Using agent ID from env var: {env_var}")

# Create the conversation
if not ELEVENLABS_USER_AGENT_ID:
raise ValueError(f"Missing elevenlabs agent ID environment variable for user persona {persona_id}")
raise ValueError(f"Missing ElevenLabs agent ID environment variable: {env_var}")

self._client = client

Expand Down
Loading