ServiceNow · raghavm243512 · May 18, 2026 · May 18, 2026 · May 19, 2026 · May 20, 2026
diff --git a/.env.example b/.env.example
@@ -58,6 +58,11 @@ AWS_SECRET_ACCESS_KEY=your_aws_secret_access_key_here
 # Voice Pipeline
 # ==============================================
 
+#i LLM model alias for the assistant. Must match a model_name in EVA_MODEL_LIST.
+#d enum
+#x pipeline_mode=LLM
+EVA_MODEL__LLM=gpt-5.2
+
 # Pipeline mode is controlled by the UI radio (LLM / S2S / AudioLLM).
 # The #x conditions below ensure each variable is only active for the right mode.
 
@@ -73,16 +78,16 @@ EVA_MODEL__STT=cartesia
 #x pipeline_mode=LLM
 EVA_MODEL__STT_PARAMS='{"api_key": "your_cartesia_api_key", "model": "ink-whisper"}'
 
-# --- LLM mode: TTS ---
+# --- TTS (LLM and AudioLLM modes) ---
 #i TTS provider for the voice pipeline.
 #d enum
 #e cartesia,chatterbox,elevenlabs,gemini,kokoro,nvidia-baseten,openai,xtts
-#x pipeline_mode=LLM
+#x pipeline_mode=LLM,AudioLLM
 EVA_MODEL__TTS=cartesia
 
 #i TTS provider parameters. Must include "api_key" and "model". Use "urls" for round-robin load balancing.
 #d json_object
-#x pipeline_mode=LLM
+#x pipeline_mode=LLM,AudioLLM
 EVA_MODEL__TTS_PARAMS='{"api_key": "your_cartesia_api_key", "model": "sonic"}'
 
 # --- S2S mode ---
@@ -150,11 +155,6 @@ EVA_MODEL_LIST='[
   }
 ]'
 
-#i LLM model alias for the assistant. Must match a model_name in EVA_MODEL_LIST.
-#d enum
-#x pipeline_mode=LLM
-EVA_MODEL__LLM=gpt-5.2
-
 # ==============================================
 # Framework & Runtime
 # ==============================================
@@ -234,6 +234,50 @@ EVA_MODEL__LLM=gpt-5.2
 # User Config
 # ==============================================
 
+# --- Language (mutually exclusive with Accent and Behavior) ---
+#i ISO 639-1 language code for the user simulator. Datasets must exist for the selected language. Pattern for the agent ID pairs below: EVA_{LANG}_USER_{F|M}.
+#d enum
+#e en,fr,fr-CA,fr-ca
+#x perturbation_mode=Language
+#v EVA_LANGUAGE=en
+
+# --- Language agent IDs ---
+#i ElevenLabs agent ID — Canadian French, female voice.
+#d string
+#x perturbation_mode=Language
+#x EVA_LANGUAGE=fr-CA
+#v EVA_FR_CA_USER_F=
+
+#i ElevenLabs agent ID — Canadian French, male voice.
+#d string
+#x perturbation_mode=Language
+#x EVA_LANGUAGE=fr-CA
+#v EVA_FR_CA_USER_M=
+
+#i ElevenLabs agent ID — European French, female voice.
+#d string
+#x perturbation_mode=Language
+#x EVA_LANGUAGE=fr
+#v EVA_FR_USER_F=
+
+#i ElevenLabs agent ID — European French, male voice.
+#d string
+#x perturbation_mode=Language
+#x EVA_LANGUAGE=fr
+#v EVA_FR_USER_M=
+
+#i ElevenLabs agent ID — English, female voice.
+#d string
+#x perturbation_mode=Language
+#x EVA_LANGUAGE=en
+#v EVA_EN_USER_F=
+
+#i ElevenLabs agent ID — English, male voice.
+#d string
+#x perturbation_mode=Language
+#x EVA_LANGUAGE=en
+#v EVA_EN_USER_M=
+
 # --- Default user simulator agents ---
 #i ElevenLabs agent ID for the default female-voice user persona.
 #d string

diff --git a/Dockerfile b/Dockerfile
@@ -14,16 +14,19 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     git \
     && rm -rf /var/lib/apt/lists/*
 
-# Copy dependency files and source code
+# Install dependencies — this layer stays cached as long as pyproject.toml and README.md don't change
 COPY pyproject.toml README.md ./
-COPY src/ ./src/
-
-# Install dependencies into a virtual environment
+# Stub src so hatchling can resolve the package during dep install
+RUN mkdir -p src/eva && echo '__version__ = "0.0.0"' > src/eva/__init__.py
 RUN python -m venv /opt/venv
 ENV PATH="/opt/venv/bin:$PATH"
 RUN pip install --no-cache-dir --upgrade pip && \
     pip install --no-cache-dir .
 
+# Copy real source and reinstall only the package (deps already cached above)
+COPY src/ ./src/
+RUN pip install --no-cache-dir --no-deps .
+
 # ============================================
 # Stage 2: Runtime
 # ============================================

diff --git a/README.md b/README.md
@@ -162,6 +162,49 @@ streamlit run apps/config_editor.py
 
 The editor covers all variables grouped by tab (API keys, voice pipeline, model deployments, runtime settings, perturbations, etc.), with proper widgets for each type. See [`apps/README.md`](apps/README.md) for details.
 
+### Adding a Language
+
+**1. Run `add_culture_data.py`** — handles all one-time setup: generates culturally appropriate names and translated utterances for every dataset record, writes a "respond in X" addendum to `configs/agents/language_addenda.yaml`, translates the assistant's opening greeting into `configs/agents/initial_messages.yaml`, generates a WER normalizer config, and patches `.env.example` with the new agent ID stubs.
+
+```bash
+PYTHONPATH=src python scripts/add_culture_data.py \
+    --language it \
+    --language-name Italian \
+    --native-name italiano \
+    --auto-generate-names
+```
+
+Re-running is safe — existing entries are skipped (idempotent). Use `--dry-run` to preview changes before writing.
+
+For languages with significant regional spelling divergence (e.g. Portuguese, where pt-BR and pt-PT differ orthographically), pass `--include-spelling-variation` to also generate a spelling normalization map used during WER evaluation:
+
+```bash
+PYTHONPATH=src python scripts/add_culture_data.py \
+    --language pt \
+    --language-name Portuguese \
+    --auto-generate-names \
+    --include-spelling-variation
+```
+
+See the script's `--help` for the full argument reference.
+
+**2. Add your ElevenLabs agent IDs** — the script adds the variable stubs to `.env.example`; fill in the values in your `.env` (or use the config editor's **User Config** tab):
+
+```bash
+EVA_IT_USER_F=your_elevenlabs_agent_id_female
+EVA_IT_USER_M=your_elevenlabs_agent_id_male
+```
+
+**3. Set `EVA_LANGUAGE` and run**:
+
+```bash
+EVA_LANGUAGE=it EVA_DOMAIN=airline python main.py
+```
+
+#### WER normalization for new languages
+
+The `add_culture_data.py` script automatically generates the normalization rules used for WER calculation in the new language. For the full implications of this auto-generation, see [metrics/stt_wer.md](docs/metrics/stt_wer.md).
+
 ### Exploring Results
 
 EVA includes a Streamlit analysis app for visualizing and comparing results:

diff --git a/apps/README.md b/apps/README.md
@@ -12,7 +12,111 @@ Interactive UI for building and editing `.env` configuration files without hand-
 streamlit run apps/config_editor.py
 ```
 
-The app reads `.env.example` for the full variable set and loads existing values from `.env` if present. Each variable's widget type, enum options, ranges, and tooltips are declared directly in `.env.example` using annotation prefixes (`#i`, `#d`, `#e`, `#r`, `#x`, `#v`). Use the **Preview** button to inspect the generated file before saving, or **Download** to export it without writing to disk.
+The app reads `.env.example` for the full variable set and loads existing values from `.env` if present. Each variable's widget type, enum options, ranges, and tooltips are declared directly in `.env.example` using annotation prefixes. Use the **Preview** button to inspect the generated file before saving, or **Download** to export it without writing to disk.
+
+### `.env.example` Annotation Scheme
+
+The editor is driven entirely from annotated comments in `.env.example` — there is no separate schema file. Each annotation prefix applies to the **immediately following** variable definition (active or inactive). Annotation order doesn't matter, but the block must be contiguous: any blank line or `# ` true-comment between annotations resets the accumulator.
+
+| Prefix | Name | Purpose |
+|---|---|---|
+| `# ` | True comment | Human-readable prose. Preserved verbatim, never parsed as metadata. |
+| `#i ` | Info | Tooltip text shown next to the widget. Multiple `#i` lines join with spaces. |
+| `#d ` | Datatype | Widget type — see table below. If omitted, inferred from name + value. |
+| `#e ` | Enum options | Comma-separated valid values for `enum` / `multi_enum`. |
+| `#r ` | Range | Numeric `min,max` or `min,max,step` for `int` / `float`. |
+| `#g ` | Group | Override tab assignment (otherwise inherited from section header). |
+| `#x ` | Condition | `VAR=value` — only render when that var equals that value. Comma-separated values are OR (`#x pipeline_mode=LLM,AudioLLM`). Multiple `#x` lines = AND. |
+| `#v ` | Inactive var | `#v VARNAME=value` — a variable definition that ships disabled (inactive) by default but can be enabled and fully configured in the UI. |
+
+#### Widget types (`#d`)
+
+| Type | Renders as |
+|---|---|
+| `string` | `st.text_input` |
+| `secret` | `st.text_input(type="password")` |
+| `bool` | `st.checkbox` |
+| `int` | `st.number_input` (integers, range from `#r`) |
+| `float` | `st.number_input` (floats, range from `#r`) |
+| `enum` | `st.selectbox` (options from `#e`) |
+| `multi_enum` | `st.multiselect` (options from `#e`) |
+| `csv_list` | `st.text_input` split/joined on comma |
+| `path` | `st.text_input` with existence hint |
+| `json_object` | Key/value table + raw JSON expander |
+| `json_deployment_list` | Special-cased deployment-card editor for `EVA_MODEL_LIST` |
+
+#### Widget inference (when `#d` is omitted)
+
+- Name contains `KEY`, `SECRET`, `TOKEN`, or `PASSWORD` → `secret`
+- Name contains `CREDENTIALS` or ends with `_PATH` / `_DIR` → `path`
+- Value is `true` / `false` → `bool`
+- Value parses as an integer → `int`, as a float → `float`
+- Value looks like a JSON array containing `model_name` → `json_deployment_list`
+- Value looks like a JSON array or object → `json_object`
+- Otherwise → `string`
+
+#### Section headers
+
+Top-level groups are declared by a 3-line header block. Variables that follow inherit the group name until the next header.
+
+```bash
+# ==============================================
+# Voice Pipeline
+# ==============================================
+```
+
+The section title must match one of the tab name constants in [`config_schema.py`](config_schema.py) (`API Configs`, `Voice Pipeline`, `LiteLLM Deployments`, `Framework & Runtime`, `Turn Detection & VAD`, `User Config`, `Debug & Logging`).
+
+#### Variable states
+
+```bash
+# Just a note — ignored entirely.
+
+#i Maximum parallel conversations.
+#d int
+#r 1,100,1
+EVA_MAX_CONCURRENT_CONVERSATIONS=5        # active — written to .env
+
+#i Domain for dataset/agent paths.
+#d enum
+#e airline,itsm,medical_hr
+#v EVA_DOMAIN=airline                     # inactive — user can enable in UI
+
+#i French accent agent ID.
+#d secret
+#x perturbation_mode=Accent
+#x EVA_PERTURBATION__ACCENT=french
+#v EVA_FRENCH_ACCENT_USER_F=              # only renders when both conditions hold
+```
+
+#### Conditions and modes
+
+`#x` conditions can reference either:
+- Another env variable's value (e.g. `#x EVA_PERTURBATION__ACCENT=french`)
+- A UI-only state key managed by a mutex radio button (e.g. `#x pipeline_mode=LLM`)
+
+Mutex radio buttons are declared in [`config_schema.py`](config_schema.py) via `MUTEX_RADIOS`. Each radio writes to a session-state key (`pipeline_mode`, `perturbation_mode`) that `#x` conditions can match against.
+
+#### Serialization rules
+
+When the user saves `.env`:
+
+| In `.env.example` | User sets a value | Disabled by mutex / `#x` | Output |
+|---|---|---|---|
+| Active (`VAR=…`) | yes | no | `VAR=value` |
+| Active (`VAR=…`) | no | no | original line verbatim |
+| Active (`VAR=…`) | any | yes | `#v VAR=value` (or example value) |
+| Inactive (`#v VAR=…`) | yes | no | `VAR=value` (activated) |
+| Inactive (`#v VAR=…`) | no | any | `#v VAR=…` verbatim |
+| Not in template, in user's loaded `.env` | — | — | appended in matching tab section (KEY/URL → API Configs, `EVA_*` → Framework & Runtime, otherwise Misc) |
+
+Round-tripping is lossless: `serialize_env({}, parse_env_example(...))` reproduces the original file byte-for-byte.
+
+#### Implementation
+
+- [`config_io.py`](config_io.py) — `parse_env_example`, `load_env`, `serialize_env`, `compute_disabled`. Pure functions, no Streamlit dependency.
+- [`config_schema.py`](config_schema.py) — group constants, tab ordering, mutex radio definitions. Everything else lives in `.env.example`.
+- [`config_editor.py`](config_editor.py) — Streamlit UI that dispatches on `AnnotatedVar.widget`.
 
 ---
 

diff --git a/apps/analysis.py b/apps/analysis.py
@@ -2143,8 +2143,8 @@ def render_conversation_trace_tab(metrics: RecordMetrics | None, record_dir: Pat
                 expected_db = context.get("expected_scenario_db")
                 final_db = context.get("final_scenario_db")
                 if expected_db and final_db:
-                    expected_str = json.dumps(expected_db, indent=2, sort_keys=True, default=str)
-                    actual_str = json.dumps(final_db, indent=2, sort_keys=True, default=str)
+                    expected_str = json.dumps(expected_db, indent=2, sort_keys=True, default=str, ensure_ascii=False)
+                    actual_str = json.dumps(final_db, indent=2, sort_keys=True, default=str, ensure_ascii=False)
                     diff_viewer(expected_str, actual_str, lang="json", key="task_completion_diff")
             elif details_to_show:
                 st.json(details_to_show)
@@ -2387,13 +2387,15 @@ def render_conversation_trace_tab(metrics: RecordMetrics | None, record_dir: Pat
                 col_left = st.container()
             with col_left:
                 if entry_type == "tool_call":
-                    params_str = json.dumps(entry.get("parameters", {}), indent=2)
+                    params_str = json.dumps(entry.get("parameters", {}), indent=2, ensure_ascii=False)
                     with st.expander(f"Tool Call — `{tool_name}`", expanded=False):
                         st.code(params_str, language="json")
                 elif entry_type == "tool_response":
                     tool_response = entry.get("tool_response", "")
                     response_str = (
-                        json.dumps(tool_response, indent=2) if isinstance(tool_response, dict) else str(tool_response)
+                        json.dumps(tool_response, indent=2, ensure_ascii=False)
+                        if isinstance(tool_response, dict)
+                        else str(tool_response)
                     )
                     with st.expander(f"Tool Response — `{tool_name}`", expanded=False):
                         st.code(response_str, language="json")