diff --git a/.env.example b/.env.example index dd78e6ef..8ea751ad 100644 --- a/.env.example +++ b/.env.example @@ -58,6 +58,11 @@ AWS_SECRET_ACCESS_KEY=your_aws_secret_access_key_here # Voice Pipeline # ============================================== +#i LLM model alias for the assistant. Must match a model_name in EVA_MODEL_LIST. +#d enum +#x pipeline_mode=LLM +EVA_MODEL__LLM=gpt-5.2 + # Pipeline mode is controlled by the UI radio (LLM / S2S / AudioLLM). # The #x conditions below ensure each variable is only active for the right mode. @@ -73,16 +78,16 @@ EVA_MODEL__STT=cartesia #x pipeline_mode=LLM EVA_MODEL__STT_PARAMS='{"api_key": "your_cartesia_api_key", "model": "ink-whisper"}' -# --- LLM mode: TTS --- +# --- TTS (LLM and AudioLLM modes) --- #i TTS provider for the voice pipeline. #d enum #e cartesia,chatterbox,elevenlabs,gemini,kokoro,nvidia-baseten,openai,xtts -#x pipeline_mode=LLM +#x pipeline_mode=LLM,AudioLLM EVA_MODEL__TTS=cartesia #i TTS provider parameters. Must include "api_key" and "model". Use "urls" for round-robin load balancing. #d json_object -#x pipeline_mode=LLM +#x pipeline_mode=LLM,AudioLLM EVA_MODEL__TTS_PARAMS='{"api_key": "your_cartesia_api_key", "model": "sonic"}' # --- S2S mode --- @@ -150,11 +155,6 @@ EVA_MODEL_LIST='[ } ]' -#i LLM model alias for the assistant. Must match a model_name in EVA_MODEL_LIST. -#d enum -#x pipeline_mode=LLM -EVA_MODEL__LLM=gpt-5.2 - # ============================================== # Framework & Runtime # ============================================== @@ -234,6 +234,50 @@ EVA_MODEL__LLM=gpt-5.2 # User Config # ============================================== +# --- Language (mutually exclusive with Accent and Behavior) --- +#i ISO 639-1 language code for the user simulator. Datasets must exist for the selected language. Pattern for the agent ID pairs below: EVA_{LANG}_USER_{F|M}. 
+#d enum +#e en,fr,fr-CA,fr-ca +#x perturbation_mode=Language +#v EVA_LANGUAGE=en + +# --- Language agent IDs --- +#i ElevenLabs agent ID — Canadian French, female voice. +#d string +#x perturbation_mode=Language +#x EVA_LANGUAGE=fr-CA +#v EVA_FR_CA_USER_F= + +#i ElevenLabs agent ID — Canadian French, male voice. +#d string +#x perturbation_mode=Language +#x EVA_LANGUAGE=fr-CA +#v EVA_FR_CA_USER_M= + +#i ElevenLabs agent ID — European French, female voice. +#d string +#x perturbation_mode=Language +#x EVA_LANGUAGE=fr +#v EVA_FR_USER_F= + +#i ElevenLabs agent ID — European French, male voice. +#d string +#x perturbation_mode=Language +#x EVA_LANGUAGE=fr +#v EVA_FR_USER_M= + +#i ElevenLabs agent ID — English, female voice. +#d string +#x perturbation_mode=Language +#x EVA_LANGUAGE=en +#v EVA_EN_USER_F= + +#i ElevenLabs agent ID — English, male voice. +#d string +#x perturbation_mode=Language +#x EVA_LANGUAGE=en +#v EVA_EN_USER_M= + # --- Default user simulator agents --- #i ElevenLabs agent ID for the default female-voice user persona. #d string diff --git a/Dockerfile b/Dockerfile index fbb1d789..afe51b83 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,16 +14,19 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ git \ && rm -rf /var/lib/apt/lists/* -# Copy dependency files and source code +# Install dependencies — cached as long as pyproject.toml doesn't change COPY pyproject.toml README.md ./ -COPY src/ ./src/ - -# Install dependencies into a virtual environment +# Stub src so hatchling can resolve the package during dep install +RUN mkdir -p src/eva && echo '__version__ = "0.0.0"' > src/eva/__init__.py RUN python -m venv /opt/venv ENV PATH="/opt/venv/bin:$PATH" RUN pip install --no-cache-dir --upgrade pip && \ pip install --no-cache-dir . +# Copy real source and reinstall only the package (deps already cached above) +COPY src/ ./src/ +RUN pip install --no-cache-dir --no-deps . 
+ # ============================================ # Stage 2: Runtime # ============================================ diff --git a/README.md b/README.md index d28951f9..89d77bde 100644 --- a/README.md +++ b/README.md @@ -162,6 +162,49 @@ streamlit run apps/config_editor.py The editor covers all variables grouped by tab (API keys, voice pipeline, model deployments, runtime settings, perturbations, etc.), with proper widgets for each type. See [`apps/README.md`](apps/README.md) for details. +### Adding a Language + +**1. Run `add_culture_data.py`** — handles all one-time setup: generates culturally appropriate names and translated utterances for every dataset record, writes a "respond in X" addendum to `configs/agents/language_addenda.yaml`, translates the assistant's opening greeting into `configs/agents/initial_messages.yaml`, generates a WER normalizer config, and patches `.env.example` with the new agent ID stubs. + +```bash +PYTHONPATH=src python scripts/add_culture_data.py \ + --language it \ + --language-name Italian \ + --native-name italiano \ + --auto-generate-names +``` + +Re-running is safe — existing entries are skipped (idempotent). Use `--dry-run` to preview changes before writing. + +For languages with significant regional spelling divergence (e.g. Portuguese, where pt-BR and pt-PT differ orthographically), pass `--include-spelling-variation` to also generate a spelling normalization map used during WER evaluation: + +```bash +PYTHONPATH=src python scripts/add_culture_data.py \ + --language pt \ + --language-name Portuguese \ + --auto-generate-names \ + --include-spelling-variation +``` + +See the script's `--help` for the full argument reference. + +**2. Add your ElevenLabs agent IDs** — the script adds the variable stubs to `.env.example`; fill in the values in your `.env` (or use the config editor's **User Config** tab): + +```bash +EVA_IT_USER_F=your_elevenlabs_agent_id_female +EVA_IT_USER_M=your_elevenlabs_agent_id_male +``` + +**3. 
Set `EVA_LANGUAGE` and run**: + +```bash +EVA_LANGUAGE=it EVA_DOMAIN=airline python main.py +``` + +#### WER normalization for new languages + +The `add_culture_data.py` script automatically generates the normalization rules used for WER calculation in the new language. For the full implications of this auto-generation, see [metrics/stt_wer.md](docs/metrics/stt_wer.md). + ### Exploring Results EVA includes a Streamlit analysis app for visualizing and comparing results: diff --git a/apps/README.md b/apps/README.md index 69a524c2..3a457ac7 100644 --- a/apps/README.md +++ b/apps/README.md @@ -12,7 +12,111 @@ Interactive UI for building and editing `.env` configuration files without hand- streamlit run apps/config_editor.py ``` -The app reads `.env.example` for the full variable set and loads existing values from `.env` if present. Each variable's widget type, enum options, ranges, and tooltips are declared directly in `.env.example` using annotation prefixes (`#i`, `#d`, `#e`, `#r`, `#x`, `#v`). Use the **Preview** button to inspect the generated file before saving, or **Download** to export it without writing to disk. +The app reads `.env.example` for the full variable set and loads existing values from `.env` if present. Each variable's widget type, enum options, ranges, and tooltips are declared directly in `.env.example` using annotation prefixes. Use the **Preview** button to inspect the generated file before saving, or **Download** to export it without writing to disk. + +### `.env.example` Annotation Scheme + +The editor is driven entirely from annotated comments in `.env.example` — there is no separate schema file. Each annotation prefix applies to the **immediately following** variable definition (active or inactive). Annotation order doesn't matter, but the block must be contiguous: any blank line or `# ` true-comment between annotations resets the accumulator. + +| Prefix | Name | Purpose | +|---|---|---| +| `# ` | True comment | Human-readable prose. 
Preserved verbatim, never parsed as metadata. | +| `#i ` | Info | Tooltip text shown next to the widget. Multiple `#i` lines join with spaces. | +| `#d ` | Datatype | Widget type — see table below. If omitted, inferred from name + value. | +| `#e ` | Enum options | Comma-separated valid values for `enum` / `multi_enum`. | +| `#r ` | Range | Numeric `min,max` or `min,max,step` for `int` / `float`. | +| `#g ` | Group | Override tab assignment (otherwise inherited from section header). | +| `#x ` | Condition | `VAR=value` — only render when that var equals that value. Comma-separated values are OR (`#x pipeline_mode=LLM,AudioLLM`). Multiple `#x` lines = AND. | +| `#v ` | Inactive var | `#v VARNAME=value` — a variable definition that ships off by default but is fully configurable. | + +#### Widget types (`#d`) + +| Type | Renders as | +|---|---| +| `string` | `st.text_input` | +| `secret` | `st.text_input(type="password")` | +| `bool` | `st.checkbox` | +| `int` | `st.number_input` (integers, range from `#r`) | +| `float` | `st.number_input` (floats, range from `#r`) | +| `enum` | `st.selectbox` (options from `#e`) | +| `multi_enum` | `st.multiselect` (options from `#e`) | +| `csv_list` | `st.text_input` split/joined on comma | +| `path` | `st.text_input` with existence hint | +| `json_object` | Key/value table + raw JSON expander | +| `json_deployment_list` | Special-cased deployment-card editor for `EVA_MODEL_LIST` | + +#### Widget inference (when `#d` is omitted) + +- Name contains `KEY`, `SECRET`, `TOKEN`, or `PASSWORD` → `secret` +- Name contains `CREDENTIALS` or ends with `_PATH` / `_DIR` → `path` +- Value is `true` / `false` → `bool` +- Value parses as an integer → `int`, as a float → `float` +- Value looks like a JSON array containing `model_name` → `json_deployment_list` +- Value looks like a JSON array or object → `json_object` +- Otherwise → `string` + +#### Section headers + +Top-level groups are declared by a 3-line header block. 
Variables that follow inherit the group name until the next header. + +```bash +# ============================================== +# Voice Pipeline +# ============================================== +``` + +The section title must match one of the tab name constants in [`config_schema.py`](config_schema.py) (`API Configs`, `Voice Pipeline`, `LiteLLM Deployments`, `Framework & Runtime`, `Turn Detection & VAD`, `User Config`, `Debug & Logging`). + +#### Variable states + +```bash +# Just a note — ignored entirely. + +#i Maximum parallel conversations. +#d int +#r 1,100,1 +EVA_MAX_CONCURRENT_CONVERSATIONS=5 # active — written to .env + +#i Domain for dataset/agent paths. +#d enum +#e airline,itsm,medical_hr +#v EVA_DOMAIN=airline # inactive — user can enable in UI + +#i French accent agent ID. +#d secret +#x perturbation_mode=Accent +#x EVA_PERTURBATION__ACCENT=french +#v EVA_FRENCH_ACCENT_USER_F= # only renders when both conditions hold +``` + +#### Conditions and modes + +`#x` conditions can reference either: +- Another env variable's value (e.g. `#x EVA_PERTURBATION__ACCENT=french`) +- A UI-only state key managed by a mutex radio button (e.g. `#x pipeline_mode=LLM`) + +Mutex radio buttons are declared in [`config_schema.py`](config_schema.py) via `MUTEX_RADIOS`. Each radio writes to a session-state key (`pipeline_mode`, `perturbation_mode`) that `#x` conditions can match against. 
+ +#### Serialization rules + +When the user saves `.env`: + +| In `.env.example` | User sets a value | Disabled by mutex / `#x` | Output | +|---|---|---|---| +| Active (`VAR=…`) | yes | no | `VAR=value` | +| Active (`VAR=…`) | no | no | original line verbatim | +| Active (`VAR=…`) | any | yes | `#v VAR=value` (or example value) | +| Inactive (`#v VAR=…`) | yes | no | `VAR=value` (activated) | +| Inactive (`#v VAR=…`) | no | any | `#v VAR=…` verbatim | +| Not in template, in user's loaded `.env` | — | — | appended in matching tab section (KEY/URL → API Configs, `EVA_*` → Framework & Runtime, otherwise Misc) | + +Round-tripping is lossless: `serialize_env({}, parse_env_example(...))` reproduces the original file byte-for-byte. + +#### Implementation + +- [`config_io.py`](config_io.py) — `parse_env_example`, `load_env`, `serialize_env`, `compute_disabled`. Pure functions, no Streamlit dependency. +- [`config_schema.py`](config_schema.py) — group constants, tab ordering, mutex radio definitions. Everything else lives in `.env.example`. +- [`config_editor.py`](config_editor.py) — Streamlit UI that dispatches on `AnnotatedVar.widget`. 
--- diff --git a/apps/analysis.py b/apps/analysis.py index 101fdcdc..e4ff8d5d 100644 --- a/apps/analysis.py +++ b/apps/analysis.py @@ -2143,8 +2143,8 @@ def render_conversation_trace_tab(metrics: RecordMetrics | None, record_dir: Pat expected_db = context.get("expected_scenario_db") final_db = context.get("final_scenario_db") if expected_db and final_db: - expected_str = json.dumps(expected_db, indent=2, sort_keys=True, default=str) - actual_str = json.dumps(final_db, indent=2, sort_keys=True, default=str) + expected_str = json.dumps(expected_db, indent=2, sort_keys=True, default=str, ensure_ascii=False) + actual_str = json.dumps(final_db, indent=2, sort_keys=True, default=str, ensure_ascii=False) diff_viewer(expected_str, actual_str, lang="json", key="task_completion_diff") elif details_to_show: st.json(details_to_show) @@ -2387,13 +2387,15 @@ def render_conversation_trace_tab(metrics: RecordMetrics | None, record_dir: Pat col_left = st.container() with col_left: if entry_type == "tool_call": - params_str = json.dumps(entry.get("parameters", {}), indent=2) + params_str = json.dumps(entry.get("parameters", {}), indent=2, ensure_ascii=False) with st.expander(f"Tool Call — `{tool_name}`", expanded=False): st.code(params_str, language="json") elif entry_type == "tool_response": tool_response = entry.get("tool_response", "") response_str = ( - json.dumps(tool_response, indent=2) if isinstance(tool_response, dict) else str(tool_response) + json.dumps(tool_response, indent=2, ensure_ascii=False) + if isinstance(tool_response, dict) + else str(tool_response) ) with st.expander(f"Tool Response — `{tool_name}`", expanded=False): st.code(response_str, language="json") diff --git a/apps/config_editor.py b/apps/config_editor.py index f59995b4..1a62767a 100644 --- a/apps/config_editor.py +++ b/apps/config_editor.py @@ -12,6 +12,7 @@ from __future__ import annotations +import hashlib import html as html_module import json import sys @@ -133,12 +134,17 @@ def _init_state() -> 
None: def _is_visible_av(var: AnnotatedVar) -> bool: - """Return True when all #x conditions for this var are satisfied.""" + """Return True when all #x conditions for this var are satisfied. + + Comma-separated values in a single condition are treated as OR + (e.g. `#x pipeline_mode=LLM,AudioLLM`). + """ for cond_key, cond_val in var.conditions: actual = st.session_state.get(cond_key) if actual is None: actual = st.session_state.get("field_values", {}).get(cond_key) - if actual != cond_val: + allowed = {v.strip() for v in cond_val.split(",") if v.strip()} + if actual not in allowed: return False return True @@ -215,9 +221,10 @@ def _enum_options_for(var: AnnotatedVar) -> list[str]: def _render_json_object(name: str, info: str, current: dict) -> None: st.markdown(f"**{name}**" + (f" — {info}" if info else "")) - raw_key = f"raw_{name}" - if raw_key not in st.session_state: - st.session_state[raw_key] = json.dumps(current, indent=2) if current else "" + + # Both widgets are keyed by a hash of the current value so they always + # re-initialize from field_values after any write + rerun. 
+ val_hash = hashlib.md5(json.dumps(current, sort_keys=True, ensure_ascii=False).encode()).hexdigest()[:8] rows = [{"key": k, "value": _scalar_to_str(v)} for k, v in current.items()] or [{"key": "", "value": ""}] edited = st.data_editor( @@ -228,30 +235,43 @@ def _render_json_object(name: str, info: str, current: dict) -> None: "key": st.column_config.TextColumn("key", required=False), "value": st.column_config.TextColumn("value", required=False), }, - key=f"de_{name}", + key=f"_de_{name}_{val_hash}", ) - parsed_kv: dict[str, Any] = {} - for row in edited: - k = (row.get("key") or "").strip() - if k: - parsed_kv[k] = _str_to_scalar(row.get("value")) + parsed_from_table: dict[str, Any] = { + (r.get("key") or "").strip(): _str_to_scalar(r.get("value")) for r in edited if (r.get("key") or "").strip() + } + + if json.dumps(parsed_from_table, sort_keys=True, ensure_ascii=False) != json.dumps( + current, sort_keys=True, ensure_ascii=False + ): + st.session_state.field_values[name] = parsed_from_table + st.rerun() with st.expander("Raw JSON", expanded=False): text = st.text_area( - "Edit as JSON", value=json.dumps(parsed_kv, indent=2) if parsed_kv else "", key=raw_key, height=140 + "Edit as JSON", + value=json.dumps(current, indent=2, ensure_ascii=False) if current else "", + key=f"_rawtxt_{name}_{val_hash}", + height=140, ) - if text.strip(): - try: - parsed_kv = json.loads(text) - except json.JSONDecodeError as e: - st.warning(f"Invalid JSON: {e}") - st.session_state.field_values[name] = parsed_kv + if text.strip(): + try: + parsed_kv = json.loads(text) + if json.dumps(parsed_kv, sort_keys=True, ensure_ascii=False) != json.dumps( + current, sort_keys=True, ensure_ascii=False + ): + st.session_state.field_values[name] = parsed_kv + st.rerun() + except json.JSONDecodeError as e: + st.warning(f"Invalid JSON: {e}") + + st.session_state.field_values[name] = current def _scalar_to_str(v: Any) -> str: if isinstance(v, (dict, list)): - return json.dumps(v) + return 
json.dumps(v, ensure_ascii=False) if isinstance(v, bool): return "true" if v else "false" if v is None: @@ -397,7 +417,7 @@ def _render_unmapped_var(name: str) -> None: values = st.session_state.field_values v = values.get(name, "") if not isinstance(v, str): - v = json.dumps(v) if v else "" + v = json.dumps(v, ensure_ascii=False) if v else "" widget_type = "password" if "KEY" in name else "default" values[name] = st.text_input(name, value=v, key=f"w_{name}", type=widget_type) @@ -623,7 +643,7 @@ def main() -> None: mime="text/plain", width="stretch", ) - data_attr = html_module.escape(json.dumps(text), quote=True) + data_attr = html_module.escape(json.dumps(text, ensure_ascii=False), quote=True) st_components.html( f"""