Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
53 commits
Select commit Hold shift + click to select a range
a4bcb4d
initial multilang impl
raghavm243512 May 18, 2026
8ac2aaa
test fix
raghavm243512 May 18, 2026
f5a5b52
date formats
raghavm243512 May 19, 2026
ada9699
translations and supporting stuff
raghavm243512 May 20, 2026
abc80e0
use display name for client
raghavm243512 May 21, 2026
b3ad5dc
many finer points
raghavm243512 May 22, 2026
ff7bd59
alias itsm
raghavm243512 May 22, 2026
9b368f0
Apply pre-commit
raghavm243512 May 22, 2026
e3d90a4
updated expected_db in dataset when adding culture data. update user_…
katstankiewicz May 25, 2026
990eb76
update test
katstankiewicz May 25, 2026
b0d3a13
add french number normalizer
katstankiewicz May 26, 2026
27be206
simplify adding languages
raghavm243512 May 26, 2026
81c20a9
add language for elevenlabs
katstankiewicz May 26, 2026
5a2377a
update stt_wer to handle french numbers
katstankiewicz May 26, 2026
bdbfe0b
initial WER schema generation
raghavm243512 May 28, 2026
132d19d
docs and cleanup
raghavm243512 May 28, 2026
30c6d43
result improvements
raghavm243512 May 28, 2026
dc910d9
update services to use 'settings' from pipecat update
katstankiewicz May 28, 2026
dcb3750
comments and kokoro
raghavm243512 Jun 1, 2026
4ea93bd
phone numbers
raghavm243512 Jun 1, 2026
b8362cb
Apply pre-commit
raghavm243512 Jun 1, 2026
d09a634
eng only starting point
raghavm243512 Jun 2, 2026
1a31f94
European French
raghavm243512 Jun 2, 2026
2373194
canadian french
raghavm243512 Jun 2, 2026
51aec05
test failure
raghavm243512 Jun 2, 2026
c0d7a65
test failures
raghavm243512 Jun 2, 2026
2b37975
Merge branch 'main' of github.com:ServiceNow/eva into pr/multilingual
raghavm243512 Jun 2, 2026
bfd4822
fr to EU french
raghavm243512 Jun 2, 2026
588f7ea
Add normalization tests
JosephMarinier Jun 3, 2026
ec788c1
Fix French cardinal numbers
JosephMarinier Jun 3, 2026
000f9f0
Test both fr and fr-CA
JosephMarinier Jun 3, 2026
ffc0f05
french adjustment
raghavm243512 Jun 3, 2026
697a1a5
separate test cases in generation
raghavm243512 Jun 3, 2026
fdf3e6a
Add data/*_dataset.yaml
JosephMarinier Jun 3, 2026
e8a8168
Format fr-CA phone numbers
JosephMarinier Jun 3, 2026
c09a732
Fix some airline French utterances
JosephMarinier Jun 3, 2026
cc3a550
phone separators
raghavm243512 Jun 3, 2026
b215213
comments, json migration
raghavm243512 Jun 3, 2026
222e229
test fix
raghavm243512 Jun 3, 2026
9c9928d
Add missing accent
JosephMarinier Jun 4, 2026
207bba8
comments
raghavm243512 Jun 4, 2026
44c0aa2
merge main
raghavm243512 Jun 4, 2026
aeb4654
Replace "deux ou trois choses"
JosephMarinier Jun 4, 2026
6846e6e
Replace "deux ou trois"
JosephMarinier Jun 4, 2026
379c6ef
Replace "en TI"
JosephMarinier Jun 4, 2026
94f30e5
comments and fixes
raghavm243512 Jun 4, 2026
6665c59
merge main
raghavm243512 Jun 4, 2026
db45b5e
Merge branch 'pr/multilingual' of github.com:ServiceNow/eva into pr/m…
raghavm243512 Jun 4, 2026
0cf919e
Refactor aliases into object files
raghavm243512 Jun 4, 2026
e85cda2
area codes, romanization changes, cleanup, corrections
raghavm243512 Jun 5, 2026
640f8c8
alias consolidation and bases
raghavm243512 Jun 5, 2026
83c0f6b
translation update
raghavm243512 Jun 5, 2026
13eb7c3
base main garage
raghavm243512 Jun 5, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
60 changes: 52 additions & 8 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,11 @@ AWS_SECRET_ACCESS_KEY=your_aws_secret_access_key_here
# Voice Pipeline
# ==============================================

#i LLM model alias for the assistant. Must match a model_name in EVA_MODEL_LIST.
#d enum
#x pipeline_mode=LLM
EVA_MODEL__LLM=gpt-5.2

# Pipeline mode is controlled by the UI radio (LLM / S2S / AudioLLM).
# The #x conditions below ensure each variable is only active for the right mode.

Expand All @@ -73,16 +78,16 @@ EVA_MODEL__STT=cartesia
#x pipeline_mode=LLM
EVA_MODEL__STT_PARAMS='{"api_key": "your_cartesia_api_key", "model": "ink-whisper"}'

# --- LLM mode: TTS ---
# --- TTS (LLM and AudioLLM modes) ---
#i TTS provider for the voice pipeline.
#d enum
#e cartesia,chatterbox,elevenlabs,gemini,kokoro,nvidia-baseten,openai,xtts
#x pipeline_mode=LLM
#x pipeline_mode=LLM,AudioLLM
EVA_MODEL__TTS=cartesia

#i TTS provider parameters. Must include "api_key" and "model". Use "urls" for round-robin load balancing.
#d json_object
#x pipeline_mode=LLM
#x pipeline_mode=LLM,AudioLLM
EVA_MODEL__TTS_PARAMS='{"api_key": "your_cartesia_api_key", "model": "sonic"}'

# --- S2S mode ---
Expand Down Expand Up @@ -150,11 +155,6 @@ EVA_MODEL_LIST='[
}
]'

#i LLM model alias for the assistant. Must match a model_name in EVA_MODEL_LIST.
#d enum
#x pipeline_mode=LLM
EVA_MODEL__LLM=gpt-5.2

# ==============================================
# Framework & Runtime
# ==============================================
Expand Down Expand Up @@ -234,6 +234,50 @@ EVA_MODEL__LLM=gpt-5.2
# User Config
# ==============================================

# --- Language (mutually exclusive with Accent and Behavior) ---
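#i Language code for the user simulator: an ISO 639-1 code, optionally followed by a region subtag (e.g. fr-CA). Datasets must exist for the selected language. Pattern for the agent ID pairs below: EVA_{LANG}_USER_{F|M}, where {LANG} is the language code uppercased with hyphens replaced by underscores (e.g. fr-CA → EVA_FR_CA_USER_F).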
#d enum
#e en,fr,fr-CA,fr-ca
#x perturbation_mode=Language
#v EVA_LANGUAGE=en

# --- Language agent IDs ---
#i ElevenLabs agent ID — Canadian French, female voice.
#d string
#x perturbation_mode=Language
#x EVA_LANGUAGE=fr-CA
#v EVA_FR_CA_USER_F=

#i ElevenLabs agent ID — Canadian French, male voice.
#d string
#x perturbation_mode=Language
#x EVA_LANGUAGE=fr-CA
#v EVA_FR_CA_USER_M=

#i ElevenLabs agent ID — European French, female voice.
#d string
#x perturbation_mode=Language
#x EVA_LANGUAGE=fr
#v EVA_FR_USER_F=

#i ElevenLabs agent ID — European French, male voice.
#d string
#x perturbation_mode=Language
#x EVA_LANGUAGE=fr
#v EVA_FR_USER_M=

#i ElevenLabs agent ID — English, female voice.
#d string
#x perturbation_mode=Language
#x EVA_LANGUAGE=en
#v EVA_EN_USER_F=

#i ElevenLabs agent ID — English, male voice.
#d string
#x perturbation_mode=Language
#x EVA_LANGUAGE=en
#v EVA_EN_USER_M=

# --- Default user simulator agents ---
#i ElevenLabs agent ID for the default female-voice user persona.
#d string
Expand Down
11 changes: 7 additions & 4 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,19 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
git \
&& rm -rf /var/lib/apt/lists/*

# Copy dependency files and source code
# Install dependencies — cached as long as pyproject.toml and README.md don't change
COPY pyproject.toml README.md ./
COPY src/ ./src/

# Install dependencies into a virtual environment
# Stub src so hatchling can resolve the package during dep install
RUN mkdir -p src/eva && echo '__version__ = "0.0.0"' > src/eva/__init__.py
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir .

# Copy real source and reinstall only the package (deps already cached above)
COPY src/ ./src/
RUN pip install --no-cache-dir --no-deps .

# ============================================
# Stage 2: Runtime
# ============================================
Expand Down
43 changes: 43 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,49 @@ streamlit run apps/config_editor.py

The editor covers all variables grouped by tab (API keys, voice pipeline, model deployments, runtime settings, perturbations, etc.), with proper widgets for each type. See [`apps/README.md`](apps/README.md) for details.

### Adding a Language

**1. Run `add_culture_data.py`** — handles all one-time setup: generates culturally appropriate names and translated utterances for every dataset record, writes a "respond in X" addendum to `configs/agents/language_addenda.yaml`, translates the assistant's opening greeting into `configs/agents/initial_messages.yaml`, generates a WER normalizer config, and patches `.env.example` with the new agent ID stubs.

```bash
PYTHONPATH=src python scripts/add_culture_data.py \
--language it \
--language-name Italian \
--native-name italiano \
--auto-generate-names
```

Re-running is safe — existing entries are skipped (idempotent). Use `--dry-run` to preview changes before writing.

For languages with significant regional spelling divergence (e.g. Portuguese, where pt-BR and pt-PT differ orthographically), pass `--include-spelling-variation` to also generate a spelling normalization map used during WER evaluation:

```bash
PYTHONPATH=src python scripts/add_culture_data.py \
--language pt \
--language-name Portuguese \
--auto-generate-names \
--include-spelling-variation
```

See the script's `--help` for the full argument reference.

**2. Add your ElevenLabs agent IDs** — the script adds the variable stubs to `.env.example`; fill in the values in your `.env` (or use the config editor's **User Config** tab):

```bash
EVA_IT_USER_F=your_elevenlabs_agent_id_female
EVA_IT_USER_M=your_elevenlabs_agent_id_male
```

**3. Set `EVA_LANGUAGE` and run**:

```bash
EVA_LANGUAGE=it EVA_DOMAIN=airline python main.py
```

#### WER normalization for new languages

The `add_culture_data.py` script automatically generates the normalization rules used for WER calculation in the new language. For the full implications of this auto-generation, see [metrics/stt_wer.md](docs/metrics/stt_wer.md).

### Exploring Results

EVA includes a Streamlit analysis app for visualizing and comparing results:
Expand Down
106 changes: 105 additions & 1 deletion apps/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,111 @@ Interactive UI for building and editing `.env` configuration files without hand-
streamlit run apps/config_editor.py
```

The app reads `.env.example` for the full variable set and loads existing values from `.env` if present. Each variable's widget type, enum options, ranges, and tooltips are declared directly in `.env.example` using annotation prefixes (`#i`, `#d`, `#e`, `#r`, `#x`, `#v`). Use the **Preview** button to inspect the generated file before saving, or **Download** to export it without writing to disk.
The app reads `.env.example` for the full variable set and loads existing values from `.env` if present. Each variable's widget type, enum options, ranges, and tooltips are declared directly in `.env.example` using annotation prefixes. Use the **Preview** button to inspect the generated file before saving, or **Download** to export it without writing to disk.

### `.env.example` Annotation Scheme

The editor is driven entirely from annotated comments in `.env.example` — there is no separate schema file. Each annotation prefix applies to the **immediately following** variable definition (active or inactive). Annotation order doesn't matter, but the block must be contiguous: any blank line or `# ` true-comment between annotations resets the accumulator.

| Prefix | Name | Purpose |
|---|---|---|
| `# ` | True comment | Human-readable prose. Preserved verbatim, never parsed as metadata. |
| `#i ` | Info | Tooltip text shown next to the widget. Multiple `#i` lines join with spaces. |
| `#d ` | Datatype | Widget type — see table below. If omitted, inferred from name + value. |
| `#e ` | Enum options | Comma-separated valid values for `enum` / `multi_enum`. |
| `#r ` | Range | Numeric `min,max` or `min,max,step` for `int` / `float`. |
| `#g ` | Group | Override tab assignment (otherwise inherited from section header). |
| `#x ` | Condition | `VAR=value` — only render when that var equals that value. Comma-separated values are OR (`#x pipeline_mode=LLM,AudioLLM`). Multiple `#x` lines = AND. |
| `#v ` | Inactive var | `#v VARNAME=value` — a variable definition that ships off by default but is fully configurable. |

#### Widget types (`#d`)

| Type | Renders as |
|---|---|
| `string` | `st.text_input` |
| `secret` | `st.text_input(type="password")` |
| `bool` | `st.checkbox` |
| `int` | `st.number_input` (integers, range from `#r`) |
| `float` | `st.number_input` (floats, range from `#r`) |
| `enum` | `st.selectbox` (options from `#e`) |
| `multi_enum` | `st.multiselect` (options from `#e`) |
| `csv_list` | `st.text_input` split/joined on comma |
| `path` | `st.text_input` with existence hint |
| `json_object` | Key/value table + raw JSON expander |
| `json_deployment_list` | Special-cased deployment-card editor for `EVA_MODEL_LIST` |

#### Widget inference (when `#d` is omitted)

- Name contains `KEY`, `SECRET`, `TOKEN`, or `PASSWORD` → `secret`
- Name contains `CREDENTIALS` or ends with `_PATH` / `_DIR` → `path`
- Value is `true` / `false` → `bool`
- Value parses as an integer → `int`, as a float → `float`
- Value looks like a JSON array containing `model_name` → `json_deployment_list`
- Value looks like a JSON array or object → `json_object`
- Otherwise → `string`

#### Section headers

Top-level groups are declared by a 3-line header block. Variables that follow inherit the group name until the next header.

```bash
# ==============================================
# Voice Pipeline
# ==============================================
```

The section title must match one of the tab name constants in [`config_schema.py`](config_schema.py) (`API Configs`, `Voice Pipeline`, `LiteLLM Deployments`, `Framework & Runtime`, `Turn Detection & VAD`, `User Config`, `Debug & Logging`).

#### Variable states

```bash
# Just a note — ignored entirely.

#i Maximum parallel conversations.
#d int
#r 1,100,1
EVA_MAX_CONCURRENT_CONVERSATIONS=5 # active — written to .env

#i Domain for dataset/agent paths.
#d enum
#e airline,itsm,medical_hr
#v EVA_DOMAIN=airline # inactive — user can enable in UI

#i French accent agent ID.
#d secret
#x perturbation_mode=Accent
#x EVA_PERTURBATION__ACCENT=french
#v EVA_FRENCH_ACCENT_USER_F= # only renders when both conditions hold
```

#### Conditions and modes

`#x` conditions can reference either:
- Another env variable's value (e.g. `#x EVA_PERTURBATION__ACCENT=french`)
- A UI-only state key managed by a mutex radio button (e.g. `#x pipeline_mode=LLM`)

Mutex radio buttons are declared in [`config_schema.py`](config_schema.py) via `MUTEX_RADIOS`. Each radio writes to a session-state key (`pipeline_mode`, `perturbation_mode`) that `#x` conditions can match against.

#### Serialization rules

When the user saves `.env`:

| In `.env.example` | User sets a value | Disabled by mutex / `#x` | Output |
|---|---|---|---|
| Active (`VAR=…`) | yes | no | `VAR=value` |
| Active (`VAR=…`) | no | no | original line verbatim |
| Active (`VAR=…`) | any | yes | `#v VAR=value` (or example value) |
| Inactive (`#v VAR=…`) | yes | no | `VAR=value` (activated) |
| Inactive (`#v VAR=…`) | no | any | `#v VAR=…` verbatim |
| Not in template, in user's loaded `.env` | — | — | appended in matching tab section (KEY/URL → API Configs, `EVA_*` → Framework & Runtime, otherwise Misc) |

Round-tripping is lossless: `serialize_env({}, parse_env_example(...))` reproduces the original file byte-for-byte.

#### Implementation

- [`config_io.py`](config_io.py) — `parse_env_example`, `load_env`, `serialize_env`, `compute_disabled`. Pure functions, no Streamlit dependency.
- [`config_schema.py`](config_schema.py) — group constants, tab ordering, mutex radio definitions. Everything else lives in `.env.example`.
- [`config_editor.py`](config_editor.py) — Streamlit UI that dispatches on `AnnotatedVar.widget`.

---

Expand Down
10 changes: 6 additions & 4 deletions apps/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -2143,8 +2143,8 @@ def render_conversation_trace_tab(metrics: RecordMetrics | None, record_dir: Pat
expected_db = context.get("expected_scenario_db")
final_db = context.get("final_scenario_db")
if expected_db and final_db:
expected_str = json.dumps(expected_db, indent=2, sort_keys=True, default=str)
actual_str = json.dumps(final_db, indent=2, sort_keys=True, default=str)
expected_str = json.dumps(expected_db, indent=2, sort_keys=True, default=str, ensure_ascii=False)
actual_str = json.dumps(final_db, indent=2, sort_keys=True, default=str, ensure_ascii=False)
diff_viewer(expected_str, actual_str, lang="json", key="task_completion_diff")
elif details_to_show:
st.json(details_to_show)
Expand Down Expand Up @@ -2387,13 +2387,15 @@ def render_conversation_trace_tab(metrics: RecordMetrics | None, record_dir: Pat
col_left = st.container()
with col_left:
if entry_type == "tool_call":
params_str = json.dumps(entry.get("parameters", {}), indent=2)
params_str = json.dumps(entry.get("parameters", {}), indent=2, ensure_ascii=False)
with st.expander(f"Tool Call — `{tool_name}`", expanded=False):
st.code(params_str, language="json")
elif entry_type == "tool_response":
tool_response = entry.get("tool_response", "")
response_str = (
json.dumps(tool_response, indent=2) if isinstance(tool_response, dict) else str(tool_response)
json.dumps(tool_response, indent=2, ensure_ascii=False)
if isinstance(tool_response, dict)
else str(tool_response)
)
with st.expander(f"Tool Response — `{tool_name}`", expanded=False):
st.code(response_str, language="json")
Expand Down
Loading
Loading