Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -317,7 +317,6 @@ eva/
│ ├── run_text_only.py # Text-only evaluation runner
│ ├── docker_entrypoint.py # Docker entry point
│ ├── check_version_bump.py # Version checking
│ └── check_version_bump.py # Version checking
├── configs/ # Configuration files
│ ├── prompts/ # Judge and simulation prompts
│ │ ├── judge.yaml # Judge metric prompts
Expand All @@ -333,7 +332,7 @@ eva/
│ ├── limitations.md # Known limitations
│ └── demo/ # Demo audio files
├── data/ # Data files
- │ ├── airline_dataset.jsonl # Evaluation dataset
+ │ ├── airline_dataset.json # Evaluation dataset
│ └── airline_scenarios/ # Per-record scenario databases
├── tests/ # Test suite
│ ├── unit/ # Unit tests
Expand Down
30,920 changes: 30,920 additions & 0 deletions data/airline_dataset.json

Large diffs are not rendered by default.

50 changes: 0 additions & 50 deletions data/airline_dataset.jsonl

This file was deleted.

83,732 changes: 83,732 additions & 0 deletions data/itsm_dataset.json

Large diffs are not rendered by default.

80 changes: 0 additions & 80 deletions data/itsm_dataset.jsonl

This file was deleted.

60,668 changes: 60,668 additions & 0 deletions data/medical_hr_dataset.json

Large diffs are not rendered by default.

83 changes: 0 additions & 83 deletions data/medical_hr_dataset.jsonl

This file was deleted.

2 changes: 1 addition & 1 deletion docs/metric_context.md
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,7 @@ Minor discrepancies are still possible (turn IDs off by one, audio timestamps no

```
Benchmark Execution:
- ├─ EvaluationRecord (dataset.jsonl)
+ ├─ EvaluationRecord (dataset.json)
│ ├─ user_goal, user_persona, scenario_db → MetricContext
│ └─ Feeds to AssistantServer + UserSimulator
Expand Down
2 changes: 1 addition & 1 deletion scripts/run_text_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ def build_text_composites(

def resolve_paths(domain: str) -> tuple[Path, Path, Path]:
"""Resolve dataset, scenario-db-dir, and agent-config paths from a domain name."""
- dataset = Path(f"data/{domain}_dataset.jsonl")
+ dataset = Path(f"data/{domain}_dataset.json")
scenario_db_dir = Path(f"data/{domain}_scenarios")
agent_config = Path(f"configs/agents/{domain}_agent.yaml")

Expand Down
2 changes: 1 addition & 1 deletion src/eva/models/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -521,7 +521,7 @@ class ModelDeployment(DeploymentTypedDict):
@computed_field
@property
def dataset_path(self) -> Path:
- return Path(f"data/{self.domain}_dataset.jsonl")
+ return Path(f"data/{self.domain}_dataset.json")

@computed_field
@property
Expand Down
15 changes: 6 additions & 9 deletions src/eva/models/record.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,19 +126,16 @@ class EvaluationRecord(BaseModel):

@classmethod
  def load_dataset(cls, path: Path | str) -> list["EvaluationRecord"]:
- """Load records from JSONL file."""
+ """Load records from JSON file (array of objects)."""
  path = Path(path)
- records = []
  with open(path, encoding="utf-8") as f:
- for line in f:
- line = line.strip()
- if line:
- records.append(cls.model_validate_json(line))
- return records
+ data = json.load(f)
+ return [cls.model_validate(record) for record in data]

@classmethod
  def save_dataset(cls, records: list["EvaluationRecord"], path: Path | str) -> None:
- """Save records to JSONL file."""
+ """Save records to JSON file (array of objects)."""
  path = Path(path)
  with open(path, "w", encoding="utf-8") as f:
- f.writelines(record.model_dump_json() + "\n" for record in records)
+ json.dump([record.model_dump() for record in records], f, indent=2, ensure_ascii=False)
+ f.write("\n")
6 changes: 3 additions & 3 deletions tests/unit/models/test_config_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def test_create_minimal_config(self):
"""Test creating a minimal RunConfig."""
config = _config(env_vars=_BASE_ENV | {"EVA_DOMAIN": "airline", "EVA_MODEL__LLM": "gpt-5.2"})

- assert config.dataset_path == Path("data/airline_dataset.jsonl")
+ assert config.dataset_path == Path("data/airline_dataset.json")
assert config.tool_mocks_path == Path("data/airline_scenarios")
# run_id = timestamp + model suffix (e.g. "2024-01-15_14-30-45.123456_nova-2_gpt-5.2_sonic")
assert config.run_id.endswith("nova-2_gpt-5.2_sonic")
Expand Down Expand Up @@ -511,7 +511,7 @@ class TestDefaults:
def test_defaults(self):
c = _config(env_vars=_BASE_ENV)
assert c.domain == "airline"
- assert c.dataset_path == Path("data/airline_dataset.jsonl")
+ assert c.dataset_path == Path("data/airline_dataset.json")
assert c.tool_mocks_path == Path("data/airline_scenarios")
assert c.agent_config_path == Path("configs/agents/airline_agent.yaml")
assert c.output_dir == Path("output")
Expand Down Expand Up @@ -642,7 +642,7 @@ class TestDomainResolution:

def test_domain_sets_paths(self):
c = _config(env_vars=_BASE_ENV | {"EVA_DOMAIN": "airline"})
- assert c.dataset_path == Path("data/airline_dataset.jsonl")
+ assert c.dataset_path == Path("data/airline_dataset.json")
assert c.agent_config_path == Path("configs/agents/airline_agent.yaml")
assert c.tool_mocks_path == Path("data/airline_scenarios")

Expand Down
41 changes: 18 additions & 23 deletions tests/unit/test_current_date_consistency.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@

# Discover all (dataset, scenarios_dir) pairs
  DOMAIN_CONFIGS = [
- ("airline_dataset.jsonl", "airline_scenarios"),
- ("medical_hr_dataset.jsonl", "medical_hr_scenarios"),
- ("itsm_dataset.jsonl", "itsm_scenarios"),
+ ("airline_dataset.json", "airline_scenarios"),
+ ("medical_hr_dataset.json", "medical_hr_scenarios"),
+ ("itsm_dataset.json", "itsm_scenarios"),
]


Expand All @@ -29,28 +29,23 @@ def _load_records():
for dataset_file, scenarios_dir in DOMAIN_CONFIGS:
dataset_path = DATA_DIR / dataset_file
scenarios_path = DATA_DIR / scenarios_dir
- if not dataset_path.exists():
- continue

  with open(dataset_path) as f:
- for line in f:
- line = line.strip()
- if not line:
- continue
- record = json.loads(line)
- record_id = record["id"]
- current_date_time = record.get("current_date_time", "")
- expected_db = record.get("ground_truth", {}).get("expected_scenario_db", {})
-
- initial_db_path = scenarios_path / f"{record_id}.json"
- if initial_db_path.exists():
- with open(initial_db_path) as sf:
- initial_db = json.load(sf)
- else:
- initial_db = {}
-
- domain = dataset_file.replace("_dataset.jsonl", "")
- yield domain, record_id, current_date_time, initial_db, expected_db
+ records = json.load(f)
+ for record in records:
+ record_id = record["id"]
+ current_date_time = record.get("current_date_time", "")
+ expected_db = record.get("ground_truth", {}).get("expected_scenario_db", {})
+
+ initial_db_path = scenarios_path / f"{record_id}.json"
+ if initial_db_path.exists():
+ with open(initial_db_path) as sf:
+ initial_db = json.load(sf)
+ else:
+ initial_db = {}
+
+ domain = dataset_file.replace("_dataset.json", "")
+ yield domain, record_id, current_date_time, initial_db, expected_db


_ALL_RECORDS = list(_load_records())
Expand Down
Loading