Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion apps/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -1105,7 +1105,7 @@ def _show_subtable(heading: str, composites: list, metrics: list) -> None:

_show_subtable("Accuracy Metrics (EVA-A)", eva_a_composites, accuracy_metrics)
_show_subtable("Experience Metrics (EVA-X)", eva_x_composites, experience_metrics)
_show_subtable("Diagnostic & Other Metrics", [], other_metrics)
_show_subtable("Diagnostic & Validation Metrics", [], other_metrics)

csv = summary_df.drop(columns=["label"]).to_csv(index=False)
st.download_button("Download CSV", csv, file_name="cross_run_comparison.csv", mime="text/csv")
Expand Down
2 changes: 1 addition & 1 deletion src/eva/metrics/diagnostic/response_speed.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def _compute_speed_stats(latencies: list[float]) -> dict | None:

Returns None if no valid values remain after filtering.
"""
valid = [v for v in latencies if 0 < v < 1000]
valid = [v for v in latencies if v is not None and 0 < v < 1000]
if not valid:
return None
return {
Expand Down
39 changes: 28 additions & 11 deletions src/eva/metrics/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -792,14 +792,17 @@ async def _save_summary(
if not all_metrics:
return {}

metric_names = [m.name for m in self.metrics]
run_metric_names = [m.name for m in self.metrics]
# Aggregate per_metric over every metric present in any record, not only the metrics executed in this invocation,
# so that a partial re-run (e.g. --metrics response_speed) preserves other metrics.
all_metric_names = sorted({name for rm in all_metrics.values() for name in rm.metrics})
metric_aggregates = self._build_per_metric_aggregates(
all_metrics, metric_names, pass_at_k_results, self.num_draws
all_metrics, all_metric_names, pass_at_k_results, self.num_draws
)

# Compute metric failures for MetricsRunResult
# Compute metric failures for MetricsRunResult (only for metrics just run)
metric_failures: dict[str, list[str]] = {}
for name in metric_names:
for name in run_metric_names:
for record_id, record_metrics in all_metrics.items():
if name in record_metrics.metrics:
score = record_metrics.metrics[name]
Expand All @@ -809,14 +812,27 @@ async def _save_summary(
# Compute EVA composite run-level aggregates
overall_scores = compute_run_level_aggregates(all_metrics, self.num_draws)

# Build metric_errors for summary JSON (record IDs only; full errors are in per-record metrics.json)
metric_errors_summary: dict[str, dict[str, Any]] = {}
# Load existing summary to preserve fields for metrics not being re-run
summary_path = self.run_dir / "metrics_summary.json"
existing_summary: dict[str, Any] = {}
if summary_path.exists():
try:
existing_summary = json.loads(summary_path.read_text())
except Exception as e:
logger.warning(f"Failed to read existing metrics_summary.json: {e}")

# Merge metric_errors: preserve existing errors for metrics not being re-run
merged_metric_errors: dict[str, dict[str, Any]] = dict(existing_summary.get("metric_errors") or {})
for metric_name, failed_record_ids in metric_failures.items():
metric_errors_summary[metric_name] = {
merged_metric_errors[metric_name] = {
"failed_count": len(failed_record_ids),
"total_count": len(all_metrics),
"failed_records": failed_record_ids,
}
# Remove error entries for metrics that are now in run_metric_names but had no failures
for name in run_metric_names:
if name not in metric_failures:
merged_metric_errors.pop(name, None)

data_quality = self._build_data_quality(all_metrics, metric_aggregates)

Expand All @@ -827,8 +843,8 @@ async def _save_summary(
"per_metric": metric_aggregates,
}

if metric_errors_summary:
summary["metric_errors"] = metric_errors_summary
if merged_metric_errors:
summary["metric_errors"] = merged_metric_errors

# Add pass@k configuration if applicable
if pass_at_k_results:
Expand All @@ -839,15 +855,16 @@ async def _save_summary(
},
"exclude_metrics": sorted(m.name for m in self.metrics if m.exclude_from_pass_at_k),
}
elif existing_summary.get("pass_at_k_config"):
summary["pass_at_k_config"] = existing_summary["pass_at_k_config"]

try:
run_config = json.loads((self.run_dir / "config.json").read_text())
provenance = capture_metrics_provenance(metric_names, run_config=run_config)
provenance = capture_metrics_provenance(run_metric_names, run_config=run_config)
summary["provenance"] = provenance.model_dump(mode="json")
except Exception as e:
logger.warning(f"Failed to capture metrics provenance: {e}")

summary_path = self.run_dir / "metrics_summary.json"
summary_path.write_text(json.dumps(summary, indent=2))

logger.info(f"Metrics summary saved to {summary_path}")
Expand Down
27 changes: 20 additions & 7 deletions src/eva/models/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,12 +203,16 @@ def is_audio_native_pipeline(model_data: dict | Any) -> bool:
return False


def _strip_other_mode_fields(data: dict) -> dict:
def _strip_other_mode_fields(data: dict, strict: bool = True) -> dict:
"""Validate pipeline mode exclusivity, then strip irrelevant shared fields.

Raises ``ValueError`` if multiple pipeline modes are specified.
Raises ``ValueError`` if multiple pipeline modes are specified (when strict=True).
Then strips shared fields (e.g. ``tts`` from S2S mode) so that
``extra="forbid"`` on each config class doesn't reject them.

Args:
strict: If False, skip the conflict error (used for metrics-only re-runs
where the model config is not needed).
"""
# --- Mutual exclusivity: only one pipeline mode allowed ---
has_llm = bool(data.get("llm") or data.get("llm_model"))
Expand All @@ -223,7 +227,7 @@ def _strip_other_mode_fields(data: dict) -> dict:
]
if flag
]
if len(active) > 1:
if len(active) > 1 and strict:
raise ValueError(
f"Multiple pipeline modes set: {', '.join(active)}. "
f"Set exactly one of: EVA_MODEL__LLM (ASR-LLM-TTS), "
Expand Down Expand Up @@ -483,8 +487,10 @@ def _warn_deprecated_aliases(cls, data: Any) -> Any:
raise ValueError("Deprecated environment variables detected:\n" + "\n".join(found))

# Strip env-var fields from other pipeline modes so extra="forbid" doesn't reject them.
# For metrics-only re-runs, skip the strict conflict check — the model isn't used.
if isinstance(data.get("model"), dict):
data["model"] = _strip_other_mode_fields(data["model"])
force_rerun = bool(data.get("force_rerun_metrics"))
data["model"] = _strip_other_mode_fields(data["model"], strict=not force_rerun)

return data

Expand Down Expand Up @@ -576,14 +582,21 @@ def _redact_model_params(cls, model: ModelConfigUnion) -> dict:
data[field_name] = cls._redact_dict(value)
return data

def apply_env_overrides(self, live: "RunConfig") -> None:
def apply_env_overrides(self, live: "RunConfig", strict_llm: bool = True) -> None:
"""Apply environment-dependent values from *live* config onto this (saved) config.

Restores redacted secrets (``***``) and overrides dynamic fields (``url``,
``urls``) in ``model.*_params`` and ``model_list[].litellm_params``.

Args:
live: The live RunConfig with current environment values.
strict_llm: If True (default), raise when the active LLM deployment has
redacted secrets but is not in the current EVA_MODEL_LIST. Set to False
for metrics-only re-runs where the LLM is not needed.

Raises:
ValueError: If provider or alias differs for a service with redacted secrets.
ValueError: If provider or alias differs for a service with redacted secrets,
or (when strict_llm=True) if the active LLM deployment is missing.
"""
# ── model.*_params (STT / TTS / S2S / AudioLLM) ──
for params_field, provider_field in self._PARAMS_TO_PROVIDER.items():
Expand Down Expand Up @@ -649,7 +662,7 @@ def apply_env_overrides(self, live: "RunConfig") -> None:
continue
if name not in live_by_name:
active_llm = getattr(self.model, "llm", None)
if name == active_llm:
if name == active_llm and strict_llm:
raise ValueError(
f"Cannot restore secrets: deployment {name!r} not found in "
f"current EVA_MODEL_LIST (available: {list(live_by_name)})"
Expand Down
11 changes: 10 additions & 1 deletion src/eva/orchestrator/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -916,7 +916,16 @@ def from_existing_run(cls, run_dir: Path) -> "BenchmarkRunner":
if not config_path.exists():
raise FileNotFoundError(f"config.json not found in {run_dir}")

config = RunConfig.model_validate_json(config_path.read_text())
# Load the saved config without reading from env vars or .env file.
# This prevents conflicts when the current environment has a different pipeline
# mode set (e.g. EVA_MODEL__LLM in env but the saved run used S2S).
class _StoredRunConfig(RunConfig):
    """RunConfig variant populated solely from explicit constructor arguments.

    The overridden source hook returns only ``init_settings``, so whatever
    other settings sources are offered to it take no part in building the
    config. NOTE(review): this presumes ``RunConfig`` is a pydantic-settings
    model that honors ``settings_customise_sources`` — confirm.
    """

    @classmethod
    def settings_customise_sources(cls, settings_cls, init_settings, **_ignored_sources):
        """Return a one-element tuple holding just the init-kwargs source."""
        # Any further sources arrive as keyword arguments and are
        # deliberately dropped.
        only_init = (init_settings,)
        return only_init

config_data = json.loads(config_path.read_text())
config = _StoredRunConfig(**config_data)
runner = cls(config)
runner.output_dir = run_dir # Use existing output dir, don't create new
return runner
Expand Down
5 changes: 3 additions & 2 deletions src/eva/run_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,9 @@ async def run_benchmark(config: RunConfig) -> int:
logger.error(str(e))
return 1

# Apply env-dependent values (secrets, urls) from live env onto saved config
runner.config.apply_env_overrides(config)
# Apply env-dependent values (secrets, urls) from live env onto saved config.
# Skip strict LLM check when force-rerunning metrics only — the LLM is not needed.
runner.config.apply_env_overrides(config, strict_llm=not config.force_rerun_metrics)

# Apply CLI overrides
runner.config.max_rerun_attempts = config.max_rerun_attempts
Expand Down
Loading