From b263eeee4ff1aadf83b856256d98e74f446100ee Mon Sep 17 00:00:00 2001 From: "fanny.riols" Date: Wed, 15 Apr 2026 11:34:11 -0400 Subject: [PATCH 1/4] Preserve all metrics in metrics_summary.json on partial re-run When re-running a subset of metrics (e.g. --metrics response_speed --force-rerun-metrics), the summary now aggregates per_metric for all metrics found across records rather than only the re-run ones. Also merges metric_errors and pass_at_k_config from the existing file so unrelated fields are not lost. --- src/eva/metrics/runner.py | 39 ++++++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/src/eva/metrics/runner.py b/src/eva/metrics/runner.py index 892fcbe7..79b8cdde 100644 --- a/src/eva/metrics/runner.py +++ b/src/eva/metrics/runner.py @@ -792,14 +792,17 @@ async def _save_summary( if not all_metrics: return {} - metric_names = [m.name for m in self.metrics] + run_metric_names = [m.name for m in self.metrics] + # Aggregate per_metric for ALL metrics present across records (not just those just run), + # so that a partial re-run (e.g. --metrics response_speed) preserves other metrics. + all_metric_names = sorted({name for rm in all_metrics.values() for name in rm.metrics}) metric_aggregates = self._build_per_metric_aggregates( - all_metrics, metric_names, pass_at_k_results, self.num_draws + all_metrics, all_metric_names, pass_at_k_results, self.num_draws ) - # Compute metric failures for MetricsRunResult + # Compute metric failures for MetricsRunResult (only for metrics just run) metric_failures: dict[str, list[str]] = {} - for name in metric_names: + for name in run_metric_names: for record_id, record_metrics in all_metrics.items(): if name in record_metrics.metrics: score = record_metrics.metrics[name] @@ -809,14 +812,27 @@ async def _save_summary( # Compute EVA composite run-level aggregates overall_scores = compute_run_level_aggregates(all_metrics, self.num_draws) - # Build metric_errors for summary JSON (record IDs only; full errors are in per-record metrics.json) - metric_errors_summary: dict[str, dict[str, Any]] = {} + # Load existing summary to preserve fields for metrics not being re-run + summary_path = self.run_dir / "metrics_summary.json" + existing_summary: dict[str, Any] = {} + if summary_path.exists(): + try: + existing_summary = json.loads(summary_path.read_text()) + except Exception as e: + logger.warning(f"Failed to read existing metrics_summary.json: {e}") + + # Merge metric_errors: preserve existing errors for metrics not being re-run + merged_metric_errors: dict[str, dict[str, Any]] = dict(existing_summary.get("metric_errors") or {}) for metric_name, failed_record_ids in metric_failures.items(): - metric_errors_summary[metric_name] = { + merged_metric_errors[metric_name] = { "failed_count": len(failed_record_ids), "total_count": len(all_metrics), "failed_records": failed_record_ids, } + # Remove error entries for metrics that are now in run_metric_names but had no failures + for name in run_metric_names: + if name not in metric_failures: + merged_metric_errors.pop(name, None) data_quality = self._build_data_quality(all_metrics, metric_aggregates) @@ -827,8 +843,8 @@ async def _save_summary( "per_metric": metric_aggregates, } - if metric_errors_summary: - summary["metric_errors"] = metric_errors_summary + if merged_metric_errors: + summary["metric_errors"] = merged_metric_errors # Add pass@k configuration if applicable if pass_at_k_results: @@ -839,15 +855,16 @@ async def _save_summary( }, "exclude_metrics": sorted(m.name for m in self.metrics if m.exclude_from_pass_at_k), } + elif existing_summary.get("pass_at_k_config"): + summary["pass_at_k_config"] = existing_summary["pass_at_k_config"] try: run_config = json.loads((self.run_dir / "config.json").read_text()) - provenance = capture_metrics_provenance(metric_names, run_config=run_config) + provenance = capture_metrics_provenance(run_metric_names, run_config=run_config) summary["provenance"] = provenance.model_dump(mode="json") except Exception as e: logger.warning(f"Failed to capture metrics provenance: {e}") - summary_path = self.run_dir / "metrics_summary.json" summary_path.write_text(json.dumps(summary, indent=2)) logger.info(f"Metrics summary saved to {summary_path}") From 6456cf87aa53fe7442bf6ebc29fe955e9cbc1e5e Mon Sep 17 00:00:00 2001 From: "fanny.riols" Date: Wed, 15 Apr 2026 11:34:21 -0400 Subject: [PATCH 2/4] Skip active-LLM deployment check when force-rerunning metrics add strict_llm param to apply_env_overrides; pass strict_llm=False when --force-rerun-metrics is set so metrics-only re-runs on runs whose LLM deployment is no longer in EVA_MODEL_LIST don't fail --- src/eva/models/config.py | 13 ++++++++++--- src/eva/run_benchmark.py | 5 +++-- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/src/eva/models/config.py b/src/eva/models/config.py index f3885c54..fa82a277 100644 --- a/src/eva/models/config.py +++ b/src/eva/models/config.py @@ -576,14 +576,21 @@ def _redact_model_params(cls, model: ModelConfigUnion) -> dict: data[field_name] = cls._redact_dict(value) return data - def apply_env_overrides(self, live: "RunConfig") -> None: + def apply_env_overrides(self, live: "RunConfig", strict_llm: bool = True) -> None: """Apply environment-dependent values from *live* config onto this (saved) config. Restores redacted secrets (``***``) and overrides dynamic fields (``url``, ``urls``) in ``model.*_params`` and ``model_list[].litellm_params``. + Args: + live: The live RunConfig with current environment values. + strict_llm: If True (default), raise when the active LLM deployment has + redacted secrets but is not in the current EVA_MODEL_LIST. Set to False + for metrics-only re-runs where the LLM is not needed. + Raises: - ValueError: If provider or alias differs for a service with redacted secrets. + ValueError: If provider or alias differs for a service with redacted secrets, + or (when strict_llm=True) if the active LLM deployment is missing. """ # ── model.*_params (STT / TTS / S2S / AudioLLM) ── for params_field, provider_field in self._PARAMS_TO_PROVIDER.items(): @@ -649,7 +656,7 @@ def apply_env_overrides(self, live: "RunConfig") -> None: continue if name not in live_by_name: active_llm = getattr(self.model, "llm", None) - if name == active_llm: + if name == active_llm and strict_llm: raise ValueError( f"Cannot restore secrets: deployment {name!r} not found in " f"current EVA_MODEL_LIST (available: {list(live_by_name)})" diff --git a/src/eva/run_benchmark.py b/src/eva/run_benchmark.py index 49096448..8e581815 100644 --- a/src/eva/run_benchmark.py +++ b/src/eva/run_benchmark.py @@ -42,8 +42,9 @@ async def run_benchmark(config: RunConfig) -> int: logger.error(str(e)) return 1 - # Apply env-dependent values (secrets, urls) from live env onto saved config - runner.config.apply_env_overrides(config) + # Apply env-dependent values (secrets, urls) from live env onto saved config. + # Skip strict LLM check when force-rerunning metrics only — the LLM is not needed. + runner.config.apply_env_overrides(config, strict_llm=not config.force_rerun_metrics) # Apply CLI overrides runner.config.max_rerun_attempts = config.max_rerun_attempts From e577b40bebc351f2109d8e0acfb1b566a8139a92 Mon Sep 17 00:00:00 2001 From: "fanny.riols" Date: Wed, 15 Apr 2026 11:34:30 -0400 Subject: [PATCH 3/4] Skip None latency values in response_speed instead of crashing Turns with missing audio timestamps store None in per_turn_latency; guard against this in _compute_speed_stats and the main latency loop. Also rename section header to "Diagnostic & Validation Metrics". --- apps/analysis.py | 2 +- src/eva/metrics/diagnostic/response_speed.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/analysis.py b/apps/analysis.py index aba9c101..9542298b 100644 --- a/apps/analysis.py +++ b/apps/analysis.py @@ -1105,7 +1105,7 @@ def _show_subtable(heading: str, composites: list, metrics: list) -> None: _show_subtable("Accuracy Metrics (EVA-A)", eva_a_composites, accuracy_metrics) _show_subtable("Experience Metrics (EVA-X)", eva_x_composites, experience_metrics) - _show_subtable("Diagnostic & Other Metrics", [], other_metrics) + _show_subtable("Diagnostic & Validation Metrics", [], other_metrics) csv = summary_df.drop(columns=["label"]).to_csv(index=False) st.download_button("Download CSV", csv, file_name="cross_run_comparison.csv", mime="text/csv") diff --git a/src/eva/metrics/diagnostic/response_speed.py b/src/eva/metrics/diagnostic/response_speed.py index 1b2d5bee..da87ec95 100644 --- a/src/eva/metrics/diagnostic/response_speed.py +++ b/src/eva/metrics/diagnostic/response_speed.py @@ -58,7 +58,7 @@ def _compute_speed_stats(latencies: list[float]) -> dict | None: Returns None if no valid values remain after filtering. """ - valid = [v for v in latencies if 0 < v < 1000] + valid = [v for v in latencies if v is not None and 0 < v < 1000] if not valid: return None return { From d6fe6be2cebc922eb733f05de22fa7cbd379f28d Mon Sep 17 00:00:00 2001 From: "fanny.riols" Date: Wed, 15 Apr 2026 11:59:28 -0400 Subject: [PATCH 4/4] Fix RunConfig loading conflicts when env has multiple pipeline modes set from_existing_run now loads the saved config using only init_settings (no env vars / .env file), preventing the saved model config from being contaminated by the current environment's pipeline mode vars. Also skip the pipeline mode conflict check in _strip_other_mode_fields when --force-rerun-metrics is set, as the model config is unused. --- src/eva/models/config.py | 14 ++++++++++---- src/eva/orchestrator/runner.py | 11 ++++++++++- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/src/eva/models/config.py b/src/eva/models/config.py index fa82a277..51b58bd8 100644 --- a/src/eva/models/config.py +++ b/src/eva/models/config.py @@ -203,12 +203,16 @@ def is_audio_native_pipeline(model_data: dict | Any) -> bool: return False -def _strip_other_mode_fields(data: dict) -> dict: +def _strip_other_mode_fields(data: dict, strict: bool = True) -> dict: """Validate pipeline mode exclusivity, then strip irrelevant shared fields. - Raises ``ValueError`` if multiple pipeline modes are specified. + Raises ``ValueError`` if multiple pipeline modes are specified (when strict=True). Then strips shared fields (e.g. ``tts`` from S2S mode) so that ``extra="forbid"`` on each config class doesn't reject them. + + Args: + strict: If False, skip the conflict error (used for metrics-only re-runs + where the model config is not needed). """ # --- Mutual exclusivity: only one pipeline mode allowed --- has_llm = bool(data.get("llm") or data.get("llm_model")) @@ -223,7 +227,7 @@ def _strip_other_mode_fields(data: dict) -> dict: ] if flag ] - if len(active) > 1: + if len(active) > 1 and strict: raise ValueError( f"Multiple pipeline modes set: {', '.join(active)}. " f"Set exactly one of: EVA_MODEL__LLM (ASR-LLM-TTS), " @@ -483,8 +487,10 @@ def _warn_deprecated_aliases(cls, data: Any) -> Any: raise ValueError("Deprecated environment variables detected:\n" + "\n".join(found)) # Strip env-var fields from other pipeline modes so extra="forbid" doesn't reject them. + # For metrics-only re-runs, skip the strict conflict check — the model isn't used. if isinstance(data.get("model"), dict): - data["model"] = _strip_other_mode_fields(data["model"]) + force_rerun = bool(data.get("force_rerun_metrics")) + data["model"] = _strip_other_mode_fields(data["model"], strict=not force_rerun) return data diff --git a/src/eva/orchestrator/runner.py b/src/eva/orchestrator/runner.py index 590d0828..be2f94b7 100644 --- a/src/eva/orchestrator/runner.py +++ b/src/eva/orchestrator/runner.py @@ -916,7 +916,16 @@ def from_existing_run(cls, run_dir: Path) -> "BenchmarkRunner": if not config_path.exists(): raise FileNotFoundError(f"config.json not found in {run_dir}") - config = RunConfig.model_validate_json(config_path.read_text()) + # Load the saved config without reading from env vars or .env file. + # This prevents conflicts when the current environment has a different pipeline + # mode set (e.g. EVA_MODEL__LLM in env but the saved run used S2S). + class _StoredRunConfig(RunConfig): + @classmethod + def settings_customise_sources(cls, settings_cls, init_settings, **kwargs): + return (init_settings,) + + config_data = json.loads(config_path.read_text()) + config = _StoredRunConfig(**config_data) runner = cls(config) runner.output_dir = run_dir # Use existing output dir, don't create new return runner