From b263eeee4ff1aadf83b856256d98e74f446100ee Mon Sep 17 00:00:00 2001
From: "fanny.riols" <fanny.riols@servicenow.com>
Date: Wed, 15 Apr 2026 11:34:11 -0400
Subject: [PATCH 1/4] Preserve all metrics in metrics_summary.json on partial
 re-run

When re-running a subset of metrics (e.g. --metrics response_speed
--force-rerun-metrics), the summary now aggregates per_metric for all
metrics found across records rather than only the re-run ones. Also
merges metric_errors and pass_at_k_config from the existing file so
unrelated fields are not lost.
---
 src/eva/metrics/runner.py | 39 ++++++++++++++++++++++++++++-----------
 1 file changed, 28 insertions(+), 11 deletions(-)

diff --git a/src/eva/metrics/runner.py b/src/eva/metrics/runner.py
index 892fcbe7..79b8cdde 100644
--- a/src/eva/metrics/runner.py
+++ b/src/eva/metrics/runner.py
@@ -792,14 +792,17 @@ async def _save_summary(
         if not all_metrics:
             return {}
 
-        metric_names = [m.name for m in self.metrics]
+        run_metric_names = [m.name for m in self.metrics]
+        # Aggregate per_metric for ALL metrics present across records (not just those just run),
+        # so that a partial re-run (e.g. --metrics response_speed) preserves other metrics.
+        all_metric_names = sorted({name for rm in all_metrics.values() for name in rm.metrics})
         metric_aggregates = self._build_per_metric_aggregates(
-            all_metrics, metric_names, pass_at_k_results, self.num_draws
+            all_metrics, all_metric_names, pass_at_k_results, self.num_draws
         )
 
-        # Compute metric failures for MetricsRunResult
+        # Compute metric failures for MetricsRunResult (only for metrics just run)
         metric_failures: dict[str, list[str]] = {}
-        for name in metric_names:
+        for name in run_metric_names:
             for record_id, record_metrics in all_metrics.items():
                 if name in record_metrics.metrics:
                     score = record_metrics.metrics[name]
@@ -809,14 +812,27 @@ async def _save_summary(
         # Compute EVA composite run-level aggregates
         overall_scores = compute_run_level_aggregates(all_metrics, self.num_draws)
 
-        # Build metric_errors for summary JSON (record IDs only; full errors are in per-record metrics.json)
-        metric_errors_summary: dict[str, dict[str, Any]] = {}
+        # Load existing summary to preserve fields for metrics not being re-run
+        summary_path = self.run_dir / "metrics_summary.json"
+        existing_summary: dict[str, Any] = {}
+        if summary_path.exists():
+            try:
+                existing_summary = json.loads(summary_path.read_text())
+            except Exception as e:
+                logger.warning(f"Failed to read existing metrics_summary.json: {e}")
+
+        # Merge metric_errors: preserve existing errors for metrics not being re-run
+        merged_metric_errors: dict[str, dict[str, Any]] = dict(existing_summary.get("metric_errors") or {})
         for metric_name, failed_record_ids in metric_failures.items():
-            metric_errors_summary[metric_name] = {
+            merged_metric_errors[metric_name] = {
                 "failed_count": len(failed_record_ids),
                 "total_count": len(all_metrics),
                 "failed_records": failed_record_ids,
             }
+        # Remove error entries for metrics that are now in run_metric_names but had no failures
+        for name in run_metric_names:
+            if name not in metric_failures:
+                merged_metric_errors.pop(name, None)
 
         data_quality = self._build_data_quality(all_metrics, metric_aggregates)
 
@@ -827,8 +843,8 @@ async def _save_summary(
             "per_metric": metric_aggregates,
         }
 
-        if metric_errors_summary:
-            summary["metric_errors"] = metric_errors_summary
+        if merged_metric_errors:
+            summary["metric_errors"] = merged_metric_errors
 
         # Add pass@k configuration if applicable
         if pass_at_k_results:
@@ -839,15 +855,16 @@ async def _save_summary(
                 },
                 "exclude_metrics": sorted(m.name for m in self.metrics if m.exclude_from_pass_at_k),
             }
+        elif existing_summary.get("pass_at_k_config"):
+            summary["pass_at_k_config"] = existing_summary["pass_at_k_config"]
 
         try:
             run_config = json.loads((self.run_dir / "config.json").read_text())
-            provenance = capture_metrics_provenance(metric_names, run_config=run_config)
+            provenance = capture_metrics_provenance(run_metric_names, run_config=run_config)
             summary["provenance"] = provenance.model_dump(mode="json")
         except Exception as e:
             logger.warning(f"Failed to capture metrics provenance: {e}")
 
-        summary_path = self.run_dir / "metrics_summary.json"
         summary_path.write_text(json.dumps(summary, indent=2))
 
         logger.info(f"Metrics summary saved to {summary_path}")

From 6456cf87aa53fe7442bf6ebc29fe955e9cbc1e5e Mon Sep 17 00:00:00 2001
From: "fanny.riols" <fanny.riols@servicenow.com>
Date: Wed, 15 Apr 2026 11:34:21 -0400
Subject: [PATCH 2/4] Skip active-LLM deployment check when force-rerunning
 metrics

Add a strict_llm parameter to apply_env_overrides and pass
strict_llm=False when --force-rerun-metrics is set, so that metrics-only
re-runs of runs whose LLM deployment is no longer in EVA_MODEL_LIST
don't fail.
---
 src/eva/models/config.py | 13 ++++++++++---
 src/eva/run_benchmark.py |  5 +++--
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/src/eva/models/config.py b/src/eva/models/config.py
index f3885c54..fa82a277 100644
--- a/src/eva/models/config.py
+++ b/src/eva/models/config.py
@@ -576,14 +576,21 @@ def _redact_model_params(cls, model: ModelConfigUnion) -> dict:
                 data[field_name] = cls._redact_dict(value)
         return data
 
-    def apply_env_overrides(self, live: "RunConfig") -> None:
+    def apply_env_overrides(self, live: "RunConfig", strict_llm: bool = True) -> None:
         """Apply environment-dependent values from *live* config onto this (saved) config.
 
         Restores redacted secrets (``***``) and overrides dynamic fields (``url``,
         ``urls``) in ``model.*_params`` and ``model_list[].litellm_params``.
 
+        Args:
+            live: The live RunConfig with current environment values.
+            strict_llm: If True (default), raise when the active LLM deployment has
+                redacted secrets but is not in the current EVA_MODEL_LIST. Set to False
+                for metrics-only re-runs where the LLM is not needed.
+
         Raises:
-            ValueError: If provider or alias differs for a service with redacted secrets.
+            ValueError: If provider or alias differs for a service with redacted secrets,
+                or (when strict_llm=True) if the active LLM deployment is missing.
         """
         # ── model.*_params (STT / TTS / S2S / AudioLLM) ──
         for params_field, provider_field in self._PARAMS_TO_PROVIDER.items():
@@ -649,7 +656,7 @@ def apply_env_overrides(self, live: "RunConfig") -> None:
                 continue
             if name not in live_by_name:
                 active_llm = getattr(self.model, "llm", None)
-                if name == active_llm:
+                if name == active_llm and strict_llm:
                     raise ValueError(
                         f"Cannot restore secrets: deployment {name!r} not found in "
                         f"current EVA_MODEL_LIST (available: {list(live_by_name)})"
diff --git a/src/eva/run_benchmark.py b/src/eva/run_benchmark.py
index 49096448..8e581815 100644
--- a/src/eva/run_benchmark.py
+++ b/src/eva/run_benchmark.py
@@ -42,8 +42,9 @@ async def run_benchmark(config: RunConfig) -> int:
             logger.error(str(e))
             return 1
 
-        # Apply env-dependent values (secrets, urls) from live env onto saved config
-        runner.config.apply_env_overrides(config)
+        # Apply env-dependent values (secrets, urls) from live env onto saved config.
+        # Skip strict LLM check when force-rerunning metrics only — the LLM is not needed.
+        runner.config.apply_env_overrides(config, strict_llm=not config.force_rerun_metrics)
 
         # Apply CLI overrides
         runner.config.max_rerun_attempts = config.max_rerun_attempts

From e577b40bebc351f2109d8e0acfb1b566a8139a92 Mon Sep 17 00:00:00 2001
From: "fanny.riols" <fanny.riols@servicenow.com>
Date: Wed, 15 Apr 2026 11:34:30 -0400
Subject: [PATCH 3/4] Skip None latency values in response_speed instead of
 crashing

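Turns with missing audio timestamps store None in per_turn_latency;
filter these out in _compute_speed_stats before the range check, so the
`0 < v < 1000` comparison no longer raises on None.
Also rename the analysis app section header from "Diagnostic & Other
Metrics" to "Diagnostic & Validation Metrics".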
---
 apps/analysis.py                             | 2 +-
 src/eva/metrics/diagnostic/response_speed.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/apps/analysis.py b/apps/analysis.py
index aba9c101..9542298b 100644
--- a/apps/analysis.py
+++ b/apps/analysis.py
@@ -1105,7 +1105,7 @@ def _show_subtable(heading: str, composites: list, metrics: list) -> None:
 
     _show_subtable("Accuracy Metrics (EVA-A)", eva_a_composites, accuracy_metrics)
     _show_subtable("Experience Metrics (EVA-X)", eva_x_composites, experience_metrics)
-    _show_subtable("Diagnostic & Other Metrics", [], other_metrics)
+    _show_subtable("Diagnostic & Validation Metrics", [], other_metrics)
 
     csv = summary_df.drop(columns=["label"]).to_csv(index=False)
     st.download_button("Download CSV", csv, file_name="cross_run_comparison.csv", mime="text/csv")
diff --git a/src/eva/metrics/diagnostic/response_speed.py b/src/eva/metrics/diagnostic/response_speed.py
index 1b2d5bee..da87ec95 100644
--- a/src/eva/metrics/diagnostic/response_speed.py
+++ b/src/eva/metrics/diagnostic/response_speed.py
@@ -58,7 +58,7 @@ def _compute_speed_stats(latencies: list[float]) -> dict | None:
 
     Returns None if no valid values remain after filtering.
     """
-    valid = [v for v in latencies if 0 < v < 1000]
+    valid = [v for v in latencies if v is not None and 0 < v < 1000]
     if not valid:
         return None
     return {

From d6fe6be2cebc922eb733f05de22fa7cbd379f28d Mon Sep 17 00:00:00 2001
From: "fanny.riols" <fanny.riols@servicenow.com>
Date: Wed, 15 Apr 2026 11:59:28 -0400
Subject: [PATCH 4/4] Fix RunConfig loading conflicts when env has multiple
 pipeline modes set

from_existing_run now loads the saved config using only init_settings
(no env vars / .env file), preventing the saved model config from
being contaminated by the current environment's pipeline mode vars.

Also skip the pipeline mode conflict check in _strip_other_mode_fields
when --force-rerun-metrics is set, as the model config is unused.
---
 src/eva/models/config.py       | 14 ++++++++++----
 src/eva/orchestrator/runner.py | 11 ++++++++++-
 2 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/src/eva/models/config.py b/src/eva/models/config.py
index fa82a277..51b58bd8 100644
--- a/src/eva/models/config.py
+++ b/src/eva/models/config.py
@@ -203,12 +203,16 @@ def is_audio_native_pipeline(model_data: dict | Any) -> bool:
     return False
 
 
-def _strip_other_mode_fields(data: dict) -> dict:
+def _strip_other_mode_fields(data: dict, strict: bool = True) -> dict:
     """Validate pipeline mode exclusivity, then strip irrelevant shared fields.
 
-    Raises ``ValueError`` if multiple pipeline modes are specified.
+    Raises ``ValueError`` if multiple pipeline modes are specified (when strict=True).
     Then strips shared fields (e.g. ``tts`` from S2S mode) so that
     ``extra="forbid"`` on each config class doesn't reject them.
+
+    Args:
+        strict: If False, skip the conflict error (used for metrics-only re-runs
+            where the model config is not needed).
     """
     # --- Mutual exclusivity: only one pipeline mode allowed ---
     has_llm = bool(data.get("llm") or data.get("llm_model"))
@@ -223,7 +227,7 @@ def _strip_other_mode_fields(data: dict) -> dict:
         ]
         if flag
     ]
-    if len(active) > 1:
+    if len(active) > 1 and strict:
         raise ValueError(
             f"Multiple pipeline modes set: {', '.join(active)}. "
             f"Set exactly one of: EVA_MODEL__LLM (ASR-LLM-TTS), "
@@ -483,8 +487,10 @@ def _warn_deprecated_aliases(cls, data: Any) -> Any:
             raise ValueError("Deprecated environment variables detected:\n" + "\n".join(found))
 
         # Strip env-var fields from other pipeline modes so extra="forbid" doesn't reject them.
+        # For metrics-only re-runs, skip the strict conflict check — the model isn't used.
         if isinstance(data.get("model"), dict):
-            data["model"] = _strip_other_mode_fields(data["model"])
+            force_rerun = bool(data.get("force_rerun_metrics"))
+            data["model"] = _strip_other_mode_fields(data["model"], strict=not force_rerun)
 
         return data
 
diff --git a/src/eva/orchestrator/runner.py b/src/eva/orchestrator/runner.py
index 590d0828..be2f94b7 100644
--- a/src/eva/orchestrator/runner.py
+++ b/src/eva/orchestrator/runner.py
@@ -916,7 +916,16 @@ def from_existing_run(cls, run_dir: Path) -> "BenchmarkRunner":
         if not config_path.exists():
             raise FileNotFoundError(f"config.json not found in {run_dir}")
 
-        config = RunConfig.model_validate_json(config_path.read_text())
+        # Load the saved config without reading from env vars or .env file.
+        # This prevents conflicts when the current environment has a different pipeline
+        # mode set (e.g. EVA_MODEL__LLM in env but the saved run used S2S).
+        class _StoredRunConfig(RunConfig):
+            @classmethod
+            def settings_customise_sources(cls, settings_cls, init_settings, **kwargs):
+                return (init_settings,)
+
+        config_data = json.loads(config_path.read_text())
+        config = _StoredRunConfig(**config_data)
         runner = cls(config)
         runner.output_dir = run_dir  # Use existing output dir, don't create new
         return runner