From c6ade7e7a89edac8cbe44266a1779e521bd52856 Mon Sep 17 00:00:00 2001
From: Claude
Date: Sat, 7 Feb 2026 03:21:12 +0000
Subject: [PATCH] Fix numerically unstable variance calculation in CapacityEnvelope.from_values()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The computational variance formula E[X²] - E[X]² suffers from catastrophic
floating-point cancellation when capacity values are large or nearly
identical. This produced silently wrong stdev values (e.g., 41 million
instead of 0 for identical values) or complex numbers when the computed
variance went negative.

Replace with the numerically stable two-pass formula sum((x - mean)²) / n,
iterating over the frequency map for efficiency with duplicate values.

https://claude.ai/code/session_01BH7FXdY35eRtf98jo8kQiG
---
 ngraph/results/artifacts.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/ngraph/results/artifacts.py b/ngraph/results/artifacts.py
index 0309cc5..47cda78 100644
--- a/ngraph/results/artifacts.py
+++ b/ngraph/results/artifacts.py
@@ -74,10 +74,9 @@ def from_values(
         if not values:
             raise ValueError("Cannot create envelope from empty values list")
 
-        # Single pass to calculate everything efficiently
+        # First pass: build frequency map and compute mean
         frequencies = {}
         total_sum = 0.0
-        sum_squares = 0.0
         min_capacity = float("inf")
         max_capacity = float("-inf")
 
@@ -87,7 +86,6 @@ def from_values(
 
             # Update statistics
             total_sum += value
-            sum_squares += value * value
             min_capacity = min(min_capacity, value)
             max_capacity = max(max_capacity, value)
 
@@ -95,9 +93,15 @@ def from_values(
         n = len(values)
         mean_capacity = total_sum / n
 
-        # Use computational formula for variance: Var(X) = E[X²] - (E[X])²
-        variance = (sum_squares / n) - (mean_capacity * mean_capacity)
-        stdev_capacity = variance**0.5
+        # Second pass over unique values: compute variance using the
+        # numerically stable formula sum((x - mean)^2) / n.
+        # Iterating over the frequency map is efficient when there are
+        # many duplicate values (common in Monte Carlo results).
+        variance_sum = 0.0
+        for value, count in frequencies.items():
+            diff = value - mean_capacity
+            variance_sum += count * diff * diff
+        stdev_capacity = (variance_sum / n) ** 0.5
 
         # Process flow summaries if provided
         flow_summary_stats = {}