From cde61c4f0bb4bcd73c9ce5349983c1b54b1baa5e Mon Sep 17 00:00:00 2001
From: Adam Gutglick <adam@spiraldb.com>
Date: Mon, 2 Mar 2026 16:29:16 +0000
Subject: [PATCH 1/9] Show absolute z-score on benchmarks

Signed-off-by: Adam Gutglick <adam@spiraldb.com>
---
 scripts/compare-benchmark-jsons.py | 29 +++++++++++++++++++----------
 vortex-bench/src/measurements.rs   | 26 ++++++++++++++++++++++++++
 2 files changed, 45 insertions(+), 10 deletions(-)
diff --git a/scripts/compare-benchmark-jsons.py b/scripts/compare-benchmark-jsons.py
index 1802325c4cd..5e01ffaddd1 100644
--- a/scripts/compare-benchmark-jsons.py
+++ b/scripts/compare-benchmark-jsons.py
@@ -63,6 +63,10 @@ def extract_dataset_key(df):
 improvement_threshold = 1.0 - (threshold_pct / 100.0)  # e.g., 0.7 for 30%, 0.9 for 10%
 regression_threshold = 1.0 + (threshold_pct / 100.0)  # e.g., 1.3 for 30%, 1.1 for 10%
 
+# After merge with suffixes, z-score columns become abs_z_score_base and abs_z_score_pr
+has_z_base = "abs_z_score_base" in df3.columns
+has_z_pr = "abs_z_score_pr" in df3.columns
+
 # Generate summary statistics
 df3["ratio"] = df3["value_pr"] / df3["value_base"]
 df3["remark"] = pd.Series([""] * len(df3))
@@ -183,16 +187,21 @@ def format_performance(ratio, target_name):
     )
 
 # Build table
-table_df = pd.DataFrame(
-    {
-        "name": df3["name"],
-        f"PR {pr_commit_id[:8]}": df3["value_pr"],
-        f"base {base_commit_id[:8]}": df3["value_base"],
-        "ratio (PR/base)": df3["ratio"],
-        "unit": df3["unit_base"],
-        "remark": df3["remark"],
-    }
-)
+table_dict = {
+    "name": df3["name"],
+    f"PR {pr_commit_id[:8]}": df3["value_pr"],
+    f"base {base_commit_id[:8]}": df3["value_base"],
+    "ratio (PR/base)": df3["ratio"],
+    "unit": df3["unit_base"],
+}
+
+if has_z_pr:
+    table_dict["|z| PR"] = df3["abs_z_score_pr"]
+if has_z_base:
+    table_dict["|z| base"] = df3["abs_z_score_base"]
+
+table_dict["remark"] = df3["remark"]
+table_df = pd.DataFrame(table_dict)
 
 # Output complete formatted markdown
 print("\n".join(summary_lines))
diff --git a/vortex-bench/src/measurements.rs b/vortex-bench/src/measurements.rs
index f49349cd95e..af8a05cca5f 100644
--- a/vortex-bench/src/measurements.rs
+++ b/vortex-bench/src/measurements.rs
@@ -272,6 +272,27 @@ impl QueryMeasurement {
             )
         }
     }
+
+    /// Compute |z-score| = |median - mean| / stddev for the runs.
+    /// Returns `None` if fewer than 2 runs (stddev is undefined).
+    pub fn abs_z_score(&self) -> Option<f64> {
+        let n = self.runs.len();
+        if n < 2 {
+            return None;
+        }
+
+        let nanos: Vec<f64> = self.runs.iter().map(|d| d.as_nanos() as f64).collect();
+        let mean = nanos.iter().sum::<f64>() / n as f64;
+        let variance = nanos.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / (n - 1) as f64;
+        let stddev = variance.sqrt();
+
+        if stddev == 0.0 {
+            return Some(0.0);
+        }
+
+        let median = self.median_run().as_nanos() as f64;
+        Some(((median - mean) / stddev).abs())
+    }
 }
 
 #[derive(Serialize, Deserialize)]
@@ -282,6 +303,10 @@ pub struct QueryMeasurementJson {
     pub unit: String,
     pub value: u128,
     pub all_runtimes: Vec<u128>,
+    /// Absolute z-score of the median relative to the mean: |median - mean| / stddev.
+    /// Indicates how representative the reported median is. `None` when fewer than 2 runs.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub abs_z_score: Option<f64>,
     pub target: Target,
     pub commit_id: String,
     pub env_triple: TripleJson,
@@ -313,6 +338,7 @@ impl ToJson for QueryMeasurement {
             unit: "ns".to_string(),
             value: self.median_run().as_nanos(),
             all_runtimes: self.runs.iter().map(|r| r.as_nanos()).collect_vec(),
+            abs_z_score: self.abs_z_score(),
             commit_id: GIT_COMMIT_ID.to_string(),
             target: self.target,
             env_triple: TripleJson {

From 805608dab652988e69542581134205b146859202 Mon Sep 17 00:00:00 2001
From: Adam Gutglick <adam@spiraldb.com>
Date: Mon, 2 Mar 2026 16:49:42 +0000
Subject: [PATCH 2/9] Compute abs z-score in the Python comparison script instead of Rust

Signed-off-by: Adam Gutglick <adam@spiraldb.com>
---
 scripts/compare-benchmark-jsons.py | 28 +++++++++++++++++++++++++---
 vortex-bench/src/measurements.rs   | 26 --------------------------
 2 files changed, 25 insertions(+), 29 deletions(-)

diff --git a/scripts/compare-benchmark-jsons.py b/scripts/compare-benchmark-jsons.py
index 5e01ffaddd1..81385371faa 100644
--- a/scripts/compare-benchmark-jsons.py
+++ b/scripts/compare-benchmark-jsons.py
@@ -63,9 +63,31 @@ def extract_dataset_key(df):
 improvement_threshold = 1.0 - (threshold_pct / 100.0)  # e.g., 0.7 for 30%, 0.9 for 10%
 regression_threshold = 1.0 + (threshold_pct / 100.0)  # e.g., 1.3 for 30%, 1.1 for 10%
 
-# After merge with suffixes, z-score columns become abs_z_score_base and abs_z_score_pr
-has_z_base = "abs_z_score_base" in df3.columns
-has_z_pr = "abs_z_score_pr" in df3.columns
+def compute_abs_z_score(runtimes):
+    """Compute |median - mean| / stddev from a list of runtimes."""
+    if not isinstance(runtimes, list) or len(runtimes) < 2:
+        return float("nan")
+    n = len(runtimes)
+    mean = sum(runtimes) / n
+    variance = sum((x - mean) ** 2 for x in runtimes) / (n - 1)
+    stddev = math.sqrt(variance)
+    if stddev == 0:
+        return 0.0
+    sorted_rt = sorted(runtimes)
+    if n % 2 == 1:
+        median = sorted_rt[n // 2]
+    else:
+        median = (sorted_rt[n // 2 - 1] + sorted_rt[n // 2]) / 2
+    return abs((median - mean) / stddev)
+
+
+# Compute |z-score| from all_runtimes when available
+has_z_pr = "all_runtimes_pr" in df3.columns
+has_z_base = "all_runtimes_base" in df3.columns
+if has_z_pr:
+    df3["abs_z_score_pr"] = df3["all_runtimes_pr"].apply(compute_abs_z_score)
+if has_z_base:
+    df3["abs_z_score_base"] = df3["all_runtimes_base"].apply(compute_abs_z_score)
 
 # Generate summary statistics
 df3["ratio"] = df3["value_pr"] / df3["value_base"]
diff --git a/vortex-bench/src/measurements.rs b/vortex-bench/src/measurements.rs
index af8a05cca5f..f49349cd95e 100644
--- a/vortex-bench/src/measurements.rs
+++ b/vortex-bench/src/measurements.rs
@@ -272,27 +272,6 @@ impl QueryMeasurement {
             )
         }
     }
-
-    /// Compute |z-score| = |median - mean| / stddev for the runs.
-    /// Returns `None` if fewer than 2 runs (stddev is undefined).
-    pub fn abs_z_score(&self) -> Option<f64> {
-        let n = self.runs.len();
-        if n < 2 {
-            return None;
-        }
-
-        let nanos: Vec<f64> = self.runs.iter().map(|d| d.as_nanos() as f64).collect();
-        let mean = nanos.iter().sum::<f64>() / n as f64;
-        let variance = nanos.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / (n - 1) as f64;
-        let stddev = variance.sqrt();
-
-        if stddev == 0.0 {
-            return Some(0.0);
-        }
-
-        let median = self.median_run().as_nanos() as f64;
-        Some(((median - mean) / stddev).abs())
-    }
 }
 
 #[derive(Serialize, Deserialize)]
@@ -303,10 +282,6 @@ pub struct QueryMeasurementJson {
     pub unit: String,
     pub value: u128,
     pub all_runtimes: Vec<u128>,
-    /// Absolute z-score of the median relative to the mean: |median - mean| / stddev.
-    /// Indicates how representative the reported median is. `None` when fewer than 2 runs.
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub abs_z_score: Option<f64>,
     pub target: Target,
     pub commit_id: String,
     pub env_triple: TripleJson,
@@ -338,7 +313,6 @@ impl ToJson for QueryMeasurement {
             unit: "ns".to_string(),
             value: self.median_run().as_nanos(),
             all_runtimes: self.runs.iter().map(|r| r.as_nanos()).collect_vec(),
-            abs_z_score: self.abs_z_score(),
             commit_id: GIT_COMMIT_ID.to_string(),
             target: self.target,
             env_triple: TripleJson {

From 8368456a04b11f7806468f32f16d1774655d206a Mon Sep 17 00:00:00 2001
From: Adam Gutglick <adam@spiraldb.com>
Date: Mon, 2 Mar 2026 16:53:42 +0000
Subject: [PATCH 3/9] ruff format

Signed-off-by: Adam Gutglick <adam@spiraldb.com>
---
 scripts/compare-benchmark-jsons.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scripts/compare-benchmark-jsons.py b/scripts/compare-benchmark-jsons.py
index 81385371faa..9b0d556cbe9 100644
--- a/scripts/compare-benchmark-jsons.py
+++ b/scripts/compare-benchmark-jsons.py
@@ -63,6 +63,7 @@ def extract_dataset_key(df):
 improvement_threshold = 1.0 - (threshold_pct / 100.0)  # e.g., 0.7 for 30%, 0.9 for 10%
 regression_threshold = 1.0 + (threshold_pct / 100.0)  # e.g., 1.3 for 30%, 1.1 for 10%
 
+
 def compute_abs_z_score(runtimes):
     """Compute |median - mean| / stddev from a list of runtimes."""
     if not isinstance(runtimes, list) or len(runtimes) < 2:

From e860ced2967d18c66b36d0c6d3d0ef9575b548d9 Mon Sep 17 00:00:00 2001
From: Adam Gutglick <adam@spiraldb.com>
Date: Mon, 2 Mar 2026 17:06:08 +0000
Subject: [PATCH 4/9] Fix z-score table formatting

Signed-off-by: Adam Gutglick <adam@spiraldb.com>
---
 scripts/compare-benchmark-jsons.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/compare-benchmark-jsons.py b/scripts/compare-benchmark-jsons.py
index 9b0d556cbe9..cc06a544e24 100644
--- a/scripts/compare-benchmark-jsons.py
+++ b/scripts/compare-benchmark-jsons.py
@@ -219,9 +219,9 @@ def format_performance(ratio, target_name):
 }
 
 if has_z_pr:
-    table_dict["|z| PR"] = df3["abs_z_score_pr"]
+    table_dict["abs(z-score) PR"] = df3["abs_z_score_pr"]
 if has_z_base:
-    table_dict["|z| base"] = df3["abs_z_score_base"]
+    table_dict["abs(z-score) base"] = df3["abs_z_score_base"]
 
 table_dict["remark"] = df3["remark"]
 table_df = pd.DataFrame(table_dict)

From ef308c41417a3ac9dd8b887ad520a808031746eb Mon Sep 17 00:00:00 2001
From: Adam Gutglick <adam@spiraldb.com>
Date: Thu, 5 Mar 2026 14:52:54 +0000
Subject: [PATCH 5/9] variance instead of z score

Signed-off-by: Adam Gutglick <adam@spiraldb.com>
---
 scripts/compare-benchmark-jsons.py | 25 ++++++++-----------------
 1 file changed, 8 insertions(+), 17 deletions(-)

diff --git a/scripts/compare-benchmark-jsons.py b/scripts/compare-benchmark-jsons.py
index cc06a544e24..d6545b9fb60 100644
--- a/scripts/compare-benchmark-jsons.py
+++ b/scripts/compare-benchmark-jsons.py
@@ -64,31 +64,22 @@ def extract_dataset_key(df):
 regression_threshold = 1.0 + (threshold_pct / 100.0)  # e.g., 1.3 for 30%, 1.1 for 10%
 
 
-def compute_abs_z_score(runtimes):
-    """Compute |median - mean| / stddev from a list of runtimes."""
+def compute_variance(runtimes):
+    """Compute sample variance from a list of runtimes."""
     if not isinstance(runtimes, list) or len(runtimes) < 2:
         return float("nan")
     n = len(runtimes)
     mean = sum(runtimes) / n
-    variance = sum((x - mean) ** 2 for x in runtimes) / (n - 1)
-    stddev = math.sqrt(variance)
-    if stddev == 0:
-        return 0.0
-    sorted_rt = sorted(runtimes)
-    if n % 2 == 1:
-        median = sorted_rt[n // 2]
-    else:
-        median = (sorted_rt[n // 2 - 1] + sorted_rt[n // 2]) / 2
-    return abs((median - mean) / stddev)
+    return sum((x - mean) ** 2 for x in runtimes) / (n - 1)
 
 
-# Compute |z-score| from all_runtimes when available
+# Compute variance from all_runtimes when available
 has_z_pr = "all_runtimes_pr" in df3.columns
 has_z_base = "all_runtimes_base" in df3.columns
 if has_z_pr:
-    df3["abs_z_score_pr"] = df3["all_runtimes_pr"].apply(compute_abs_z_score)
+    df3["variance_pr"] = df3["all_runtimes_pr"].apply(compute_variance)
 if has_z_base:
-    df3["abs_z_score_base"] = df3["all_runtimes_base"].apply(compute_abs_z_score)
+    df3["variance_base"] = df3["all_runtimes_base"].apply(compute_variance)
 
 # Generate summary statistics
 df3["ratio"] = df3["value_pr"] / df3["value_base"]
@@ -219,9 +210,9 @@ def format_performance(ratio, target_name):
 }
 
 if has_z_pr:
-    table_dict["abs(z-score) PR"] = df3["abs_z_score_pr"]
+    table_dict["variance PR"] = df3["variance_pr"]
 if has_z_base:
-    table_dict["abs(z-score) base"] = df3["abs_z_score_base"]
+    table_dict["variance base"] = df3["variance_base"]
 
 table_dict["remark"] = df3["remark"]
 table_df = pd.DataFrame(table_dict)

From 284722f1c110a115b991f5560873f2aa2b9782b5 Mon Sep 17 00:00:00 2001
From: Adam Gutglick <adam@spiraldb.com>
Date: Fri, 6 Mar 2026 13:30:42 +0000
Subject: [PATCH 6/9] Report coefficient of variation (CV%) instead of variance

Signed-off-by: Adam Gutglick <adam@spiraldb.com>
---
 scripts/compare-benchmark-jsons.py | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/scripts/compare-benchmark-jsons.py b/scripts/compare-benchmark-jsons.py
index d6545b9fb60..499294bfd42 100644
--- a/scripts/compare-benchmark-jsons.py
+++ b/scripts/compare-benchmark-jsons.py
@@ -64,22 +64,25 @@ def extract_dataset_key(df):
 regression_threshold = 1.0 + (threshold_pct / 100.0)  # e.g., 1.3 for 30%, 1.1 for 10%
 
 
-def compute_variance(runtimes):
-    """Compute sample variance from a list of runtimes."""
+def compute_cv_pct(runtimes):
+    """Compute coefficient of variation (std_dev / mean * 100) as a percentage."""
     if not isinstance(runtimes, list) or len(runtimes) < 2:
         return float("nan")
     n = len(runtimes)
     mean = sum(runtimes) / n
-    return sum((x - mean) ** 2 for x in runtimes) / (n - 1)
+    if mean == 0:
+        return float("nan")
+    variance = sum((x - mean) ** 2 for x in runtimes) / (n - 1)
+    return (variance**0.5 / mean) * 100
 
 
-# Compute variance from all_runtimes when available
+# Compute CV% from all_runtimes when available
 has_z_pr = "all_runtimes_pr" in df3.columns
 has_z_base = "all_runtimes_base" in df3.columns
 if has_z_pr:
-    df3["variance_pr"] = df3["all_runtimes_pr"].apply(compute_variance)
+    df3["cv_pct_pr"] = df3["all_runtimes_pr"].apply(compute_cv_pct)
 if has_z_base:
-    df3["variance_base"] = df3["all_runtimes_base"].apply(compute_variance)
+    df3["cv_pct_base"] = df3["all_runtimes_base"].apply(compute_cv_pct)
 
 # Generate summary statistics
 df3["ratio"] = df3["value_pr"] / df3["value_base"]
@@ -210,9 +213,9 @@ def format_performance(ratio, target_name):
 }
 
 if has_z_pr:
-    table_dict["variance PR"] = df3["variance_pr"]
+    table_dict["CV% PR"] = df3["cv_pct_pr"]
 if has_z_base:
-    table_dict["variance base"] = df3["variance_base"]
+    table_dict["CV% base"] = df3["cv_pct_base"]
 
 table_dict["remark"] = df3["remark"]
 table_df = pd.DataFrame(table_dict)

From 0c50cadadcdabd2873c998a6891865ac82e2b7fe Mon Sep 17 00:00:00 2001
From: Adam Gutglick <adam@spiraldb.com>
Date: Tue, 10 Mar 2026 17:28:55 +0000
Subject: [PATCH 7/9] Add run-drift metrics, noise labels, and ms display

Signed-off-by: Adam Gutglick <adam@spiraldb.com>
---
 scripts/compare-benchmark-jsons.py | 116 +++++++++++++++++++++++++++--
 1 file changed, 109 insertions(+), 7 deletions(-)

diff --git a/scripts/compare-benchmark-jsons.py b/scripts/compare-benchmark-jsons.py
index 499294bfd42..b1ddcf66a20 100644
--- a/scripts/compare-benchmark-jsons.py
+++ b/scripts/compare-benchmark-jsons.py
@@ -83,6 +83,24 @@ def compute_cv_pct(runtimes):
     df3["cv_pct_pr"] = df3["all_runtimes_pr"].apply(compute_cv_pct)
 if has_z_base:
     df3["cv_pct_base"] = df3["all_runtimes_base"].apply(compute_cv_pct)
+if has_z_pr or has_z_base:
+    cv_columns = [column for column in ["cv_pct_pr", "cv_pct_base"] if column in df3.columns]
+    df3["cv_pct_max"] = df3[cv_columns].max(axis=1, skipna=True)
+
+
+def describe_noise(cv_pct):
+    """Bucket runtime noise into labels that are easy to scan in GitHub tables."""
+    if pd.isna(cv_pct):
+        return "unknown"
+    if cv_pct < 5:
+        return "low"
+    if cv_pct < 15:
+        return "medium"
+    return "high"
+
+
+if "cv_pct_max" in df3.columns:
+    df3["noise"] = df3["cv_pct_max"].apply(describe_noise)
 
 # Generate summary statistics
 df3["ratio"] = df3["value_pr"] / df3["value_base"]
@@ -118,10 +136,75 @@ def calculate_geo_mean(df):
         return float("nan")
 
 
+def calculate_run_drift_metrics(df):
+    """Summarize common-mode movement across the whole benchmark run.
+
+    We work in log-ratio space because ratios compose multiplicatively:
+    a 10% slowdown (1.10x) and a 10% speedup (0.90x) are roughly symmetric
+    once transformed with log(). That makes the median log-ratio a robust
+    estimate of "the whole run was faster/slower than usual".
+    """
+    valid_ratios = [r for r in df["ratio"] if r > 0 and not pd.isna(r)]
+    if not valid_ratios:
+        return {
+            "drift_ratio": float("nan"),
+            "same_direction_pct": float("nan"),
+            "residual_mad_pct": float("nan"),
+            "is_baseline_suspect": False,
+            "drift_level": "unknown",
+        }
+
+    log_ratios = pd.Series([math.log(r) for r in valid_ratios])
+    median_log_ratio = float(log_ratios.median())
+    drift_ratio = math.exp(median_log_ratio)
+
+    # Count how often benchmarks move in the same direction as the run-wide drift.
+    # This distinguishes "everything got faster/slower together" from a mixed run
+    # with a similar central tendency.
+    if median_log_ratio < 0:
+        same_direction_pct = float((log_ratios < 0).mean() * 100)
+    elif median_log_ratio > 0:
+        same_direction_pct = float((log_ratios > 0).mean() * 100)
+    else:
+        same_direction_pct = float((log_ratios == 0).mean() * 100)
+
+    # Residual MAD measures how tightly benchmarks cluster around the run-wide
+    # drift. Small residual spread plus broad agreement is a strong indicator
+    # that the baseline itself is shifted rather than the PR changing specific
+    # benchmarks independently.
+    residual_logs = log_ratios - median_log_ratio
+    residual_mad = float(residual_logs.abs().median())
+    residual_mad_pct = (math.exp(residual_mad) - 1.0) * 100
+
+    drift_threshold_pct = 5.0
+    consistency_threshold_pct = 80.0
+    residual_spread_threshold_pct = 5.0
+    is_baseline_suspect = (
+        abs(drift_ratio - 1.0) * 100 >= drift_threshold_pct
+        and same_direction_pct >= consistency_threshold_pct
+        and residual_mad_pct <= residual_spread_threshold_pct
+    )
+    if is_baseline_suspect:
+        drift_level = "large"
+    elif abs(drift_ratio - 1.0) * 100 >= drift_threshold_pct:
+        drift_level = "noticeable"
+    else:
+        drift_level = "small"
+
+    return {
+        "drift_ratio": drift_ratio,
+        "same_direction_pct": same_direction_pct,
+        "residual_mad_pct": residual_mad_pct,
+        "is_baseline_suspect": is_baseline_suspect,
+        "drift_level": drift_level,
+    }
+
+
 vortex_geo_mean_ratio = calculate_geo_mean(vortex_df)
 duckdb_vortex_geo_mean_ratio = calculate_geo_mean(duckdb_vortex_df)
 datafusion_vortex_geo_mean_ratio = calculate_geo_mean(datafusion_vortex_df)
 parquet_geo_mean_ratio = calculate_geo_mean(parquet_df)
+run_drift_metrics = calculate_run_drift_metrics(df3)
 
 # Find best and worst changes for vortex-only results
 vortex_valid_ratios = vortex_df["ratio"].dropna()
@@ -176,6 +259,14 @@ def format_performance(ratio, target_name):
     "## Summary",
     "",
     f"- **Overall**: {overall_performance}",
+    (
+        f"- **Run drift**: {run_drift_metrics['drift_ratio']:.3f}x, "
+        f"{run_drift_metrics['drift_level']} run-wide shift "
+        f"({run_drift_metrics['same_direction_pct']:.0f}% aligned, "
+        f"residual MAD {run_drift_metrics['residual_mad_pct']:.1f}%)"
+        if not pd.isna(run_drift_metrics["drift_ratio"])
+        else "- **Run drift**: no data"
+    ),
 ]
 
 # Only add vortex-specific sections if we have vortex data
@@ -203,19 +294,30 @@ def format_performance(ratio, target_name):
         ]
     )
 
+if run_drift_metrics["is_baseline_suspect"]:
+    summary_lines.append("- **Baseline signal**: suspect common-mode drift; this run moved together more than expected")
+
+# Convert rendered timing values from ns to ms to keep the GitHub table narrower.
+# This affects display only; all comparison math above stays in the original units.
+display_pr_values = df3["value_pr"].copy()
+display_base_values = df3["value_base"].copy()
+display_units = df3["unit_base"].copy()
+ns_mask = display_units == "ns"
+display_pr_values.loc[ns_mask] = display_pr_values.loc[ns_mask] / 1_000_000
+display_base_values.loc[ns_mask] = display_base_values.loc[ns_mask] / 1_000_000
+display_units.loc[ns_mask] = "ms"
+
 # Build table
 table_dict = {
     "name": df3["name"],
-    f"PR {pr_commit_id[:8]}": df3["value_pr"],
-    f"base {base_commit_id[:8]}": df3["value_base"],
+    f"PR {pr_commit_id[:8]}": display_pr_values,
+    f"base {base_commit_id[:8]}": display_base_values,
     "ratio (PR/base)": df3["ratio"],
-    "unit": df3["unit_base"],
+    "unit": display_units,
 }
 
-if has_z_pr:
-    table_dict["CV% PR"] = df3["cv_pct_pr"]
-if has_z_base:
-    table_dict["CV% base"] = df3["cv_pct_base"]
+if "cv_pct_max" in df3.columns:
+    table_dict["noise"] = df3["noise"]
 
 table_dict["remark"] = df3["remark"]
 table_df = pd.DataFrame(table_dict)

From 1192444d540bc18bcd4d9b22b038e80f2b564a5a Mon Sep 17 00:00:00 2001
From: Adam Gutglick <adam@spiraldb.com>
Date: Tue, 10 Mar 2026 17:38:30 +0000
Subject: [PATCH 8/9] Fix script: named thresholds and summary/table helper functions

Signed-off-by: Adam Gutglick <adam@spiraldb.com>
---
 scripts/compare-benchmark-jsons.py | 262 ++++++++++++++++-------------
 1 file changed, 143 insertions(+), 119 deletions(-)

diff --git a/scripts/compare-benchmark-jsons.py b/scripts/compare-benchmark-jsons.py
index b1ddcf66a20..68df62f306e 100644
--- a/scripts/compare-benchmark-jsons.py
+++ b/scripts/compare-benchmark-jsons.py
@@ -14,6 +14,14 @@
 
 import pandas as pd
 
+S3_THRESHOLD_PCT = 30
+DEFAULT_THRESHOLD_PCT = 10
+NOISE_LOW_MAX_CV_PCT = 5.0
+NOISE_MEDIUM_MAX_CV_PCT = 15.0
+DRIFT_NOTICEABLE_THRESHOLD_PCT = 5.0
+DRIFT_CONSISTENCY_THRESHOLD_PCT = 80.0
+DRIFT_RESIDUAL_SPREAD_THRESHOLD_PCT = 5.0
+
 # Check if benchmark name argument is provided (will be added from workflow)
 benchmark_name = sys.argv[3] if len(sys.argv) > 3 else ""
 
@@ -59,7 +67,7 @@ def extract_dataset_key(df):
 # Determine threshold based on benchmark name
 # Use 30% threshold for S3 benchmarks, 10% for others
 is_s3_benchmark = "s3" in benchmark_name.lower()
-threshold_pct = 30 if is_s3_benchmark else 10
+threshold_pct = S3_THRESHOLD_PCT if is_s3_benchmark else DEFAULT_THRESHOLD_PCT
 improvement_threshold = 1.0 - (threshold_pct / 100.0)  # e.g., 0.7 for 30%, 0.9 for 10%
 regression_threshold = 1.0 + (threshold_pct / 100.0)  # e.g., 1.3 for 30%, 1.1 for 10%
 
@@ -77,13 +85,13 @@ def compute_cv_pct(runtimes):
 
 
 # Compute CV% from all_runtimes when available
-has_z_pr = "all_runtimes_pr" in df3.columns
-has_z_base = "all_runtimes_base" in df3.columns
-if has_z_pr:
+has_runtimes_pr = "all_runtimes_pr" in df3.columns
+has_runtimes_base = "all_runtimes_base" in df3.columns
+if has_runtimes_pr:
     df3["cv_pct_pr"] = df3["all_runtimes_pr"].apply(compute_cv_pct)
-if has_z_base:
+if has_runtimes_base:
     df3["cv_pct_base"] = df3["all_runtimes_base"].apply(compute_cv_pct)
-if has_z_pr or has_z_base:
+if has_runtimes_pr or has_runtimes_base:
     cv_columns = [column for column in ["cv_pct_pr", "cv_pct_base"] if column in df3.columns]
     df3["cv_pct_max"] = df3[cv_columns].max(axis=1, skipna=True)
 
@@ -92,9 +100,9 @@ def describe_noise(cv_pct):
     """Bucket runtime noise into labels that are easy to scan in GitHub tables."""
     if pd.isna(cv_pct):
         return "unknown"
-    if cv_pct < 5:
+    if cv_pct < NOISE_LOW_MAX_CV_PCT:
         return "low"
-    if cv_pct < 15:
+    if cv_pct < NOISE_MEDIUM_MAX_CV_PCT:
         return "medium"
     return "high"
 
@@ -176,17 +184,14 @@ def calculate_run_drift_metrics(df):
     residual_mad = float(residual_logs.abs().median())
     residual_mad_pct = (math.exp(residual_mad) - 1.0) * 100
 
-    drift_threshold_pct = 5.0
-    consistency_threshold_pct = 80.0
-    residual_spread_threshold_pct = 5.0
     is_baseline_suspect = (
-        abs(drift_ratio - 1.0) * 100 >= drift_threshold_pct
-        and same_direction_pct >= consistency_threshold_pct
-        and residual_mad_pct <= residual_spread_threshold_pct
+        abs(drift_ratio - 1.0) * 100 >= DRIFT_NOTICEABLE_THRESHOLD_PCT
+        and same_direction_pct >= DRIFT_CONSISTENCY_THRESHOLD_PCT
+        and residual_mad_pct <= DRIFT_RESIDUAL_SPREAD_THRESHOLD_PCT
     )
     if is_baseline_suspect:
         drift_level = "large"
-    elif abs(drift_ratio - 1.0) * 100 >= drift_threshold_pct:
+    elif abs(drift_ratio - 1.0) * 100 >= DRIFT_NOTICEABLE_THRESHOLD_PCT:
         drift_level = "noticeable"
     else:
         drift_level = "small"
@@ -200,40 +205,6 @@ def calculate_run_drift_metrics(df):
     }
 
 
-vortex_geo_mean_ratio = calculate_geo_mean(vortex_df)
-duckdb_vortex_geo_mean_ratio = calculate_geo_mean(duckdb_vortex_df)
-datafusion_vortex_geo_mean_ratio = calculate_geo_mean(datafusion_vortex_df)
-parquet_geo_mean_ratio = calculate_geo_mean(parquet_df)
-run_drift_metrics = calculate_run_drift_metrics(df3)
-
-# Find best and worst changes for vortex-only results
-vortex_valid_ratios = vortex_df["ratio"].dropna()
-if len(vortex_valid_ratios) > 0:
-    # Best improvement: smallest ratio (< 1.0, fastest performance)
-    improvements = vortex_valid_ratios[vortex_valid_ratios < 1.0]
-    if len(improvements) > 0:
-        best_idx = improvements.idxmin()
-        best_improvement = f"{vortex_df.loc[best_idx, 'name']} ({vortex_df.loc[best_idx, 'ratio']:.3f}x)"
-    else:
-        best_improvement = "No improvements"
-
-    # Worst regression: largest ratio (> 1.0, slowest performance)
-    regressions = vortex_valid_ratios[vortex_valid_ratios > 1.0]
-    if len(regressions) > 0:
-        worst_idx = regressions.idxmax()
-        worst_regression = f"{vortex_df.loc[worst_idx, 'name']} ({vortex_df.loc[worst_idx, 'ratio']:.3f}x)"
-    else:
-        worst_regression = "No regressions"
-else:
-    best_improvement = "No valid vortex comparisons"
-    worst_regression = "No valid vortex comparisons"
-
-# Count significant changes for vortex-only results
-significant_improvements = (vortex_df["ratio"] < improvement_threshold).sum()
-significant_regressions = (vortex_df["ratio"] > regression_threshold).sum()
-
-
-# Build summary
 def format_performance(ratio, target_name):
     if pd.isna(ratio):
         return f"no {target_name.lower()} data"
@@ -248,79 +219,132 @@ def format_performance(ratio, target_name):
         return f"{ratio:.3f}x {emoji}"
 
 
-overall_performance = "no data" if pd.isna(geo_mean_ratio) else format_performance(geo_mean_ratio, "overall")
-vortex_performance = format_performance(vortex_geo_mean_ratio, "vortex")
-duckdb_vortex_performance = format_performance(duckdb_vortex_geo_mean_ratio, "duckdb:vortex")
-datafusion_vortex_performance = format_performance(datafusion_vortex_geo_mean_ratio, "datafusion:vortex")
-parquet_performance = format_performance(parquet_geo_mean_ratio, "parquet")
-
-
-summary_lines = [
-    "## Summary",
-    "",
-    f"- **Overall**: {overall_performance}",
-    (
-        f"- **Run drift**: {run_drift_metrics['drift_ratio']:.3f}x, "
-        f"{run_drift_metrics['drift_level']} run-wide shift "
-        f"({run_drift_metrics['same_direction_pct']:.0f}% aligned, "
-        f"residual MAD {run_drift_metrics['residual_mad_pct']:.1f}%)"
-        if not pd.isna(run_drift_metrics["drift_ratio"])
-        else "- **Run drift**: no data"
-    ),
-]
-
-# Only add vortex-specific sections if we have vortex data
-if len(vortex_df) > 0:
-    summary_lines.extend([f"- **Vortex**: {vortex_performance}"])
-
-if len(parquet_df) > 0:
-    summary_lines.extend([f"- **Parquet**: {parquet_performance}"])
-
-# Only add duckdb:vortex section if we have that data
-if len(duckdb_vortex_df) > 0:
-    summary_lines.append(f"- **duckdb:vortex**: {duckdb_vortex_performance}")
-
-# Only add datafusion:vortex section if we have that data
-if len(datafusion_vortex_df) > 0:
-    summary_lines.append(f"- **datafusion:vortex**: {datafusion_vortex_performance}")
-
-# Only add best/worst if we have vortex data
-if len(vortex_df) > 0:
-    summary_lines.extend(
-        [
-            f"- **Best**: {best_improvement}",
-            f"- **Worst**: {worst_regression}",
-            f"- **Significant (>{threshold_pct}%)**: {significant_improvements}↑ {significant_regressions}↓",
-        ]
-    )
+def build_summary_lines(
+    overall_ratio,
+    vortex_df,
+    parquet_df,
+    duckdb_vortex_df,
+    datafusion_vortex_df,
+    threshold_pct,
+    run_drift_metrics,
+):
+    """Build markdown summary lines from precomputed benchmark metrics."""
+    vortex_geo_mean_ratio = calculate_geo_mean(vortex_df)
+    duckdb_vortex_geo_mean_ratio = calculate_geo_mean(duckdb_vortex_df)
+    datafusion_vortex_geo_mean_ratio = calculate_geo_mean(datafusion_vortex_df)
+    parquet_geo_mean_ratio = calculate_geo_mean(parquet_df)
+
+    overall_performance = "no data" if pd.isna(overall_ratio) else format_performance(overall_ratio, "overall")
+    vortex_performance = format_performance(vortex_geo_mean_ratio, "vortex")
+    duckdb_vortex_performance = format_performance(duckdb_vortex_geo_mean_ratio, "duckdb:vortex")
+    datafusion_vortex_performance = format_performance(datafusion_vortex_geo_mean_ratio, "datafusion:vortex")
+    parquet_performance = format_performance(parquet_geo_mean_ratio, "parquet")
+
+    summary_lines = [
+        "## Summary",
+        "",
+        f"- **Overall**: {overall_performance}",
+        (
+            f"- **Run drift**: {run_drift_metrics['drift_ratio']:.3f}x, "
+            f"{run_drift_metrics['drift_level']} whole-run shift "
+            f"({run_drift_metrics['same_direction_pct']:.0f}% aligned, "
+            f"residual MAD {run_drift_metrics['residual_mad_pct']:.1f}%)"
+            if not pd.isna(run_drift_metrics["drift_ratio"])
+            else "- **Run drift**: no data"
+        ),
+    ]
+
+    if len(vortex_df) > 0:
+        summary_lines.append(f"- **Vortex**: {vortex_performance}")
+
+    if len(parquet_df) > 0:
+        summary_lines.append(f"- **Parquet**: {parquet_performance}")
+
+    if len(duckdb_vortex_df) > 0:
+        summary_lines.append(f"- **duckdb:vortex**: {duckdb_vortex_performance}")
+
+    if len(datafusion_vortex_df) > 0:
+        summary_lines.append(f"- **datafusion:vortex**: {datafusion_vortex_performance}")
+
+    if len(vortex_df) > 0:
+        vortex_valid_ratios = vortex_df["ratio"].dropna()
+        if len(vortex_valid_ratios) > 0:
+            improvements = vortex_valid_ratios[vortex_valid_ratios < 1.0]
+            if len(improvements) > 0:
+                best_idx = improvements.idxmin()
+                best_improvement = f"{vortex_df.loc[best_idx, 'name']} ({vortex_df.loc[best_idx, 'ratio']:.3f}x)"
+            else:
+                best_improvement = "No improvements"
+
+            regressions = vortex_valid_ratios[vortex_valid_ratios > 1.0]
+            if len(regressions) > 0:
+                worst_idx = regressions.idxmax()
+                worst_regression = f"{vortex_df.loc[worst_idx, 'name']} ({vortex_df.loc[worst_idx, 'ratio']:.3f}x)"
+            else:
+                worst_regression = "No regressions"
+        else:
+            best_improvement = "No valid vortex comparisons"
+            worst_regression = "No valid vortex comparisons"
+
+        significant_improvements = (vortex_df["ratio"] < improvement_threshold).sum()
+        significant_regressions = (vortex_df["ratio"] > regression_threshold).sum()
+        summary_lines.extend(
+            [
+                f"- **Best**: {best_improvement}",
+                f"- **Worst**: {worst_regression}",
+                f"- **Significant (>{threshold_pct}%)**: {significant_improvements}↑ {significant_regressions}↓",
+            ]
+        )
 
-if run_drift_metrics["is_baseline_suspect"]:
-    summary_lines.append("- **Baseline signal**: suspect common-mode drift; this run moved together more than expected")
-
-# Convert rendered timing values from ns to ms to keep the GitHub table narrower.
-# This affects display only; all comparison math above stays in the original units.
-display_pr_values = df3["value_pr"].copy()
-display_base_values = df3["value_base"].copy()
-display_units = df3["unit_base"].copy()
-ns_mask = display_units == "ns"
-display_pr_values.loc[ns_mask] = display_pr_values.loc[ns_mask] / 1_000_000
-display_base_values.loc[ns_mask] = display_base_values.loc[ns_mask] / 1_000_000
-display_units.loc[ns_mask] = "ms"
-
-# Build table
-table_dict = {
-    "name": df3["name"],
-    f"PR {pr_commit_id[:8]}": display_pr_values,
-    f"base {base_commit_id[:8]}": display_base_values,
-    "ratio (PR/base)": df3["ratio"],
-    "unit": display_units,
-}
+    if run_drift_metrics["is_baseline_suspect"]:
+        summary_lines.append("- **Baseline**: likely unreliable for this run; most benchmarks shifted together")
+
+    return summary_lines
 
-if "cv_pct_max" in df3.columns:
-    table_dict["noise"] = df3["noise"]
 
-table_dict["remark"] = df3["remark"]
-table_df = pd.DataFrame(table_dict)
+def build_display_table(df, pr_commit_id, base_commit_id):
+    """Build the markdown table with display-friendly units and columns.
+
+    This keeps presentation-only concerns out of the metric computation above.
+    """
+    display_df = df.copy()
+
+    # Convert rendered timing values from ns to ms to keep the GitHub table narrower.
+    # This affects display only; all comparison math above stays in the original units.
+    display_df["value_pr_display"] = display_df["value_pr"].astype(float)
+    display_df["value_base_display"] = display_df["value_base"].astype(float)
+    display_df["unit_display"] = display_df["unit_base"].copy()
+    ns_mask = display_df["unit_display"] == "ns"
+    display_df.loc[ns_mask, "value_pr_display"] = display_df.loc[ns_mask, "value_pr_display"] / 1_000_000
+    display_df.loc[ns_mask, "value_base_display"] = display_df.loc[ns_mask, "value_base_display"] / 1_000_000
+    display_df.loc[ns_mask, "unit_display"] = "ms"
+
+    table_dict = {
+        "name": display_df["name"],
+        f"PR {pr_commit_id[:8]}": display_df["value_pr_display"],
+        f"base {base_commit_id[:8]}": display_df["value_base_display"],
+        "ratio (PR/base)": display_df["ratio"],
+        "unit": display_df["unit_display"],
+    }
+
+    if "noise" in display_df.columns:
+        table_dict["noise"] = display_df["noise"]
+
+    table_dict["remark"] = display_df["remark"]
+    return pd.DataFrame(table_dict)
+
+
+run_drift_metrics = calculate_run_drift_metrics(df3)
+summary_lines = build_summary_lines(
+    geo_mean_ratio,
+    vortex_df,
+    parquet_df,
+    duckdb_vortex_df,
+    datafusion_vortex_df,
+    threshold_pct,
+    run_drift_metrics,
+)
+table_df = build_display_table(df3, pr_commit_id, base_commit_id)
 
 # Output complete formatted markdown
 print("\n".join(summary_lines))

From 7eec2bad6d1962f2881c6271f9ce7e37749c159f Mon Sep 17 00:00:00 2001
From: Adam Gutglick <adam@spiraldb.com>
Date: Tue, 10 Mar 2026 17:50:10 +0000
Subject: [PATCH 9/9] Some formatting changes

Signed-off-by: Adam Gutglick <adam@spiraldb.com>
---
 scripts/compare-benchmark-jsons.py | 41 ++++++++++++++++++------------
 1 file changed, 25 insertions(+), 16 deletions(-)

diff --git a/scripts/compare-benchmark-jsons.py b/scripts/compare-benchmark-jsons.py
index 68df62f306e..d67a63f0a3b 100644
--- a/scripts/compare-benchmark-jsons.py
+++ b/scripts/compare-benchmark-jsons.py
@@ -240,19 +240,31 @@ def build_summary_lines(
     datafusion_vortex_performance = format_performance(datafusion_vortex_geo_mean_ratio, "datafusion:vortex")
     parquet_performance = format_performance(parquet_geo_mean_ratio, "parquet")
 
-    summary_lines = [
-        "## Summary",
-        "",
-        f"- **Overall**: {overall_performance}",
-        (
-            f"- **Run drift**: {run_drift_metrics['drift_ratio']:.3f}x, "
-            f"{run_drift_metrics['drift_level']} whole-run shift "
-            f"({run_drift_metrics['same_direction_pct']:.0f}% aligned, "
-            f"residual MAD {run_drift_metrics['residual_mad_pct']:.1f}%)"
-            if not pd.isna(run_drift_metrics["drift_ratio"])
-            else "- **Run drift**: no data"
-        ),
-    ]
+    summary_lines = []
+
+    if run_drift_metrics["is_baseline_suspect"]:
+        summary_lines.extend(
+            [
+                "- **Baseline**: likely unreliable for this run; most benchmarks shifted together",
+                "",
+            ]
+        )
+
+    summary_lines.extend(
+        [
+            "## Summary",
+            "",
+            f"- **Overall**: {overall_performance}",
+            (
+                f"- **Run drift**: {run_drift_metrics['drift_ratio']:.3f}x, "
+                f"**{run_drift_metrics['drift_level']}** whole-run shift "
+                f"({run_drift_metrics['same_direction_pct']:.0f}% aligned, "
+                f"residual MAD {run_drift_metrics['residual_mad_pct']:.1f}%)"
+                if not pd.isna(run_drift_metrics["drift_ratio"])
+                else "- **Run drift**: no data"
+            ),
+        ]
+    )
 
     if len(vortex_df) > 0:
         summary_lines.append(f"- **Vortex**: {vortex_performance}")
@@ -296,9 +308,6 @@ def build_summary_lines(
             ]
         )
 
-    if run_drift_metrics["is_baseline_suspect"]:
-        summary_lines.append("- **Baseline**: likely unreliable for this run; most benchmarks shifted together")
-
     return summary_lines