Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
297 changes: 229 additions & 68 deletions scripts/compare-benchmark-jsons.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,14 @@

import pandas as pd

# Significance thresholds (percent). S3 benchmarks are allowed a wider band
# than everything else — see the "Use 30% threshold for S3" logic below.
S3_THRESHOLD_PCT = 30
DEFAULT_THRESHOLD_PCT = 10
# CV% cutoffs used by describe_noise() to bucket per-benchmark runtime
# variability into "low" / "medium" / "high".
NOISE_LOW_MAX_CV_PCT = 5.0
NOISE_MEDIUM_MAX_CV_PCT = 15.0
# Run-wide drift heuristics (see calculate_run_drift_metrics): how large a
# common shift must be to be "noticeable", what fraction of benchmarks must
# move the same direction, and how tight the residual spread must be before
# the baseline itself is flagged as suspect.
DRIFT_NOTICEABLE_THRESHOLD_PCT = 5.0
DRIFT_CONSISTENCY_THRESHOLD_PCT = 80.0
DRIFT_RESIDUAL_SPREAD_THRESHOLD_PCT = 5.0

# Check if benchmark name argument is provided (will be added from workflow)
benchmark_name = sys.argv[3] if len(sys.argv) > 3 else ""

Expand Down Expand Up @@ -59,10 +67,49 @@ def extract_dataset_key(df):
# Determine threshold based on benchmark name
# Use 30% threshold for S3 benchmarks, 10% for others
is_s3_benchmark = "s3" in benchmark_name.lower()
threshold_pct = 30 if is_s3_benchmark else 10
threshold_pct = S3_THRESHOLD_PCT if is_s3_benchmark else DEFAULT_THRESHOLD_PCT
improvement_threshold = 1.0 - (threshold_pct / 100.0) # e.g., 0.7 for 30%, 0.9 for 10%
regression_threshold = 1.0 + (threshold_pct / 100.0) # e.g., 1.3 for 30%, 1.1 for 10%


def compute_cv_pct(runtimes):
    """Compute the coefficient of variation (sample std dev / mean * 100).

    Parameters
    ----------
    runtimes : sequence of float
        Individual benchmark runtimes. Lists and tuples are accepted
        (previously only lists were); anything else, or fewer than two
        samples, yields NaN because spread cannot be estimated.

    Returns
    -------
    float
        CV as a percentage, or NaN when undefined (too few samples, or a
        zero mean which would divide by zero).
    """
    if not isinstance(runtimes, (list, tuple)) or len(runtimes) < 2:
        return float("nan")
    n = len(runtimes)
    mean = sum(runtimes) / n
    if mean == 0:
        return float("nan")
    # Sample variance with Bessel's correction (n - 1): these are a handful
    # of repeated runs, not a full population.
    variance = sum((x - mean) ** 2 for x in runtimes) / (n - 1)
    return (variance**0.5 / mean) * 100


# Compute CV% from all_runtimes when available
# Per-benchmark noise estimate: CV% of the repeated runtimes on each side,
# computed only when the input JSON carried the raw per-run samples.
has_runtimes_pr = "all_runtimes_pr" in df3.columns
has_runtimes_base = "all_runtimes_base" in df3.columns
if has_runtimes_pr:
    df3["cv_pct_pr"] = df3["all_runtimes_pr"].apply(compute_cv_pct)
if has_runtimes_base:
    df3["cv_pct_base"] = df3["all_runtimes_base"].apply(compute_cv_pct)
if has_runtimes_pr or has_runtimes_base:
    # Take the noisier of the two sides (skipping NaN) so the downstream
    # noise label is conservative.
    cv_columns = [column for column in ["cv_pct_pr", "cv_pct_base"] if column in df3.columns]
    df3["cv_pct_max"] = df3[cv_columns].max(axis=1, skipna=True)


def describe_noise(cv_pct):
    """Bucket runtime noise into labels that are easy to scan in GitHub tables."""
    # NaN means no runtime samples were available to estimate variability.
    if pd.isna(cv_pct):
        return "unknown"
    # Walk the cutoffs in ascending order; the first ceiling the value falls
    # under determines the label, otherwise it is "high".
    for ceiling, label in ((NOISE_LOW_MAX_CV_PCT, "low"), (NOISE_MEDIUM_MAX_CV_PCT, "medium")):
        if cv_pct < ceiling:
            return label
    return "high"


# Attach the scan-friendly noise label derived from the worst-side CV%.
if "cv_pct_max" in df3.columns:
    df3["noise"] = df3["cv_pct_max"].apply(describe_noise)

# Generate summary statistics
df3["ratio"] = df3["value_pr"] / df3["value_base"]
# Starts blank; presumably filled in later with per-row annotations — that
# code is outside this chunk.
df3["remark"] = pd.Series([""] * len(df3))
Expand Down Expand Up @@ -97,39 +144,67 @@ def calculate_geo_mean(df):
return float("nan")


vortex_geo_mean_ratio = calculate_geo_mean(vortex_df)
duckdb_vortex_geo_mean_ratio = calculate_geo_mean(duckdb_vortex_df)
datafusion_vortex_geo_mean_ratio = calculate_geo_mean(datafusion_vortex_df)
parquet_geo_mean_ratio = calculate_geo_mean(parquet_df)
def calculate_run_drift_metrics(df):
    """Summarize common-mode movement across the whole benchmark run.

    We work in log-ratio space because ratios compose multiplicatively:
    a 10% slowdown (1.10x) and a 10% speedup (0.90x) are roughly symmetric
    once transformed with log(). That makes the median log-ratio a robust
    estimate of "the whole run was faster/slower than usual".

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a "ratio" column of PR/base runtime ratios.

    Returns
    -------
    dict
        drift_ratio, same_direction_pct, residual_mad_pct,
        is_baseline_suspect, drift_level ("unknown"/"small"/"noticeable"/"large").
    """
    # NaN comparisons are False, so NaN ratios are filtered by `r > 0` alone;
    # the explicit isna check documents the intent.
    valid_ratios = [r for r in df["ratio"] if r > 0 and not pd.isna(r)]
    if not valid_ratios:
        return {
            "drift_ratio": float("nan"),
            "same_direction_pct": float("nan"),
            "residual_mad_pct": float("nan"),
            "is_baseline_suspect": False,
            "drift_level": "unknown",
        }

    log_ratios = pd.Series([math.log(r) for r in valid_ratios])
    median_log_ratio = float(log_ratios.median())
    drift_ratio = math.exp(median_log_ratio)

    # Count how often benchmarks move in the same direction as the run-wide drift.
    # This distinguishes "everything got faster/slower together" from a mixed run
    # with a similar central tendency.
    if median_log_ratio < 0:
        same_direction_pct = float((log_ratios < 0).mean() * 100)
    elif median_log_ratio > 0:
        same_direction_pct = float((log_ratios > 0).mean() * 100)
    else:
        same_direction_pct = float((log_ratios == 0).mean() * 100)

    # Residual MAD measures how tightly benchmarks cluster around the run-wide
    # drift. Small residual spread plus broad agreement is a strong indicator
    # that the baseline itself is shifted rather than the PR changing specific
    # benchmarks independently.
    residual_logs = log_ratios - median_log_ratio
    residual_mad = float(residual_logs.abs().median())
    residual_mad_pct = (math.exp(residual_mad) - 1.0) * 100

    is_baseline_suspect = (
        abs(drift_ratio - 1.0) * 100 >= DRIFT_NOTICEABLE_THRESHOLD_PCT
        and same_direction_pct >= DRIFT_CONSISTENCY_THRESHOLD_PCT
        and residual_mad_pct <= DRIFT_RESIDUAL_SPREAD_THRESHOLD_PCT
    )
    if is_baseline_suspect:
        drift_level = "large"
    elif abs(drift_ratio - 1.0) * 100 >= DRIFT_NOTICEABLE_THRESHOLD_PCT:
        drift_level = "noticeable"
    else:
        drift_level = "small"

    return {
        "drift_ratio": drift_ratio,
        "same_direction_pct": same_direction_pct,
        "residual_mad_pct": residual_mad_pct,
        "is_baseline_suspect": is_baseline_suspect,
        "drift_level": drift_level,
    }


# Build summary
def format_performance(ratio, target_name):
if pd.isna(ratio):
return f"no {target_name.lower()} data"
Expand All @@ -144,55 +219,141 @@ def format_performance(ratio, target_name):
return f"{ratio:.3f}x {emoji}"


overall_performance = "no data" if pd.isna(geo_mean_ratio) else format_performance(geo_mean_ratio, "overall")
vortex_performance = format_performance(vortex_geo_mean_ratio, "vortex")
duckdb_vortex_performance = format_performance(duckdb_vortex_geo_mean_ratio, "duckdb:vortex")
datafusion_vortex_performance = format_performance(datafusion_vortex_geo_mean_ratio, "datafusion:vortex")
parquet_performance = format_performance(parquet_geo_mean_ratio, "parquet")
def build_summary_lines(
    overall_ratio,
    vortex_df,
    parquet_df,
    duckdb_vortex_df,
    datafusion_vortex_df,
    threshold_pct,
    run_drift_metrics,
):
    """Build markdown summary lines from precomputed benchmark metrics.

    Parameters
    ----------
    overall_ratio : float
        Geometric-mean PR/base ratio across all benchmarks (may be NaN).
    vortex_df, parquet_df, duckdb_vortex_df, datafusion_vortex_df : pandas.DataFrame
        Per-engine slices; a section is emitted only when the slice is non-empty.
    threshold_pct : int
        Significance threshold used for the up/down arrow counts.
    run_drift_metrics : dict
        Output of calculate_run_drift_metrics().

    Returns
    -------
    list[str]
        Markdown lines ready to be joined with newlines.
    """
    vortex_geo_mean_ratio = calculate_geo_mean(vortex_df)
    duckdb_vortex_geo_mean_ratio = calculate_geo_mean(duckdb_vortex_df)
    datafusion_vortex_geo_mean_ratio = calculate_geo_mean(datafusion_vortex_df)
    parquet_geo_mean_ratio = calculate_geo_mean(parquet_df)

    overall_performance = "no data" if pd.isna(overall_ratio) else format_performance(overall_ratio, "overall")
    vortex_performance = format_performance(vortex_geo_mean_ratio, "vortex")
    duckdb_vortex_performance = format_performance(duckdb_vortex_geo_mean_ratio, "duckdb:vortex")
    datafusion_vortex_performance = format_performance(datafusion_vortex_geo_mean_ratio, "datafusion:vortex")
    parquet_performance = format_performance(parquet_geo_mean_ratio, "parquet")

    summary_lines = []

    # Lead with the baseline warning so readers don't over-trust the numbers.
    if run_drift_metrics["is_baseline_suspect"]:
        summary_lines.extend(
            [
                "- **Baseline**: likely unreliable for this run; most benchmarks shifted together",
                "",
            ]
        )

    summary_lines.extend(
        [
            "## Summary",
            "",
            f"- **Overall**: {overall_performance}",
            (
                f"- **Run drift**: {run_drift_metrics['drift_ratio']:.3f}x, "
                f"**{run_drift_metrics['drift_level']}** whole-run shift "
                f"({run_drift_metrics['same_direction_pct']:.0f}% aligned, "
                f"residual MAD {run_drift_metrics['residual_mad_pct']:.1f}%)"
                if not pd.isna(run_drift_metrics["drift_ratio"])
                else "- **Run drift**: no data"
            ),
        ]
    )

    # Per-engine geo-mean lines, only for engines that produced data.
    if len(vortex_df) > 0:
        summary_lines.append(f"- **Vortex**: {vortex_performance}")

    if len(parquet_df) > 0:
        summary_lines.append(f"- **Parquet**: {parquet_performance}")

    if len(duckdb_vortex_df) > 0:
        summary_lines.append(f"- **duckdb:vortex**: {duckdb_vortex_performance}")

    if len(datafusion_vortex_df) > 0:
        summary_lines.append(f"- **datafusion:vortex**: {datafusion_vortex_performance}")

    # Best / worst / significant-change callouts are vortex-specific.
    if len(vortex_df) > 0:
        vortex_valid_ratios = vortex_df["ratio"].dropna()
        if len(vortex_valid_ratios) > 0:
            # Best improvement: smallest ratio (< 1.0, fastest relative to base).
            improvements = vortex_valid_ratios[vortex_valid_ratios < 1.0]
            if len(improvements) > 0:
                best_idx = improvements.idxmin()
                best_improvement = f"{vortex_df.loc[best_idx, 'name']} ({vortex_df.loc[best_idx, 'ratio']:.3f}x)"
            else:
                best_improvement = "No improvements"

            # Worst regression: largest ratio (> 1.0, slowest relative to base).
            regressions = vortex_valid_ratios[vortex_valid_ratios > 1.0]
            if len(regressions) > 0:
                worst_idx = regressions.idxmax()
                worst_regression = f"{vortex_df.loc[worst_idx, 'name']} ({vortex_df.loc[worst_idx, 'ratio']:.3f}x)"
            else:
                worst_regression = "No regressions"
        else:
            best_improvement = "No valid vortex comparisons"
            worst_regression = "No valid vortex comparisons"

        significant_improvements = (vortex_df["ratio"] < improvement_threshold).sum()
        significant_regressions = (vortex_df["ratio"] > regression_threshold).sum()
        summary_lines.extend(
            [
                f"- **Best**: {best_improvement}",
                f"- **Worst**: {worst_regression}",
                f"- **Significant (>{threshold_pct}%)**: {significant_improvements}↑ {significant_regressions}↓",
            ]
        )

    return summary_lines

if len(parquet_df) > 0:
summary_lines.extend([f"- **Parquet**: {parquet_performance}"])

# Only add duckdb:vortex section if we have that data
if len(duckdb_vortex_df) > 0:
summary_lines.append(f"- **duckdb:vortex**: {duckdb_vortex_performance}")
def build_display_table(df, pr_commit_id, base_commit_id):
    """Build the markdown table with display-friendly units and columns.

    This keeps presentation-only concerns out of the metric computation above.

    Parameters
    ----------
    df : pandas.DataFrame
        Comparison frame with name/value_pr/value_base/unit_base/ratio/remark
        columns (and optionally a "noise" column).
    pr_commit_id, base_commit_id : str
        Commit SHAs; the first 8 characters become column headers.

    Returns
    -------
    pandas.DataFrame
        The table to render, with ns values converted to ms for readability.
    """
    display_df = df.copy()

    # Convert rendered timing values from ns to ms to keep the GitHub table narrower.
    # This affects display only; all comparison math above stays in the original units.
    display_df["value_pr_display"] = display_df["value_pr"].astype(float)
    display_df["value_base_display"] = display_df["value_base"].astype(float)
    display_df["unit_display"] = display_df["unit_base"].copy()
    ns_mask = display_df["unit_display"] == "ns"
    display_df.loc[ns_mask, "value_pr_display"] = display_df.loc[ns_mask, "value_pr_display"] / 1_000_000
    display_df.loc[ns_mask, "value_base_display"] = display_df.loc[ns_mask, "value_base_display"] / 1_000_000
    display_df.loc[ns_mask, "unit_display"] = "ms"

    table_dict = {
        "name": display_df["name"],
        f"PR {pr_commit_id[:8]}": display_df["value_pr_display"],
        f"base {base_commit_id[:8]}": display_df["value_base_display"],
        "ratio (PR/base)": display_df["ratio"],
        "unit": display_df["unit_display"],
    }

    # Only show the noise column when runtime samples were available upstream;
    # remark stays last so table ordering is stable either way.
    if "noise" in display_df.columns:
        table_dict["noise"] = display_df["noise"]

    table_dict["remark"] = display_df["remark"]
    return pd.DataFrame(table_dict)


# Assemble the final report: run-wide drift metrics, markdown summary bullets,
# and the per-benchmark display table.
run_drift_metrics = calculate_run_drift_metrics(df3)
summary_lines = build_summary_lines(
    geo_mean_ratio,
    vortex_df,
    parquet_df,
    duckdb_vortex_df,
    datafusion_vortex_df,
    threshold_pct,
    run_drift_metrics,
)
table_df = build_display_table(df3, pr_commit_id, base_commit_id)

# Output complete formatted markdown
print("\n".join(summary_lines))
Expand Down
Loading