From cde61c4f0bb4bcd73c9ce5349983c1b54b1baa5e Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Mon, 2 Mar 2026 16:29:16 +0000 Subject: [PATCH 1/9] Show absolute z-score on benchmarks Signed-off-by: Adam Gutglick --- scripts/compare-benchmark-jsons.py | 29 +++++++++++++++++++---------- vortex-bench/src/measurements.rs | 26 ++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 10 deletions(-) diff --git a/scripts/compare-benchmark-jsons.py b/scripts/compare-benchmark-jsons.py index 1802325c4cd..5e01ffaddd1 100644 --- a/scripts/compare-benchmark-jsons.py +++ b/scripts/compare-benchmark-jsons.py @@ -63,6 +63,10 @@ def extract_dataset_key(df): improvement_threshold = 1.0 - (threshold_pct / 100.0) # e.g., 0.7 for 30%, 0.9 for 10% regression_threshold = 1.0 + (threshold_pct / 100.0) # e.g., 1.3 for 30%, 1.1 for 10% +# After merge with suffixes, z-score columns become abs_z_score_base and abs_z_score_pr +has_z_base = "abs_z_score_base" in df3.columns +has_z_pr = "abs_z_score_pr" in df3.columns + # Generate summary statistics df3["ratio"] = df3["value_pr"] / df3["value_base"] df3["remark"] = pd.Series([""] * len(df3)) @@ -183,16 +187,21 @@ def format_performance(ratio, target_name): ) # Build table -table_df = pd.DataFrame( - { - "name": df3["name"], - f"PR {pr_commit_id[:8]}": df3["value_pr"], - f"base {base_commit_id[:8]}": df3["value_base"], - "ratio (PR/base)": df3["ratio"], - "unit": df3["unit_base"], - "remark": df3["remark"], - } -) +table_dict = { + "name": df3["name"], + f"PR {pr_commit_id[:8]}": df3["value_pr"], + f"base {base_commit_id[:8]}": df3["value_base"], + "ratio (PR/base)": df3["ratio"], + "unit": df3["unit_base"], +} + +if has_z_pr: + table_dict["|z| PR"] = df3["abs_z_score_pr"] +if has_z_base: + table_dict["|z| base"] = df3["abs_z_score_base"] + +table_dict["remark"] = df3["remark"] +table_df = pd.DataFrame(table_dict) # Output complete formatted markdown print("\n".join(summary_lines)) diff --git a/vortex-bench/src/measurements.rs b/vortex-bench/src/measurements.rs index f49349cd95e..af8a05cca5f 100644 --- a/vortex-bench/src/measurements.rs +++ b/vortex-bench/src/measurements.rs @@ -272,6 +272,27 @@ impl QueryMeasurement { ) } } + + /// Compute |z-score| = |median - mean| / stddev for the runs. + /// Returns `None` if fewer than 2 runs (stddev is undefined). + pub fn abs_z_score(&self) -> Option { + let n = self.runs.len(); + if n < 2 { + return None; + } + + let nanos: Vec = self.runs.iter().map(|d| d.as_nanos() as f64).collect(); + let mean = nanos.iter().sum::() / n as f64; + let variance = nanos.iter().map(|x| (x - mean).powi(2)).sum::() / (n - 1) as f64; + let stddev = variance.sqrt(); + + if stddev == 0.0 { + return Some(0.0); + } + + let median = self.median_run().as_nanos() as f64; + Some(((median - mean) / stddev).abs()) + } } #[derive(Serialize, Deserialize)] @@ -282,6 +303,10 @@ pub struct QueryMeasurementJson { pub unit: String, pub value: u128, pub all_runtimes: Vec, + /// Absolute z-score of the median relative to the mean: |median - mean| / stddev. + /// Indicates how representative the reported median is. `None` when fewer than 2 runs. + #[serde(skip_serializing_if = "Option::is_none")] + pub abs_z_score: Option, pub target: Target, pub commit_id: String, pub env_triple: TripleJson, @@ -313,6 +338,7 @@ impl ToJson for QueryMeasurement { unit: "ns".to_string(), value: self.median_run().as_nanos(), all_runtimes: self.runs.iter().map(|r| r.as_nanos()).collect_vec(), + abs_z_score: self.abs_z_score(), commit_id: GIT_COMMIT_ID.to_string(), target: self.target, env_triple: TripleJson { From 805608dab652988e69542581134205b146859202 Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Mon, 2 Mar 2026 16:49:42 +0000 Subject: [PATCH 2/9] python all the things Signed-off-by: Adam Gutglick --- scripts/compare-benchmark-jsons.py | 28 +++++++++++++++++++++++++--- vortex-bench/src/measurements.rs | 26 -------------------------- 2 files changed, 25 insertions(+), 29 deletions(-) diff --git a/scripts/compare-benchmark-jsons.py b/scripts/compare-benchmark-jsons.py index 5e01ffaddd1..81385371faa 100644 --- a/scripts/compare-benchmark-jsons.py +++ b/scripts/compare-benchmark-jsons.py @@ -63,9 +63,31 @@ def extract_dataset_key(df): improvement_threshold = 1.0 - (threshold_pct / 100.0) # e.g., 0.7 for 30%, 0.9 for 10% regression_threshold = 1.0 + (threshold_pct / 100.0) # e.g., 1.3 for 30%, 1.1 for 10% -# After merge with suffixes, z-score columns become abs_z_score_base and abs_z_score_pr -has_z_base = "abs_z_score_base" in df3.columns -has_z_pr = "abs_z_score_pr" in df3.columns +def compute_abs_z_score(runtimes): + """Compute |median - mean| / stddev from a list of runtimes.""" + if not isinstance(runtimes, list) or len(runtimes) < 2: + return float("nan") + n = len(runtimes) + mean = sum(runtimes) / n + variance = sum((x - mean) ** 2 for x in runtimes) / (n - 1) + stddev = math.sqrt(variance) + if stddev == 0: + return 0.0 + sorted_rt = sorted(runtimes) + if n % 2 == 1: + median = sorted_rt[n // 2] + else: + median = (sorted_rt[n // 2 - 1] + sorted_rt[n // 2]) / 2 + return abs((median - mean) / stddev) + + +# Compute |z-score| from all_runtimes when available +has_z_pr = "all_runtimes_pr" in df3.columns +has_z_base = "all_runtimes_base" in df3.columns +if has_z_pr: + df3["abs_z_score_pr"] = df3["all_runtimes_pr"].apply(compute_abs_z_score) +if has_z_base: + df3["abs_z_score_base"] = df3["all_runtimes_base"].apply(compute_abs_z_score) # Generate summary statistics df3["ratio"] = df3["value_pr"] / df3["value_base"] diff --git a/vortex-bench/src/measurements.rs b/vortex-bench/src/measurements.rs index af8a05cca5f..f49349cd95e 100644 --- a/vortex-bench/src/measurements.rs +++ b/vortex-bench/src/measurements.rs @@ -272,27 +272,6 @@ impl QueryMeasurement { ) } } - - /// Compute |z-score| = |median - mean| / stddev for the runs. - /// Returns `None` if fewer than 2 runs (stddev is undefined). - pub fn abs_z_score(&self) -> Option { - let n = self.runs.len(); - if n < 2 { - return None; - } - - let nanos: Vec = self.runs.iter().map(|d| d.as_nanos() as f64).collect(); - let mean = nanos.iter().sum::() / n as f64; - let variance = nanos.iter().map(|x| (x - mean).powi(2)).sum::() / (n - 1) as f64; - let stddev = variance.sqrt(); - - if stddev == 0.0 { - return Some(0.0); - } - - let median = self.median_run().as_nanos() as f64; - Some(((median - mean) / stddev).abs()) - } } #[derive(Serialize, Deserialize)] @@ -303,10 +282,6 @@ pub struct QueryMeasurementJson { pub unit: String, pub value: u128, pub all_runtimes: Vec, - /// Absolute z-score of the median relative to the mean: |median - mean| / stddev. - /// Indicates how representative the reported median is. `None` when fewer than 2 runs. - #[serde(skip_serializing_if = "Option::is_none")] - pub abs_z_score: Option, pub target: Target, pub commit_id: String, pub env_triple: TripleJson, @@ -338,7 +313,6 @@ impl ToJson for QueryMeasurement { unit: "ns".to_string(), value: self.median_run().as_nanos(), all_runtimes: self.runs.iter().map(|r| r.as_nanos()).collect_vec(), - abs_z_score: self.abs_z_score(), commit_id: GIT_COMMIT_ID.to_string(), target: self.target, env_triple: TripleJson { From 8368456a04b11f7806468f32f16d1774655d206a Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Mon, 2 Mar 2026 16:53:42 +0000 Subject: [PATCH 3/9] ruff format Signed-off-by: Adam Gutglick --- scripts/compare-benchmark-jsons.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/compare-benchmark-jsons.py b/scripts/compare-benchmark-jsons.py index 81385371faa..9b0d556cbe9 100644 --- a/scripts/compare-benchmark-jsons.py +++ b/scripts/compare-benchmark-jsons.py @@ -63,6 +63,7 @@ def extract_dataset_key(df): improvement_threshold = 1.0 - (threshold_pct / 100.0) # e.g., 0.7 for 30%, 0.9 for 10% regression_threshold = 1.0 + (threshold_pct / 100.0) # e.g., 1.3 for 30%, 1.1 for 10% + def compute_abs_z_score(runtimes): """Compute |median - mean| / stddev from a list of runtimes.""" if not isinstance(runtimes, list) or len(runtimes) < 2: From e860ced2967d18c66b36d0c6d3d0ef9575b548d9 Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Mon, 2 Mar 2026 17:06:08 +0000 Subject: [PATCH 4/9] Fix z-score table formatting Signed-off-by: Adam Gutglick --- scripts/compare-benchmark-jsons.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/compare-benchmark-jsons.py b/scripts/compare-benchmark-jsons.py index 9b0d556cbe9..cc06a544e24 100644 --- a/scripts/compare-benchmark-jsons.py +++ b/scripts/compare-benchmark-jsons.py @@ -219,9 +219,9 @@ def format_performance(ratio, target_name): } if has_z_pr: - table_dict["|z| PR"] = df3["abs_z_score_pr"] + table_dict["abs(z-score) PR"] = df3["abs_z_score_pr"] if has_z_base: - table_dict["|z| base"] = df3["abs_z_score_base"] + table_dict["abs(z-score) base"] = df3["abs_z_score_base"] table_dict["remark"] = df3["remark"] table_df = pd.DataFrame(table_dict) From ef308c41417a3ac9dd8b887ad520a808031746eb Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Thu, 5 Mar 2026 14:52:54 +0000 Subject: [PATCH 5/9] variance instead of z score Signed-off-by: Adam Gutglick --- scripts/compare-benchmark-jsons.py | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/scripts/compare-benchmark-jsons.py b/scripts/compare-benchmark-jsons.py index cc06a544e24..d6545b9fb60 100644 --- a/scripts/compare-benchmark-jsons.py +++ b/scripts/compare-benchmark-jsons.py @@ -64,31 +64,22 @@ def extract_dataset_key(df): regression_threshold = 1.0 + (threshold_pct / 100.0) # e.g., 1.3 for 30%, 1.1 for 10% -def compute_abs_z_score(runtimes): - """Compute |median - mean| / stddev from a list of runtimes.""" +def compute_variance(runtimes): + """Compute sample variance from a list of runtimes.""" if not isinstance(runtimes, list) or len(runtimes) < 2: return float("nan") n = len(runtimes) mean = sum(runtimes) / n - variance = sum((x - mean) ** 2 for x in runtimes) / (n - 1) - stddev = math.sqrt(variance) - if stddev == 0: - return 0.0 - sorted_rt = sorted(runtimes) - if n % 2 == 1: - median = sorted_rt[n // 2] - else: - median = (sorted_rt[n // 2 - 1] + sorted_rt[n // 2]) / 2 - return abs((median - mean) / stddev) + return sum((x - mean) ** 2 for x in runtimes) / (n - 1) -# Compute |z-score| from all_runtimes when available +# Compute variance from all_runtimes when available has_z_pr = "all_runtimes_pr" in df3.columns has_z_base = "all_runtimes_base" in df3.columns if has_z_pr: - df3["abs_z_score_pr"] = df3["all_runtimes_pr"].apply(compute_abs_z_score) + df3["variance_pr"] = df3["all_runtimes_pr"].apply(compute_variance) if has_z_base: - df3["abs_z_score_base"] = df3["all_runtimes_base"].apply(compute_abs_z_score) + df3["variance_base"] = df3["all_runtimes_base"].apply(compute_variance) # Generate summary statistics df3["ratio"] = df3["value_pr"] / df3["value_base"] @@ -219,9 +210,9 @@ def format_performance(ratio, target_name): } if has_z_pr: - table_dict["abs(z-score) PR"] = df3["abs_z_score_pr"] + table_dict["variance PR"] = df3["variance_pr"] if has_z_base: - table_dict["abs(z-score) base"] = df3["abs_z_score_base"] + table_dict["variance base"] = df3["variance_base"] table_dict["remark"] = df3["remark"] table_df = pd.DataFrame(table_dict) From 284722f1c110a115b991f5560873f2aa2b9782b5 Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Fri, 6 Mar 2026 13:30:42 +0000 Subject: [PATCH 6/9] cv Signed-off-by: Adam Gutglick --- scripts/compare-benchmark-jsons.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/scripts/compare-benchmark-jsons.py b/scripts/compare-benchmark-jsons.py index d6545b9fb60..499294bfd42 100644 --- a/scripts/compare-benchmark-jsons.py +++ b/scripts/compare-benchmark-jsons.py @@ -64,22 +64,25 @@ def extract_dataset_key(df): regression_threshold = 1.0 + (threshold_pct / 100.0) # e.g., 1.3 for 30%, 1.1 for 10% -def compute_variance(runtimes): - """Compute sample variance from a list of runtimes.""" +def compute_cv_pct(runtimes): + """Compute coefficient of variation (std_dev / mean * 100) as a percentage.""" if not isinstance(runtimes, list) or len(runtimes) < 2: return float("nan") n = len(runtimes) mean = sum(runtimes) / n - return sum((x - mean) ** 2 for x in runtimes) / (n - 1) + if mean == 0: + return float("nan") + variance = sum((x - mean) ** 2 for x in runtimes) / (n - 1) + return (variance**0.5 / mean) * 100 -# Compute variance from all_runtimes when available +# Compute CV% from all_runtimes when available has_z_pr = "all_runtimes_pr" in df3.columns has_z_base = "all_runtimes_base" in df3.columns if has_z_pr: - df3["variance_pr"] = df3["all_runtimes_pr"].apply(compute_variance) + df3["cv_pct_pr"] = df3["all_runtimes_pr"].apply(compute_cv_pct) if has_z_base: - df3["variance_base"] = df3["all_runtimes_base"].apply(compute_variance) + df3["cv_pct_base"] = df3["all_runtimes_base"].apply(compute_cv_pct) # Generate summary statistics df3["ratio"] = df3["value_pr"] / df3["value_base"] @@ -210,9 +213,9 @@ def format_performance(ratio, target_name): } if has_z_pr: - table_dict["variance PR"] = df3["variance_pr"] + table_dict["CV% PR"] = df3["cv_pct_pr"] if has_z_base: - table_dict["variance base"] = df3["variance_base"] + table_dict["CV% base"] = df3["cv_pct_base"] table_dict["remark"] = df3["remark"] table_df = pd.DataFrame(table_dict) From 0c50cadadcdabd2873c998a6891865ac82e2b7fe Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Tue, 10 Mar 2026 17:28:55 +0000 Subject: [PATCH 7/9] better math and display Signed-off-by: Adam Gutglick --- scripts/compare-benchmark-jsons.py | 116 +++++++++++++++++++++++++++-- 1 file changed, 109 insertions(+), 7 deletions(-) diff --git a/scripts/compare-benchmark-jsons.py b/scripts/compare-benchmark-jsons.py index 499294bfd42..b1ddcf66a20 100644 --- a/scripts/compare-benchmark-jsons.py +++ b/scripts/compare-benchmark-jsons.py @@ -83,6 +83,24 @@ def compute_cv_pct(runtimes): df3["cv_pct_pr"] = df3["all_runtimes_pr"].apply(compute_cv_pct) if has_z_base: df3["cv_pct_base"] = df3["all_runtimes_base"].apply(compute_cv_pct) +if has_z_pr or has_z_base: + cv_columns = [column for column in ["cv_pct_pr", "cv_pct_base"] if column in df3.columns] + df3["cv_pct_max"] = df3[cv_columns].max(axis=1, skipna=True) + + +def describe_noise(cv_pct): + """Bucket runtime noise into labels that are easy to scan in GitHub tables.""" + if pd.isna(cv_pct): + return "unknown" + if cv_pct < 5: + return "low" + if cv_pct < 15: + return "medium" + return "high" + + +if "cv_pct_max" in df3.columns: + df3["noise"] = df3["cv_pct_max"].apply(describe_noise) # Generate summary statistics df3["ratio"] = df3["value_pr"] / df3["value_base"] @@ -118,10 +136,75 @@ def calculate_geo_mean(df): return float("nan") +def calculate_run_drift_metrics(df): + """Summarize common-mode movement across the whole benchmark run. + + We work in log-ratio space because ratios compose multiplicatively: + a 10% slowdown (1.10x) and a 10% speedup (0.90x) are roughly symmetric + once transformed with log(). That makes the median log-ratio a robust + estimate of "the whole run was faster/slower than usual". + """ + valid_ratios = [r for r in df["ratio"] if r > 0 and not pd.isna(r)] + if not valid_ratios: + return { + "drift_ratio": float("nan"), + "same_direction_pct": float("nan"), + "residual_mad_pct": float("nan"), + "is_baseline_suspect": False, + "drift_level": "unknown", + } + + log_ratios = pd.Series([math.log(r) for r in valid_ratios]) + median_log_ratio = float(log_ratios.median()) + drift_ratio = math.exp(median_log_ratio) + + # Count how often benchmarks move in the same direction as the run-wide drift. + # This distinguishes "everything got faster/slower together" from a mixed run + # with a similar central tendency. + if median_log_ratio < 0: + same_direction_pct = float((log_ratios < 0).mean() * 100) + elif median_log_ratio > 0: + same_direction_pct = float((log_ratios > 0).mean() * 100) + else: + same_direction_pct = float((log_ratios == 0).mean() * 100) + + # Residual MAD measures how tightly benchmarks cluster around the run-wide + # drift. Small residual spread plus broad agreement is a strong indicator + # that the baseline itself is shifted rather than the PR changing specific + # benchmarks independently. + residual_logs = log_ratios - median_log_ratio + residual_mad = float(residual_logs.abs().median()) + residual_mad_pct = (math.exp(residual_mad) - 1.0) * 100 + + drift_threshold_pct = 5.0 + consistency_threshold_pct = 80.0 + residual_spread_threshold_pct = 5.0 + is_baseline_suspect = ( + abs(drift_ratio - 1.0) * 100 >= drift_threshold_pct + and same_direction_pct >= consistency_threshold_pct + and residual_mad_pct <= residual_spread_threshold_pct + ) + if is_baseline_suspect: + drift_level = "large" + elif abs(drift_ratio - 1.0) * 100 >= drift_threshold_pct: + drift_level = "noticeable" + else: + drift_level = "small" + + return { + "drift_ratio": drift_ratio, + "same_direction_pct": same_direction_pct, + "residual_mad_pct": residual_mad_pct, + "is_baseline_suspect": is_baseline_suspect, + "drift_level": drift_level, + } + + vortex_geo_mean_ratio = calculate_geo_mean(vortex_df) duckdb_vortex_geo_mean_ratio = calculate_geo_mean(duckdb_vortex_df) datafusion_vortex_geo_mean_ratio = calculate_geo_mean(datafusion_vortex_df) parquet_geo_mean_ratio = calculate_geo_mean(parquet_df) +run_drift_metrics = calculate_run_drift_metrics(df3) # Find best and worst changes for vortex-only results vortex_valid_ratios = vortex_df["ratio"].dropna() @@ -176,6 +259,14 @@ def format_performance(ratio, target_name): "## Summary", "", f"- **Overall**: {overall_performance}", + ( + f"- **Run drift**: {run_drift_metrics['drift_ratio']:.3f}x, " + f"{run_drift_metrics['drift_level']} run-wide shift " + f"({run_drift_metrics['same_direction_pct']:.0f}% aligned, " + f"residual MAD {run_drift_metrics['residual_mad_pct']:.1f}%)" + if not pd.isna(run_drift_metrics["drift_ratio"]) + else "- **Run drift**: no data" + ), ] # Only add vortex-specific sections if we have vortex data @@ -203,19 +294,30 @@ def format_performance(ratio, target_name): ] ) +if run_drift_metrics["is_baseline_suspect"]: + summary_lines.append("- **Baseline signal**: suspect common-mode drift; this run moved together more than expected") + +# Convert rendered timing values from ns to ms to keep the GitHub table narrower. +# This affects display only; all comparison math above stays in the original units. +display_pr_values = df3["value_pr"].copy() +display_base_values = df3["value_base"].copy() +display_units = df3["unit_base"].copy() +ns_mask = display_units == "ns" +display_pr_values.loc[ns_mask] = display_pr_values.loc[ns_mask] / 1_000_000 +display_base_values.loc[ns_mask] = display_base_values.loc[ns_mask] / 1_000_000 +display_units.loc[ns_mask] = "ms" + # Build table table_dict = { "name": df3["name"], - f"PR {pr_commit_id[:8]}": df3["value_pr"], - f"base {base_commit_id[:8]}": df3["value_base"], + f"PR {pr_commit_id[:8]}": display_pr_values, + f"base {base_commit_id[:8]}": display_base_values, "ratio (PR/base)": df3["ratio"], - "unit": df3["unit_base"], + "unit": display_units, } -if has_z_pr: - table_dict["CV% PR"] = df3["cv_pct_pr"] -if has_z_base: - table_dict["CV% base"] = df3["cv_pct_base"] +if "cv_pct_max" in df3.columns: + table_dict["noise"] = df3["noise"] table_dict["remark"] = df3["remark"] table_df = pd.DataFrame(table_dict) From 1192444d540bc18bcd4d9b22b038e80f2b564a5a Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Tue, 10 Mar 2026 17:38:30 +0000 Subject: [PATCH 8/9] fix script Signed-off-by: Adam Gutglick --- scripts/compare-benchmark-jsons.py | 262 ++++++++++++++++------------- 1 file changed, 143 insertions(+), 119 deletions(-) diff --git a/scripts/compare-benchmark-jsons.py b/scripts/compare-benchmark-jsons.py index b1ddcf66a20..68df62f306e 100644 --- a/scripts/compare-benchmark-jsons.py +++ b/scripts/compare-benchmark-jsons.py @@ -14,6 +14,14 @@ import pandas as pd +S3_THRESHOLD_PCT = 30 +DEFAULT_THRESHOLD_PCT = 10 +NOISE_LOW_MAX_CV_PCT = 5.0 +NOISE_MEDIUM_MAX_CV_PCT = 15.0 +DRIFT_NOTICEABLE_THRESHOLD_PCT = 5.0 +DRIFT_CONSISTENCY_THRESHOLD_PCT = 80.0 +DRIFT_RESIDUAL_SPREAD_THRESHOLD_PCT = 5.0 + # Check if benchmark name argument is provided (will be added from workflow) benchmark_name = sys.argv[3] if len(sys.argv) > 3 else "" @@ -59,7 +67,7 @@ def extract_dataset_key(df): # Determine threshold based on benchmark name # Use 30% threshold for S3 benchmarks, 10% for others is_s3_benchmark = "s3" in benchmark_name.lower() -threshold_pct = 30 if is_s3_benchmark else 10 +threshold_pct = S3_THRESHOLD_PCT if is_s3_benchmark else DEFAULT_THRESHOLD_PCT improvement_threshold = 1.0 - (threshold_pct / 100.0) # e.g., 0.7 for 30%, 0.9 for 10% regression_threshold = 1.0 + (threshold_pct / 100.0) # e.g., 1.3 for 30%, 1.1 for 10% @@ -77,13 +85,13 @@ def compute_cv_pct(runtimes): # Compute CV% from all_runtimes when available -has_z_pr = "all_runtimes_pr" in df3.columns -has_z_base = "all_runtimes_base" in df3.columns -if has_z_pr: +has_runtimes_pr = "all_runtimes_pr" in df3.columns +has_runtimes_base = "all_runtimes_base" in df3.columns +if has_runtimes_pr: df3["cv_pct_pr"] = df3["all_runtimes_pr"].apply(compute_cv_pct) -if has_z_base: +if has_runtimes_base: df3["cv_pct_base"] = df3["all_runtimes_base"].apply(compute_cv_pct) -if has_z_pr or has_z_base: +if has_runtimes_pr or has_runtimes_base: cv_columns = [column for column in ["cv_pct_pr", "cv_pct_base"] if column in df3.columns] df3["cv_pct_max"] = df3[cv_columns].max(axis=1, skipna=True) @@ -92,9 +100,9 @@ def describe_noise(cv_pct): """Bucket runtime noise into labels that are easy to scan in GitHub tables.""" if pd.isna(cv_pct): return "unknown" - if cv_pct < 5: + if cv_pct < NOISE_LOW_MAX_CV_PCT: return "low" - if cv_pct < 15: + if cv_pct < NOISE_MEDIUM_MAX_CV_PCT: return "medium" return "high" @@ -176,17 +184,14 @@ def calculate_run_drift_metrics(df): residual_mad = float(residual_logs.abs().median()) residual_mad_pct = (math.exp(residual_mad) - 1.0) * 100 - drift_threshold_pct = 5.0 - consistency_threshold_pct = 80.0 - residual_spread_threshold_pct = 5.0 is_baseline_suspect = ( - abs(drift_ratio - 1.0) * 100 >= drift_threshold_pct - and same_direction_pct >= consistency_threshold_pct - and residual_mad_pct <= residual_spread_threshold_pct + abs(drift_ratio - 1.0) * 100 >= DRIFT_NOTICEABLE_THRESHOLD_PCT + and same_direction_pct >= DRIFT_CONSISTENCY_THRESHOLD_PCT + and residual_mad_pct <= DRIFT_RESIDUAL_SPREAD_THRESHOLD_PCT ) if is_baseline_suspect: drift_level = "large" - elif abs(drift_ratio - 1.0) * 100 >= drift_threshold_pct: + elif abs(drift_ratio - 1.0) * 100 >= DRIFT_NOTICEABLE_THRESHOLD_PCT: drift_level = "noticeable" else: drift_level = "small" @@ -200,40 +205,6 @@ def calculate_run_drift_metrics(df): } -vortex_geo_mean_ratio = calculate_geo_mean(vortex_df) -duckdb_vortex_geo_mean_ratio = calculate_geo_mean(duckdb_vortex_df) -datafusion_vortex_geo_mean_ratio = calculate_geo_mean(datafusion_vortex_df) -parquet_geo_mean_ratio = calculate_geo_mean(parquet_df) -run_drift_metrics = calculate_run_drift_metrics(df3) - -# Find best and worst changes for vortex-only results -vortex_valid_ratios = vortex_df["ratio"].dropna() -if len(vortex_valid_ratios) > 0: - # Best improvement: smallest ratio (< 1.0, fastest performance) - improvements = vortex_valid_ratios[vortex_valid_ratios < 1.0] - if len(improvements) > 0: - best_idx = improvements.idxmin() - best_improvement = f"{vortex_df.loc[best_idx, 'name']} ({vortex_df.loc[best_idx, 'ratio']:.3f}x)" - else: - best_improvement = "No improvements" - - # Worst regression: largest ratio (> 1.0, slowest performance) - regressions = vortex_valid_ratios[vortex_valid_ratios > 1.0] - if len(regressions) > 0: - worst_idx = regressions.idxmax() - worst_regression = f"{vortex_df.loc[worst_idx, 'name']} ({vortex_df.loc[worst_idx, 'ratio']:.3f}x)" - else: - worst_regression = "No regressions" -else: - best_improvement = "No valid vortex comparisons" - worst_regression = "No valid vortex comparisons" - -# Count significant changes for vortex-only results -significant_improvements = (vortex_df["ratio"] < improvement_threshold).sum() -significant_regressions = (vortex_df["ratio"] > regression_threshold).sum() - - -# Build summary def format_performance(ratio, target_name): if pd.isna(ratio): return f"no {target_name.lower()} data" @@ -248,79 +219,132 @@ def format_performance(ratio, target_name): return f"{ratio:.3f}x {emoji}" -overall_performance = "no data" if pd.isna(geo_mean_ratio) else format_performance(geo_mean_ratio, "overall") -vortex_performance = format_performance(vortex_geo_mean_ratio, "vortex") -duckdb_vortex_performance = format_performance(duckdb_vortex_geo_mean_ratio, "duckdb:vortex") -datafusion_vortex_performance = format_performance(datafusion_vortex_geo_mean_ratio, "datafusion:vortex") -parquet_performance = format_performance(parquet_geo_mean_ratio, "parquet") - - -summary_lines = [ - "## Summary", - "", - f"- **Overall**: {overall_performance}", - ( - f"- **Run drift**: {run_drift_metrics['drift_ratio']:.3f}x, " - f"{run_drift_metrics['drift_level']} run-wide shift " - f"({run_drift_metrics['same_direction_pct']:.0f}% aligned, " - f"residual MAD {run_drift_metrics['residual_mad_pct']:.1f}%)" - if not pd.isna(run_drift_metrics["drift_ratio"]) - else "- **Run drift**: no data" - ), -] - -# Only add vortex-specific sections if we have vortex data -if len(vortex_df) > 0: - summary_lines.extend([f"- **Vortex**: {vortex_performance}"]) - -if len(parquet_df) > 0: - summary_lines.extend([f"- **Parquet**: {parquet_performance}"]) - -# Only add duckdb:vortex section if we have that data -if len(duckdb_vortex_df) > 0: - summary_lines.append(f"- **duckdb:vortex**: {duckdb_vortex_performance}") - -# Only add datafusion:vortex section if we have that data -if len(datafusion_vortex_df) > 0: - summary_lines.append(f"- **datafusion:vortex**: {datafusion_vortex_performance}") - -# Only add best/worst if we have vortex data -if len(vortex_df) > 0: - summary_lines.extend( - [ - f"- **Best**: {best_improvement}", - f"- **Worst**: {worst_regression}", - f"- **Significant (>{threshold_pct}%)**: {significant_improvements}↑ {significant_regressions}↓", - ] - ) +def build_summary_lines( + overall_ratio, + vortex_df, + parquet_df, + duckdb_vortex_df, + datafusion_vortex_df, + threshold_pct, + run_drift_metrics, +): + """Build markdown summary lines from precomputed benchmark metrics.""" + vortex_geo_mean_ratio = calculate_geo_mean(vortex_df) + duckdb_vortex_geo_mean_ratio = calculate_geo_mean(duckdb_vortex_df) + datafusion_vortex_geo_mean_ratio = calculate_geo_mean(datafusion_vortex_df) + parquet_geo_mean_ratio = calculate_geo_mean(parquet_df) + + overall_performance = "no data" if pd.isna(overall_ratio) else format_performance(overall_ratio, "overall") + vortex_performance = format_performance(vortex_geo_mean_ratio, "vortex") + duckdb_vortex_performance = format_performance(duckdb_vortex_geo_mean_ratio, "duckdb:vortex") + datafusion_vortex_performance = format_performance(datafusion_vortex_geo_mean_ratio, "datafusion:vortex") + parquet_performance = format_performance(parquet_geo_mean_ratio, "parquet") + + summary_lines = [ + "## Summary", + "", + f"- **Overall**: {overall_performance}", + ( + f"- **Run drift**: {run_drift_metrics['drift_ratio']:.3f}x, " + f"{run_drift_metrics['drift_level']} whole-run shift " + f"({run_drift_metrics['same_direction_pct']:.0f}% aligned, " + f"residual MAD {run_drift_metrics['residual_mad_pct']:.1f}%)" + if not pd.isna(run_drift_metrics["drift_ratio"]) + else "- **Run drift**: no data" + ), + ] + + if len(vortex_df) > 0: + summary_lines.append(f"- **Vortex**: {vortex_performance}") + + if len(parquet_df) > 0: + summary_lines.append(f"- **Parquet**: {parquet_performance}") + + if len(duckdb_vortex_df) > 0: + summary_lines.append(f"- **duckdb:vortex**: {duckdb_vortex_performance}") + + if len(datafusion_vortex_df) > 0: + summary_lines.append(f"- **datafusion:vortex**: {datafusion_vortex_performance}") + + if len(vortex_df) > 0: + vortex_valid_ratios = vortex_df["ratio"].dropna() + if len(vortex_valid_ratios) > 0: + improvements = vortex_valid_ratios[vortex_valid_ratios < 1.0] + if len(improvements) > 0: + best_idx = improvements.idxmin() + best_improvement = f"{vortex_df.loc[best_idx, 'name']} ({vortex_df.loc[best_idx, 'ratio']:.3f}x)" + else: + best_improvement = "No improvements" + + regressions = vortex_valid_ratios[vortex_valid_ratios > 1.0] + if len(regressions) > 0: + worst_idx = regressions.idxmax() + worst_regression = f"{vortex_df.loc[worst_idx, 'name']} ({vortex_df.loc[worst_idx, 'ratio']:.3f}x)" + else: + worst_regression = "No regressions" + else: + best_improvement = "No valid vortex comparisons" + worst_regression = "No valid vortex comparisons" + + significant_improvements = (vortex_df["ratio"] < improvement_threshold).sum() + significant_regressions = (vortex_df["ratio"] > regression_threshold).sum() + summary_lines.extend( + [ + f"- **Best**: {best_improvement}", + f"- **Worst**: {worst_regression}", + f"- **Significant (>{threshold_pct}%)**: {significant_improvements}↑ {significant_regressions}↓", + ] + ) -if run_drift_metrics["is_baseline_suspect"]: - summary_lines.append("- **Baseline signal**: suspect common-mode drift; this run moved together more than expected") - -# Convert rendered timing values from ns to ms to keep the GitHub table narrower. -# This affects display only; all comparison math above stays in the original units. -display_pr_values = df3["value_pr"].copy() -display_base_values = df3["value_base"].copy() -display_units = df3["unit_base"].copy() -ns_mask = display_units == "ns" -display_pr_values.loc[ns_mask] = display_pr_values.loc[ns_mask] / 1_000_000 -display_base_values.loc[ns_mask] = display_base_values.loc[ns_mask] / 1_000_000 -display_units.loc[ns_mask] = "ms" - -# Build table -table_dict = { - "name": df3["name"], - f"PR {pr_commit_id[:8]}": display_pr_values, - f"base {base_commit_id[:8]}": display_base_values, - "ratio (PR/base)": df3["ratio"], - "unit": display_units, -} + if run_drift_metrics["is_baseline_suspect"]: + summary_lines.append("- **Baseline**: likely unreliable for this run; most benchmarks shifted together") + + return summary_lines -if "cv_pct_max" in df3.columns: - table_dict["noise"] = df3["noise"] -table_dict["remark"] = df3["remark"] -table_df = pd.DataFrame(table_dict) +def build_display_table(df, pr_commit_id, base_commit_id): + """Build the markdown table with display-friendly units and columns. + + This keeps presentation-only concerns out of the metric computation above. + """ + display_df = df.copy() + + # Convert rendered timing values from ns to ms to keep the GitHub table narrower. + # This affects display only; all comparison math above stays in the original units. + display_df["value_pr_display"] = display_df["value_pr"].astype(float) + display_df["value_base_display"] = display_df["value_base"].astype(float) + display_df["unit_display"] = display_df["unit_base"].copy() + ns_mask = display_df["unit_display"] == "ns" + display_df.loc[ns_mask, "value_pr_display"] = display_df.loc[ns_mask, "value_pr_display"] / 1_000_000 + display_df.loc[ns_mask, "value_base_display"] = display_df.loc[ns_mask, "value_base_display"] / 1_000_000 + display_df.loc[ns_mask, "unit_display"] = "ms" + + table_dict = { + "name": display_df["name"], + f"PR {pr_commit_id[:8]}": display_df["value_pr_display"], + f"base {base_commit_id[:8]}": display_df["value_base_display"], + "ratio (PR/base)": display_df["ratio"], + "unit": display_df["unit_display"], + } + + if "noise" in display_df.columns: + table_dict["noise"] = display_df["noise"] + + table_dict["remark"] = display_df["remark"] + return pd.DataFrame(table_dict) + + +run_drift_metrics = calculate_run_drift_metrics(df3) +summary_lines = build_summary_lines( + geo_mean_ratio, + vortex_df, + parquet_df, + duckdb_vortex_df, + datafusion_vortex_df, + threshold_pct, + run_drift_metrics, +) +table_df = build_display_table(df3, pr_commit_id, base_commit_id) # Output complete formatted markdown print("\n".join(summary_lines)) From 7eec2bad6d1962f2881c6271f9ce7e37749c159f Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Tue, 10 Mar 2026 17:50:10 +0000 Subject: [PATCH 9/9] Some formatting changes Signed-off-by: Adam Gutglick --- scripts/compare-benchmark-jsons.py | 41 ++++++++++++++++++------------ 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/scripts/compare-benchmark-jsons.py b/scripts/compare-benchmark-jsons.py index 68df62f306e..d67a63f0a3b 100644 --- a/scripts/compare-benchmark-jsons.py +++ b/scripts/compare-benchmark-jsons.py @@ -240,19 +240,31 @@ def build_summary_lines( datafusion_vortex_performance = format_performance(datafusion_vortex_geo_mean_ratio, "datafusion:vortex") parquet_performance = format_performance(parquet_geo_mean_ratio, "parquet") - summary_lines = [ - "## Summary", - "", - f"- **Overall**: {overall_performance}", - ( - f"- **Run drift**: {run_drift_metrics['drift_ratio']:.3f}x, " - f"{run_drift_metrics['drift_level']} whole-run shift " - f"({run_drift_metrics['same_direction_pct']:.0f}% aligned, " - f"residual MAD {run_drift_metrics['residual_mad_pct']:.1f}%)" - if not pd.isna(run_drift_metrics["drift_ratio"]) - else "- **Run drift**: no data" - ), - ] + summary_lines = [] + + if run_drift_metrics["is_baseline_suspect"]: + summary_lines.extend( + [ + "- **Baseline**: likely unreliable for this run; most benchmarks shifted together", + "", + ] + ) + + summary_lines.extend( + [ + "## Summary", + "", + f"- **Overall**: {overall_performance}", + ( + f"- **Run drift**: {run_drift_metrics['drift_ratio']:.3f}x, " + f"**{run_drift_metrics['drift_level']}** whole-run shift " + f"({run_drift_metrics['same_direction_pct']:.0f}% aligned, " + f"residual MAD {run_drift_metrics['residual_mad_pct']:.1f}%)" + if not pd.isna(run_drift_metrics["drift_ratio"]) + else "- **Run drift**: no data" + ), + ] + ) if len(vortex_df) > 0: summary_lines.append(f"- **Vortex**: {vortex_performance}") @@ -296,9 +308,6 @@ def build_summary_lines( ] ) - if run_drift_metrics["is_baseline_suspect"]: - summary_lines.append("- **Baseline**: likely unreliable for this run; most benchmarks shifted together") - return summary_lines