From c6402fb9a28408718a9f6768f6acf574bb6d6f42 Mon Sep 17 00:00:00 2001 From: "Yuanyuan Tian (from Dev Box)" Date: Thu, 19 Mar 2026 16:08:32 +0800 Subject: [PATCH 01/31] Migrate DiskANN benchmark pipeline from ADO to GitHub Actions - Add benchmarks.yml workflow using workflow_dispatch, comparing current branch against a configurable baseline ref - Add compare_disk_index_json_output.py to diff benchmark crate JSON outputs into a CSV suitable for benchmark_result_parse.py - Add benchmark_result_parse.py for validating results and posting PR comments - Add wikipedia-100K-disk-index.json benchmark config using the public Wikipedia-100K dataset from big-ann-benchmarks (100K Cohere embeddings, 768-dim, cosine distance) to replace internal ADO datasets --- .github/scripts/benchmark_result_parse.py | 507 ++++++++++++++++++ .../scripts/compare_disk_index_json_output.py | 258 +++++++++ .github/workflows/benchmarks.yml | 318 +++++++++++ .../wikipedia-100K-disk-index.json | 40 ++ 4 files changed, 1123 insertions(+) create mode 100644 .github/scripts/benchmark_result_parse.py create mode 100644 .github/scripts/compare_disk_index_json_output.py create mode 100644 .github/workflows/benchmarks.yml create mode 100644 diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json diff --git a/.github/scripts/benchmark_result_parse.py b/.github/scripts/benchmark_result_parse.py new file mode 100644 index 000000000..0308b2990 --- /dev/null +++ b/.github/scripts/benchmark_result_parse.py @@ -0,0 +1,507 @@ +#!/usr/bin/env python3 +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT license. + +""" +Benchmark Result Parser for GitHub Actions + +Parses benchmark CSV results and validates against thresholds. +Posts comments to GitHub PRs when regressions are detected. 
+ +Migrated from ADO: .pipelines/templates/BenchmarkResultParse.py + +Usage: + python benchmark_result_parse.py --mode pr --file results.csv + python benchmark_result_parse.py --mode aa --file results.csv --data search + +Environment Variables (for PR comments): + GITHUB_TOKEN: GitHub token for API access + GITHUB_REPOSITORY: Owner/repo (e.g., "microsoft/DiskANN") + GITHUB_PR_NUMBER: Pull request number + GITHUB_RUN_ID: Workflow run ID for linking to logs +""" + +import csv +import os +import sys +import argparse +import json +from typing import Any + +# Optional: requests for posting PR comments +try: + import requests + HAS_REQUESTS = True +except ImportError: + HAS_REQUESTS = False + + +# ============================================================================= +# Data Structures +# ============================================================================= + +# Template for full benchmark data (build + search) +DATA_TEMPLATE_FULL = { + "DiskIndexBuild-PqConstruction": { + "duration_seconds": [], + "peak_memory_usage": [] + }, + "DiskIndexBuild-InmemIndexBuild": { + "duration_seconds": [], + "peak_memory_usage": [] + }, + "search_disk_index-search_completed": { + "duration_seconds": [], + "peak_memory_usage": [] + }, + "disk_index_perf_test": { + "total_duration_seconds": [], + }, + "index-build statistics": { + "total_time": [], + "total_comparisons": [], + "search_hops": [] + }, + "search-with-L=2000-bw=4": { + "latency_95": [], + "mean_latency": [], + "mean_io_time": [], + "mean_cpus": [], + "qps": [], + "mean_ios": [], + "mean_comps": [], + "mean_hops": [], + "recall": [] + } +} + +# Template for search-only benchmark data +DATA_TEMPLATE_SEARCH = { + "search_disk_index-search_completed": { + "duration_seconds": [], + "peak_memory_usage": [] + }, + "disk_index_perf_test": { + "total_duration_seconds": [], + }, + "search-with-L=2000-bw=4": { + "latency_95": [], + "mean_latency": [], + "mean_io_time": [], + "mean_cpus": [], + "qps": [], + "mean_ios": [], 
+ "mean_comps": [], + "mean_hops": [], + "recall": [] + } +} + +# Thresholds for benchmark values +# Format: [threshold_percentage, direction, contract_value] +# - threshold_percentage: Maximum allowed deviation percentage +# - direction: 'GT' = higher is better, 'LT' = lower is better +# - contract_value: Promised performance value (empty string if none) +# +# For 'GT' metrics (like QPS, recall): regression if value decreases beyond threshold +# For 'LT' metrics (like latency, memory): regression if value increases beyond threshold +DATA_THRESHOLDS = { + "DiskIndexBuild-PqConstruction": { + "duration_seconds": [10, 'LT', ""], + "peak_memory_usage": [10, 'LT', ""] + }, + "DiskIndexBuild-InmemIndexBuild": { + "duration_seconds": [10, 'LT', ""], + "peak_memory_usage": [10, 'LT', ""] + }, + "search_disk_index-search_completed": { + "duration_seconds": [10, 'LT', ""], + "peak_memory_usage": [10, 'LT', 1.42] + }, + "disk_index_perf_test": { + "total_duration_seconds": [10, 'LT', ""], + }, + "index-build statistics": { + "total_time": [10, 'LT', 1206], + "total_comparisons": [1, 'LT', ""], + "search_hops": [1, 'LT', ""] + }, + "search-with-L=2000-bw=4": { + "latency_95": [10, 'LT', ""], + "mean_latency": [10, 'LT', ""], + "mean_io_time": [10, 'LT', ""], + "mean_cpus": [10, 'LT', ""], + "qps": [10, 'GT', 29], + "mean_ios": [1, 'LT', 2026], + "mean_comps": [1, 'LT', 50000], + "mean_hops": [1, 'LT', ""], + "recall": [1, 'GT', 95.1] + } +} + + +# ============================================================================= +# CSV Parsing +# ============================================================================= + +def parse_csv(file_path: str, data: dict[str, dict[str, list]]) -> dict[str, dict[str, list]]: + """ + Parse benchmark CSV file and populate data structure. 
+ + CSV format expected: + Column 0: (unused) + Column 1: Category name (e.g., "search-with-L=2000-bw=4") + Column 2: Metric name (e.g., "qps") + Column 3: Current value + Column 4: Baseline value + Column 5: Change percentage + """ + with open(file_path, 'r', encoding='utf-8') as f: + reader = csv.reader(f) + next(reader) # Skip header row + + current_key = None + for row in reader: + if len(row) < 6: + continue + + # Column 1 contains category name (only set on first row of category) + if row[1]: + current_key = row[1] + elif current_key and current_key in data: + metric_name = row[2] + if metric_name in data[current_key]: + # Append: [current_value, baseline_value, change_percentage] + data[current_key][metric_name].append(row[3]) # current + data[current_key][metric_name].append(row[4]) # baseline + data[current_key][metric_name].append(row[5]) # change % + + return data + + +def get_data_template(data_type: str) -> dict[str, dict[str, list]]: + """Get a fresh copy of the data template.""" + import copy + if data_type == 'search': + return copy.deepcopy(DATA_TEMPLATE_SEARCH) + return copy.deepcopy(DATA_TEMPLATE_FULL) + + +# ============================================================================= +# Threshold Checking +# ============================================================================= + +def get_target_change_range(threshold: float, direction: str, mode: str) -> tuple[float, float]: + """ + Calculate acceptable change range based on threshold and direction. 
+ + Args: + threshold: Maximum allowed deviation percentage + direction: 'GT' (higher is better) or 'LT' (lower is better) + mode: 'aa' (A/A test, symmetric) or 'pr' (PR test, directional) + + Returns: + Tuple of (min_allowed, max_allowed) change percentages + """ + if mode == 'aa': + # A/A test: symmetric threshold + return (-threshold, threshold) + else: + # PR test: directional threshold + if direction == 'GT': + # Higher is better: allow any improvement, flag regressions + return (-threshold, float('inf')) + else: + # Lower is better: allow any improvement (negative change), flag increases + return (float('-inf'), threshold) + + +def format_interval(start: float, end: float) -> str: + """Format a numeric interval as a string.""" + start_str = '-inf' if start == float('-inf') else f"{start}%" + end_str = 'inf' if end == float('inf') else f"{end}%" + return f"({start_str} - {end_str})" + + +def is_change_threshold_failed(change: float, target_range: tuple[float, float]) -> bool: + """Check if the change exceeds the allowed threshold range.""" + return change < target_range[0] or change > target_range[1] + + +def is_promise_broken(current_value: float, target_value: Any, direction: str) -> tuple[bool, str]: + """ + Check if the current value violates a promised contract value. 
+ + Returns: + Tuple of (is_broken, formatted_target_value) + """ + if target_value == "": + return False, "N/A" + + target_value = float(target_value) + + if direction == 'GT': + # Higher is better: current should be >= target + if current_value < target_value: + return True, f"> {target_value}" + else: + # Lower is better: current should be <= target + if current_value > target_value: + return True, f"< {target_value}" + + return False, str(target_value) + + +def get_outcome_message(threshold_failed: bool, promise_broken: bool) -> str: + """Generate human-readable outcome message.""" + if threshold_failed and promise_broken: + return 'Regression detected, Promise broken' + elif promise_broken: + return 'Promise broken' + elif threshold_failed: + return 'Regression detected' + return 'OK' + + +def check_thresholds( + data: dict[str, dict[str, list]], + thresholds: dict[str, dict[str, list]], + mode: str, + run_id: str | None = None +) -> tuple[bool, str]: + """ + Check all metrics against their thresholds. 
+ + Returns: + Tuple of (has_failures, failure_report_markdown) + """ + failed_rows = [] + + for category in data: + for metric in data[category]: + # Skip metrics without thresholds defined + if category not in thresholds or metric not in thresholds[category]: + print(f"Skipping {category}/{metric} - no threshold defined") + continue + + values = data[category][metric] + if not values: + print(f"ERROR: {category}/{metric} has no data") + return True, f"Missing data for {category}/{metric}" + + # Parse values: [current, baseline, change%] + try: + value_current = float(values[0]) + value_baseline = float(values[1]) + change = float(values[2]) if values[2] else 0.0 + except (ValueError, IndexError) as e: + print(f"ERROR: Failed to parse {category}/{metric}: {e}") + return True, f"Parse error for {category}/{metric}" + + # Get threshold config + threshold_config = thresholds[category][metric] + threshold_pct = threshold_config[0] + direction = threshold_config[1] + contract_value = threshold_config[2] + + # Check thresholds + target_range = get_target_change_range(threshold_pct, direction, mode) + threshold_failed = is_change_threshold_failed(change, target_range) + promise_broken, target_formatted = is_promise_broken(value_current, contract_value, direction) + + if threshold_failed: + print(f"THRESHOLD FAILED: {category}/{metric} change={change}% allowed={format_interval(*target_range)}") + if promise_broken: + print(f"PROMISE BROKEN: {category}/{metric} value={value_current} required={target_formatted}") + + if threshold_failed or promise_broken: + outcome = get_outcome_message(threshold_failed, promise_broken) + failed_rows.append( + f"| {category}/{metric} | {value_baseline} | {value_current} | " + f"{target_formatted} | {change}% | {format_interval(*target_range)} | {outcome} |" + ) + + if failed_rows: + # Build failure report + logs_link = "" + if run_id: + repo = os.getenv('GITHUB_REPOSITORY', 'microsoft/DiskANN') + logs_link = 
f"https://github.com/{repo}/actions/runs/{run_id}" + + report = "### ❌ Benchmark Check Failed\n\n" + if logs_link: + report += f"Please investigate the [workflow logs]({logs_link}) to determine if the failure is due to your changes.\n\n" + + report += "| Metric | Baseline | Current | Contract | Change | Allowed | Outcome |\n" + report += "|--------|----------|---------|----------|--------|---------|--------|\n" + report += "\n".join(failed_rows) + + return True, report + + return False, "" + + +# ============================================================================= +# GitHub Integration +# ============================================================================= + +def post_github_pr_comment(comment: str) -> bool: + """ + Post a comment to a GitHub pull request. + + Requires environment variables: + GITHUB_TOKEN: Personal access token or GitHub Actions token + GITHUB_REPOSITORY: Owner/repo format + GITHUB_PR_NUMBER: Pull request number + """ + if not HAS_REQUESTS: + print("WARNING: 'requests' module not available, cannot post PR comment") + return False + + token = os.getenv('GITHUB_TOKEN') + repo = os.getenv('GITHUB_REPOSITORY') + pr_number = os.getenv('GITHUB_PR_NUMBER') + + if not all([token, repo, pr_number]): + print("WARNING: Missing GitHub environment variables for PR comment") + print(f" GITHUB_TOKEN: {'set' if token else 'missing'}") + print(f" GITHUB_REPOSITORY: {repo or 'missing'}") + print(f" GITHUB_PR_NUMBER: {pr_number or 'missing'}") + return False + + url = f"https://api.github.com/repos/{repo}/issues/{pr_number}/comments" + headers = { + "Accept": "application/vnd.github+json", + "Authorization": f"Bearer {token}", + "X-GitHub-Api-Version": "2022-11-28" + } + body = {"body": comment} + + try: + response = requests.post(url, headers=headers, json=body, timeout=30) + response.raise_for_status() + print(f"Successfully posted comment to PR #{pr_number}") + return True + except requests.RequestException as e: + print(f"ERROR: Failed to post 
PR comment: {e}") + return False + + +def write_github_step_summary(content: str) -> None: + """Write content to GitHub Actions step summary.""" + summary_file = os.getenv('GITHUB_STEP_SUMMARY') + if summary_file: + with open(summary_file, 'a', encoding='utf-8') as f: + f.write(content) + f.write("\n") + + +def write_github_output(name: str, value: str) -> None: + """Write an output variable for GitHub Actions.""" + output_file = os.getenv('GITHUB_OUTPUT') + if output_file: + with open(output_file, 'a', encoding='utf-8') as f: + f.write(f"{name}={value}\n") + + +# ============================================================================= +# Main +# ============================================================================= + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description='Parse benchmark results and validate against thresholds.', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Check PR benchmark results + python benchmark_result_parse.py --mode pr --file results_change.csv + + # Check A/A test results (symmetric thresholds) + python benchmark_result_parse.py --mode aa --file results_change.csv + + # Check search-only benchmarks + python benchmark_result_parse.py --mode pr --file results_change.csv --data search + """ + ) + parser.add_argument( + '--mode', + type=str, + default='aa', + choices=['aa', 'pr', 'lkg'], + help='Benchmark mode: aa=A/A test (symmetric), pr=PR test (directional), lkg=last known good' + ) + parser.add_argument( + '--data', + type=str, + default='both', + choices=['both', 'search'], + help='Type of benchmark data: both=full benchmark, search=search-only' + ) + parser.add_argument( + '--file', + type=str, + default=None, + help='Path to CSV file (overrides FILE_PATH env var)' + ) + parser.add_argument( + '--no-comment', + action='store_true', + help='Skip posting PR comment even in pr mode' + ) + return parser.parse_args() + + +def main() -> int: + args = 
parse_args() + + # Get file path + file_path = args.file or os.getenv('FILE_PATH') + if not file_path: + print("ERROR: No input file specified. Use --file or set FILE_PATH env var.") + return 1 + + if not os.path.exists(file_path): + print(f"ERROR: File not found: {file_path}") + return 1 + + print(f"Benchmark mode: {args.mode}") + print(f"Data type: {args.data}") + print(f"Input file: {file_path}") + + # Parse CSV + data_template = get_data_template(args.data) + data = parse_csv(file_path, data_template) + + # Debug output + print("\nParsed data:") + print(json.dumps({k: {sk: sv for sk, sv in v.items() if sv} for k, v in data.items() if any(v.values())}, indent=2)) + + # Check thresholds + run_id = os.getenv('GITHUB_RUN_ID') + has_failures, report = check_thresholds(data, DATA_THRESHOLDS, args.mode, run_id) + + if has_failures: + print("\n" + report) + + # Write to GitHub step summary + write_github_step_summary(report) + + # Post PR comment if in pr mode + if args.mode == 'pr' and not args.no_comment: + post_github_pr_comment(report) + + # Set output for downstream steps + write_github_output('benchmark_failed', 'true') + + return 1 + + print("\n✅ All benchmark values passed!") + write_github_step_summary("### ✅ Benchmark Check Passed\n\nAll metrics within acceptable thresholds.") + write_github_output('benchmark_failed', 'false') + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/.github/scripts/compare_disk_index_json_output.py b/.github/scripts/compare_disk_index_json_output.py new file mode 100644 index 000000000..e3fa5afce --- /dev/null +++ b/.github/scripts/compare_disk_index_json_output.py @@ -0,0 +1,258 @@ +#!/usr/bin/env python3 +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT license. + +""" +Compare two disk-index benchmark JSON files and emit a diff CSV. 
#!/usr/bin/env python3
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license.

"""
Compare two disk-index benchmark JSON files and emit a diff CSV.

This script takes baseline and branch (target) JSON files from the benchmark crate's
disk-index benchmarks and produces a CSV file comparing the metrics with deviation percentages.

Every CSV row is self-contained and has six columns:
    Parent Span Name, Span Name, Stat Key, Stat Value (Target), Stat Value (Baseline), Deviation (%)
where "Parent Span Name" is the metric category (e.g. "index-build statistics" or
"search-with-L=<L>-bw=<bw>"), "Span Name" is a human-readable label and "Stat Key" is the
machine-readable metric name consumed by benchmark_result_parse.py.

Migrated from ADO: .pipelines/templates/compare_disk_index_json_output.py

Usage:
    python compare_disk_index_json_output.py \\
        --baseline baseline/target/tmp/_benchmark_crate_baseline.json \\
        --branch diskann_rust/target/tmp/_benchmark_crate_target.json \\
        --out diskann_rust/target/tmp/_change.csv
"""

import json
import csv
import argparse
from typing import List, Dict, Any, Optional


# Build span name -> key under which its duration is reported.
_BUILD_SPAN_KEYS = {
    "DiskIndexBuild-PqConstruction": "pq_construction_time",
    "DiskIndexBuild-InmemIndexBuild": "inmem_index_build_time",
    "DiskIndexBuild-DiskLayout": "disk_layout_time",
    "disk-index-build": "total_build_duration",
}

# (output key, field name in a "search_results_per_l" entry)
_SEARCH_RESULT_FIELDS = (
    ("qps", "qps"),
    ("recall", "recall"),
    ("mean_latency", "mean_latency"),
    ("mean_ios", "mean_ios"),
    ("mean_comps", "mean_comparisons"),
    ("mean_hops", "mean_hops"),
    ("mean_io_time", "mean_io_time"),
    ("mean_cpus", "mean_cpu_time"),
    # NOTE(review): the "latency_95" key is filled from the p99.9 latency, so
    # anything reading it as a 95th percentile is misled. Both sides use the
    # same field, so the deviation is still meaningful -- confirm whether the
    # benchmark crate exposes a real p95 and switch to it if so.
    ("latency_95", "p999_latency"),
)

# Keys that a matching search span may override (same name on both sides).
_SPAN_OVERRIDE_KEYS = (
    "qps",
    "recall",
    "mean_latency",
    "mean_ios",
    "mean_comps",
    "mean_hops",
    "mean_io_time",
    "mean_cpus",
)

_DEFAULT_SEARCH_L = 2000
_DEFAULT_BEAM_WIDTH = 4


def load_json(path: str) -> List[Dict[str, Any]]:
    """Load a JSON file and return the parsed content (expected: a list of job results)."""
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def calc_deviation(baseline: float, target: float) -> str:
    """
    Calculate the percentage deviation from baseline to target.

    Returns:
        The deviation formatted with two decimals (e.g. "10.00"), or "" when
        it is undefined: baseline is 0, or either value is not numeric (for
        example a JSON null).
    """
    try:
        if baseline != 0:
            dev = ((target - baseline) / baseline) * 100
            return f"{dev:.2f}"
        return ""
    except (TypeError, ValueError, ArithmeticError):
        # Non-numeric input: report "no deviation" rather than abort the diff.
        return ""


def extract_build_metrics(results: Dict[str, Any]) -> Dict[str, Any]:
    """
    Extract build metrics from the results structure.

    Missing or null sections are treated as empty, so a search-only result
    yields ``{}``.

    Returns:
        Dict with "total_time" (seconds; present only for a non-zero
        "build_time") plus one duration entry per recognised build span.
    """
    if not results:
        return {}

    build = results.get("build") or {}
    if not build:
        return {}

    metrics = {}

    # Total build time (in seconds)
    build_time = build.get("build_time")
    if build_time:
        # build_time is in microseconds, convert to seconds
        metrics["total_time"] = build_time / 1e6

    # Extract span metrics; `or` guards against explicit JSON nulls, which
    # .get(key, default) would pass through as None.
    span_metrics = build.get("span_metrics") or {}
    for span in span_metrics.get("spans") or []:
        key = _BUILD_SPAN_KEYS.get(span.get("span_name", ""))
        if key is not None:
            span_data = span.get("metrics") or {}
            metrics[key] = span_data.get("duration_seconds", 0)

    return metrics


def extract_search_metrics(results: Dict[str, Any], search_l: int, beam_width: int) -> Dict[str, Any]:
    """
    Extract search metrics for a specific search_l value.

    Values come from the first "search_results_per_l" entry whose "search_l"
    matches (fields absent from that entry default to 0); a span named
    ``search-with-L=<search_l>-bw=<beam_width>`` then overrides any metric it
    carries.

    Returns:
        Dict of metric name -> value; ``{}`` when there is no search section.
    """
    if not results:
        return {}

    search = results.get("search") or {}
    if not search:
        return {}

    metrics = {}

    # Find the search result for the specified search_l
    for sr in search.get("search_results_per_l") or []:
        if sr.get("search_l") == search_l:
            for out_key, field in _SEARCH_RESULT_FIELDS:
                metrics[out_key] = sr.get(field, 0)
            break

    # Also try span metrics
    span_metrics = search.get("span_metrics") or {}
    search_span_name = f"search-with-L={search_l}-bw={beam_width}"
    for span in span_metrics.get("spans") or []:
        if span.get("span_name") == search_span_name:
            span_data = span.get("metrics") or {}
            # Override with span metrics if they exist
            for key in _SPAN_OVERRIDE_KEYS:
                if key in span_data:
                    metrics[key] = span_data[key]
            break

    return metrics


def _comparison_rows(
    category: str,
    labelled_keys: List[tuple],
    baseline_metrics: Dict[str, Any],
    target_metrics: Dict[str, Any],
) -> List[List[str]]:
    """Build one CSV row per key that is present on at least one side."""
    rows = []
    for key, display_name in labelled_keys:
        if key in target_metrics or key in baseline_metrics:
            # A metric missing on one side is reported as 0 for that side.
            target_val = target_metrics.get(key, 0)
            baseline_val = baseline_metrics.get(key, 0)
            rows.append([
                category,
                display_name,
                key,
                str(target_val),
                str(baseline_val),
                calc_deviation(baseline_val, target_val)
            ])
    return rows


def make_rows(baseline_list: List[Dict], target_list: List[Dict]) -> List[List[str]]:
    """
    Generate comparison rows for the CSV output.

    Jobs are paired by position; surplus entries in the longer list are
    ignored (main() rejects lists of different length up front). The search
    parameters are read from the target job's ``input.content.search_phase``
    and default to L=2000, beam width 4 when absent or null.
    """
    rows = []

    build_metrics = [
        ("total_time", "total build time (s)"),
        ("pq_construction_time", "PQ construction (s)"),
        ("inmem_index_build_time", "in-memory index build (s)"),
        ("disk_layout_time", "disk layout (s)"),
    ]
    search_metrics = [
        ("qps", "queries per second"),
        ("recall", "recall (%)"),
        ("mean_latency", "mean latency (μs)"),
        ("latency_95", "p999 latency (μs)"),
        ("mean_ios", "mean IOs"),
        ("mean_comps", "mean comparisons"),
        ("mean_hops", "mean hops"),
        ("mean_io_time", "mean IO time (μs)"),
        ("mean_cpus", "mean CPU time (μs)"),
    ]

    for baseline, target in zip(baseline_list, target_list):
        baseline_results = baseline.get("results") or {}
        target_results = target.get("results") or {}

        # Get input info for context; each level may be missing or null.
        inp = target.get("input") or {}
        content = inp.get("content") or {}
        search_phase = content.get("search_phase") or {}

        # Determine search_l and beam_width for search metrics
        search_list = search_phase.get("search_list") or [_DEFAULT_SEARCH_L]
        beam_width = search_phase.get("beam_width")
        if beam_width is None:
            beam_width = _DEFAULT_BEAM_WIDTH

        # Use the first (or primary) search_l value
        primary_search_l = search_list[0]

        rows.extend(_comparison_rows(
            "index-build statistics",
            build_metrics,
            extract_build_metrics(baseline_results),
            extract_build_metrics(target_results),
        ))

        rows.extend(_comparison_rows(
            f"search-with-L={primary_search_l}-bw={beam_width}",
            search_metrics,
            extract_search_metrics(baseline_results, primary_search_l, beam_width),
            extract_search_metrics(target_results, primary_search_l, beam_width),
        ))

    return rows


def write_csv(rows: List[List[str]], out_path: str):
    """Write the header followed by the comparison rows to a CSV file."""
    header = [
        "Parent Span Name",
        "Span Name",
        "Stat Key",
        "Stat Value (Target)",
        "Stat Value (Baseline)",
        "Deviation (%)"
    ]
    with open(out_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(rows)


def main():
    """
    Command-line entry point.

    Raises:
        ValueError: if either JSON file is not a top-level array, or the two
            arrays differ in length.
    """
    parser = argparse.ArgumentParser(
        description="Compare two disk-index benchmark JSONs and emit a diff CSV."
    )
    parser.add_argument("--baseline", "-b", required=True, help="Path to baseline JSON")
    parser.add_argument("--branch", "-r", required=True, help="Path to branch/target JSON")
    parser.add_argument("--out", "-o", required=True, help="Where to write output CSV")
    args = parser.parse_args()

    baseline_list = load_json(args.baseline)
    target_list = load_json(args.branch)

    # Fail with a clear message instead of an AttributeError deep in make_rows.
    for label, payload in (("baseline", baseline_list), ("branch", target_list)):
        if not isinstance(payload, list):
            raise ValueError(
                f"{label} JSON must be a top-level array of job results, got {type(payload).__name__}"
            )

    if len(baseline_list) != len(target_list):
        raise ValueError(
            f"baseline/branch JSON arrays differ in length: {len(baseline_list)} vs {len(target_list)}"
        )

    rows = make_rows(baseline_list, target_list)
    write_csv(rows, args.out)
    print(f"✓ Written diff CSV to {args.out}")


if __name__ == "__main__":
    main()
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license.

# DiskANN Benchmarks Workflow
# Migrated from ADO pipeline: .pipelines/DiskANN-Benchmarks.yml
#
# This workflow runs macro benchmarks comparing the current branch against a baseline.
# It is manually triggered and requires a baseline reference (branch, tag, or commit).

name: Benchmarks

on:
  workflow_dispatch:
    inputs:
      baseline_ref:
        description: 'A branch, commit SHA, or tag name to compare the current branch with'
        required: true
        default: 'main'
        type: string
      pr_number:
        # workflow_dispatch runs carry no pull_request payload, so the PR to
        # comment on has to be supplied explicitly. Leave empty to skip the comment.
        description: 'Optional pull request number that should receive the regression report'
        required: false
        default: ''
        type: string

# Cancel in-progress runs when a new run is triggered for the same branch and
# baseline. (github.event.pull_request is never populated for workflow_dispatch,
# so it cannot be part of the group key.)
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ inputs.baseline_ref }}
  cancel-in-progress: true

env:
  RUST_BACKTRACE: 1
  # Rust toolchain used for both checkouts; keep in sync with rust-toolchain.toml
  rust_stable: "1.92"

defaults:
  run:
    shell: bash

permissions:
  contents: read
  pull-requests: write  # Required for posting PR comments

jobs:
  # Macro benchmark: public Wikipedia-100K dataset.
  # The job id is kept from the ADO pipeline (Mimir Enron); the internal dataset
  # was replaced by Wikipedia-100K during the migration.
  macro-benchmark-mimir-enron:
    name: Macro Benchmark - Wikipedia-100K (replaces Mimir Enron)
    runs-on: ubuntu-latest
    # TODO: For production benchmarks, consider using a self-hosted runner with:
    # - NVMe storage for consistent I/O performance
    # - CPU pinning (taskset) for reduced variance
    # - Dedicated hardware to avoid noisy neighbor effects
    timeout-minutes: 120

    steps:
      - name: Checkout current branch
        uses: actions/checkout@v4
        with:
          path: diskann_rust
          lfs: true

      - name: Checkout baseline (${{ inputs.baseline_ref }})
        uses: actions/checkout@v4
        with:
          ref: ${{ inputs.baseline_ref }}
          path: baseline
          lfs: true

      - name: Install Rust ${{ env.rust_stable }}
        uses: dtolnay/rust-toolchain@master
        with:
          toolchain: ${{ env.rust_stable }}

      - name: Cache Rust dependencies (current)
        uses: Swatinem/rust-cache@v2
        with:
          workspaces: diskann_rust -> target
          key: benchmark-current

      - name: Cache Rust dependencies (baseline)
        uses: Swatinem/rust-cache@v2
        with:
          workspaces: baseline -> target
          key: benchmark-baseline

      - name: Install system dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y openssl libssl-dev pkg-config python3-pip
          pip install csvtomd numpy scipy

      # Download the public Wikipedia-100K dataset via big-ann-benchmarks
      # Dataset: 100K Cohere Wikipedia embeddings (768-dim, float32, cosine distance)
      # Source: https://github.com/harsha-simhadri/big-ann-benchmarks
      - name: Clone big-ann-benchmarks
        run: git clone --depth 1 https://github.com/harsha-simhadri/big-ann-benchmarks.git

      - name: Download wikipedia-100K dataset
        working-directory: big-ann-benchmarks
        run: python create_dataset.py --dataset wikipedia-100K

      - name: Copy dataset to benchmark directories
        run: |
          mkdir -p diskann_rust/target/tmp baseline/target/tmp
          cp -r big-ann-benchmarks/data/wikipedia_cohere diskann_rust/target/tmp/
          cp -r big-ann-benchmarks/data/wikipedia_cohere baseline/target/tmp/

      - name: Run baseline benchmark
        working-directory: baseline
        run: |
          # Note: For accurate benchmarks, consider using CPU pinning on self-hosted runners:
          #   sudo taskset -c 0,2,4,6 ionice -c 1 -n 0 cargo run ...
          # The benchmark config is always taken from the current branch so that
          # both sides run the same job, even when the baseline ref predates it.
          cargo run -p diskann-benchmark --features disk-index --release -- \
            run --input-file "$GITHUB_WORKSPACE/diskann_rust/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json" \
            --output-file target/tmp/wikipedia-100K_benchmark_crate_baseline.json

      - name: Run current branch benchmark
        working-directory: diskann_rust
        run: |
          cargo run -p diskann-benchmark --features disk-index --release -- \
            run --input-file diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json \
            --output-file target/tmp/wikipedia-100K_benchmark_crate_target.json

      # No continue-on-error here: every later step needs the CSV, so a failed
      # comparison must stop the job at the step that actually broke.
      - name: Generate diff stats (baseline vs target)
        run: |
          python diskann_rust/.github/scripts/compare_disk_index_json_output.py \
            --baseline baseline/target/tmp/wikipedia-100K_benchmark_crate_baseline.json \
            --branch diskann_rust/target/tmp/wikipedia-100K_benchmark_crate_target.json \
            --out diskann_rust/target/tmp/wikipedia-100K_change.csv

      - name: Convert results to Markdown
        working-directory: diskann_rust
        run: |
          csvtomd target/tmp/wikipedia-100K_change.csv > target/tmp/wikipedia-100K_change.md
          echo "### Benchmark Results: Wikipedia-100K Dataset" >> "$GITHUB_STEP_SUMMARY"
          cat target/tmp/wikipedia-100K_change.md >> "$GITHUB_STEP_SUMMARY"

      - name: Validate benchmark results
        working-directory: diskann_rust
        run: |
          python .github/scripts/benchmark_result_parse.py \
            --mode pr \
            --file target/tmp/wikipedia-100K_change.csv
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          GITHUB_REPOSITORY: ${{ github.repository }}
          GITHUB_PR_NUMBER: ${{ inputs.pr_number }}
          GITHUB_RUN_ID: ${{ github.run_id }}

      - name: Upload benchmark results
        uses: actions/upload-artifact@v4
        if: always()  # Upload even if validation fails
        with:
          name: benchmark-results-wikipedia-100K
          path: |
            diskann_rust/target/tmp/wikipedia-100K_change.csv
            diskann_rust/target/tmp/wikipedia-100K_change.md
            diskann_rust/target/tmp/wikipedia-100K_benchmark_crate_target.json
            baseline/target/tmp/wikipedia-100K_benchmark_crate_baseline.json
          retention-days: 30

  # Macro benchmark: placeholder for the ADO "OAI Large" job.
  # TODO: this job currently repeats the Wikipedia-100K run above step for step;
  # point it at a second public dataset or remove it.
  macro-benchmark-oai-large:
    name: Macro Benchmark - Wikipedia-100K (replaces OAI Large)
    runs-on: ubuntu-latest
    # TODO: For production benchmarks, consider using a self-hosted runner
    timeout-minutes: 120

    steps:
      - name: Checkout current branch
        uses: actions/checkout@v4
        with:
          path: diskann_rust
          lfs: true

      - name: Checkout baseline (${{ inputs.baseline_ref }})
        uses: actions/checkout@v4
        with:
          ref: ${{ inputs.baseline_ref }}
          path: baseline
          lfs: true

      - name: Install Rust ${{ env.rust_stable }}
        uses: dtolnay/rust-toolchain@master
        with:
          toolchain: ${{ env.rust_stable }}

      - name: Cache Rust dependencies (current)
        uses: Swatinem/rust-cache@v2
        with:
          workspaces: diskann_rust -> target
          key: benchmark-current

      - name: Cache Rust dependencies (baseline)
        uses: Swatinem/rust-cache@v2
        with:
          workspaces: baseline -> target
          key: benchmark-baseline

      - name: Install system dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y openssl libssl-dev pkg-config python3-pip
          pip install csvtomd numpy scipy

      # Download the public Wikipedia-100K dataset via big-ann-benchmarks
      # Dataset: 100K Cohere Wikipedia embeddings (768-dim, float32, cosine distance)
      # Source: https://github.com/harsha-simhadri/big-ann-benchmarks
      - name: Clone big-ann-benchmarks
        run: git clone --depth 1 https://github.com/harsha-simhadri/big-ann-benchmarks.git

      - name: Download wikipedia-100K dataset
        working-directory: big-ann-benchmarks
        run: python create_dataset.py --dataset wikipedia-100K

      - name: Copy dataset to benchmark directories
        run: |
          mkdir -p diskann_rust/target/tmp baseline/target/tmp
          cp -r big-ann-benchmarks/data/wikipedia_cohere diskann_rust/target/tmp/
          cp -r big-ann-benchmarks/data/wikipedia_cohere baseline/target/tmp/

      - name: Run baseline benchmark
        working-directory: baseline
        run: |
          # The benchmark config is always taken from the current branch so that
          # both sides run the same job, even when the baseline ref predates it.
          cargo run -p diskann-benchmark --features disk-index --release -- \
            run --input-file "$GITHUB_WORKSPACE/diskann_rust/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json" \
            --output-file target/tmp/wikipedia-100K_benchmark_crate_baseline.json

      - name: Run current branch benchmark
        working-directory: diskann_rust
        run: |
          cargo run -p diskann-benchmark --features disk-index --release -- \
            run --input-file diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json \
            --output-file target/tmp/wikipedia-100K_benchmark_crate_target.json

      # No continue-on-error here: every later step needs the CSV, so a failed
      # comparison must stop the job at the step that actually broke.
      - name: Generate diff stats (baseline vs target)
        run: |
          python diskann_rust/.github/scripts/compare_disk_index_json_output.py \
            --baseline baseline/target/tmp/wikipedia-100K_benchmark_crate_baseline.json \
            --branch diskann_rust/target/tmp/wikipedia-100K_benchmark_crate_target.json \
            --out diskann_rust/target/tmp/wikipedia-100K_change.csv

      - name: Convert results to Markdown
        working-directory: diskann_rust
        run: |
          csvtomd target/tmp/wikipedia-100K_change.csv > target/tmp/wikipedia-100K_change.md
          echo "### Benchmark Results: Wikipedia-100K Dataset" >> "$GITHUB_STEP_SUMMARY"
          cat target/tmp/wikipedia-100K_change.md >> "$GITHUB_STEP_SUMMARY"

      - name: Validate benchmark results
        working-directory: diskann_rust
        run: |
          python .github/scripts/benchmark_result_parse.py \
            --mode pr \
            --file target/tmp/wikipedia-100K_change.csv
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          GITHUB_REPOSITORY: ${{ github.repository }}
          GITHUB_PR_NUMBER: ${{ inputs.pr_number }}
          GITHUB_RUN_ID: ${{ github.run_id }}

      - name: Upload benchmark results
        uses: actions/upload-artifact@v4
        if: always()  # Upload even if validation fails
        with:
          name: benchmark-results-oai-wikipedia-100K
          path: |
            diskann_rust/target/tmp/wikipedia-100K_change.csv
            diskann_rust/target/tmp/wikipedia-100K_change.md
            diskann_rust/target/tmp/wikipedia-100K_benchmark_crate_target.json
            baseline/target/tmp/wikipedia-100K_benchmark_crate_baseline.json
          retention-days: 30

  # NOTE: IAI micro-benchmarks are temporarily disabled in the ADO pipeline
  # due to callgrind not running with Rust version `ms-1.86.0`.
+ # Uncomment when ready to enable: + # + # micro-benchmark-iai: + # name: Micro Benchmark - IAI + # runs-on: ubuntu-latest + # timeout-minutes: 120 + # + # steps: + # - name: Checkout current branch + # uses: actions/checkout@v4 + # with: + # path: diskann_rust + # + # - name: Checkout baseline (${{ inputs.baseline_ref }}) + # uses: actions/checkout@v4 + # with: + # ref: ${{ inputs.baseline_ref }} + # path: baseline + # + # - name: Install Rust ${{ env.rust_stable }} + # uses: dtolnay/rust-toolchain@master + # with: + # toolchain: ${{ env.rust_stable }} + # + # - name: Install valgrind and iai-callgrind-runner + # run: | + # sudo apt-get update + # sudo apt-get install -y valgrind + # cargo install --version 0.14.0 iai-callgrind-runner + # + # - name: Run baseline IAI benchmarks + # working-directory: baseline + # run: | + # cargo bench --bench bench_main_iai + # cargo bench --bench bench_main_vector_iai + # + # - name: Copy IAI baseline files + # run: | + # mkdir -p diskann_rust/target + # cp -R baseline/target/iai diskann_rust/target/ + # + # - name: Run current branch IAI benchmarks + # working-directory: diskann_rust + # run: | + # cargo bench --bench bench_main_iai + # cargo bench --bench bench_main_vector_iai diff --git a/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json b/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json new file mode 100644 index 000000000..1557a594a --- /dev/null +++ b/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json @@ -0,0 +1,40 @@ +{ + "search_directories": [ + "target/tmp" + ], + "jobs": [ + { + "type": "disk-index", + "content": { + "source": { + "disk-index-source": "Build", + "data_type": "float32", + "data": "wikipedia_cohere/wikipedia_base.bin.crop_nb_100000", + "distance": "cosine_normalized", + "dim": 768, + "max_degree": 32, + "l_build": 50, + "num_threads": 4, + "build_ram_limit_gb": 4.0, + "num_pq_chunks": 96, + "quantization_type": "FP", + "save_path": 
"wikipedia_100k_benchmark_index" + }, + "search_phase": { + "queries": "wikipedia_cohere/wikipedia_query.bin", + "groundtruth": "wikipedia_cohere/wikipedia-100K", + "search_list": [ + 100, + 200 + ], + "beam_width": 4, + "recall_at": 100, + "num_threads": 4, + "is_flat_search": false, + "distance": "cosine_normalized", + "vector_filters_file": null + } + } + } + ] +} From 221735309934177a7979590aee5b24b238fb0c3c Mon Sep 17 00:00:00 2001 From: "Yuanyuan Tian (from Dev Box)" Date: Fri, 20 Mar 2026 13:37:35 +0800 Subject: [PATCH 02/31] Fix openai-100K distance metric and add gitignore patterns --- .github/scripts/benchmark_result_parse.py | 115 +++++++++++++----- .../scripts/compare_disk_index_json_output.py | 2 - .github/workflows/benchmarks.yml | 57 +++++---- .../openai-100K-disk-index.json | 40 ++++++ 4 files changed, 155 insertions(+), 59 deletions(-) create mode 100644 diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json diff --git a/.github/scripts/benchmark_result_parse.py b/.github/scripts/benchmark_result_parse.py index 0308b2990..0bbede446 100644 --- a/.github/scripts/benchmark_result_parse.py +++ b/.github/scripts/benchmark_result_parse.py @@ -8,8 +8,6 @@ Parses benchmark CSV results and validates against thresholds. Posts comments to GitHub PRs when regressions are detected. 
-Migrated from ADO: .pipelines/templates/BenchmarkResultParse.py - Usage: python benchmark_result_parse.py --mode pr --file results.csv python benchmark_result_parse.py --mode aa --file results.csv --data search @@ -72,10 +70,30 @@ "mean_comps": [], "mean_hops": [], "recall": [] + }, + "search-with-L=100-bw=4": { + "latency_95": [], + "mean_latency": [], + "mean_io_time": [], + "mean_cpus": [], + "qps": [], + "mean_ios": [], + "mean_comps": [], + "mean_hops": [], + "recall": [] + }, + "search-with-L=200-bw=4": { + "latency_95": [], + "mean_latency": [], + "mean_io_time": [], + "mean_cpus": [], + "qps": [], + "mean_ios": [], + "mean_comps": [], + "mean_hops": [], + "recall": [] } } - -# Template for search-only benchmark data DATA_TEMPLATE_SEARCH = { "search_disk_index-search_completed": { "duration_seconds": [], @@ -94,6 +112,28 @@ "mean_comps": [], "mean_hops": [], "recall": [] + }, + "search-with-L=100-bw=4": { + "latency_95": [], + "mean_latency": [], + "mean_io_time": [], + "mean_cpus": [], + "qps": [], + "mean_ios": [], + "mean_comps": [], + "mean_hops": [], + "recall": [] + }, + "search-with-L=200-bw=4": { + "latency_95": [], + "mean_latency": [], + "mean_io_time": [], + "mean_cpus": [], + "qps": [], + "mean_ios": [], + "mean_comps": [], + "mean_hops": [], + "recall": [] } } @@ -136,6 +176,28 @@ "mean_comps": [1, 'LT', 50000], "mean_hops": [1, 'LT', ""], "recall": [1, 'GT', 95.1] + }, + "search-with-L=100-bw=4": { + "latency_95": [10, 'LT', ""], + "mean_latency": [10, 'LT', ""], + "mean_io_time": [10, 'LT', ""], + "mean_cpus": [10, 'LT', ""], + "qps": [10, 'GT', ""], + "mean_ios": [10, 'LT', ""], + "mean_comps": [10, 'LT', ""], + "mean_hops": [10, 'LT', ""], + "recall": [1, 'GT', ""] + }, + "search-with-L=200-bw=4": { + "latency_95": [10, 'LT', ""], + "mean_latency": [10, 'LT', ""], + "mean_io_time": [10, 'LT', ""], + "mean_cpus": [10, 'LT', ""], + "qps": [10, 'GT', ""], + "mean_ios": [10, 'LT', ""], + "mean_comps": [10, 'LT', ""], + "mean_hops": [10, 'LT', 
""], + "recall": [1, 'GT', ""] } } @@ -147,35 +209,32 @@ def parse_csv(file_path: str, data: dict[str, dict[str, list]]) -> dict[str, dict[str, list]]: """ Parse benchmark CSV file and populate data structure. - - CSV format expected: - Column 0: (unused) - Column 1: Category name (e.g., "search-with-L=2000-bw=4") - Column 2: Metric name (e.g., "qps") - Column 3: Current value - Column 4: Baseline value - Column 5: Change percentage + + CSV format produced by compare_disk_index_json_output.py: + Column 0: Parent Span Name (category, e.g. "index-build statistics") + Column 1: Span Name (display name, unused for matching) + Column 2: Stat Key (metric key, e.g. "qps") + Column 3: Stat Value (Target) + Column 4: Stat Value (Baseline) + Column 5: Deviation (%) """ with open(file_path, 'r', encoding='utf-8') as f: reader = csv.reader(f) next(reader) # Skip header row - - current_key = None + for row in reader: if len(row) < 6: continue - - # Column 1 contains category name (only set on first row of category) - if row[1]: - current_key = row[1] - elif current_key and current_key in data: - metric_name = row[2] - if metric_name in data[current_key]: - # Append: [current_value, baseline_value, change_percentage] - data[current_key][metric_name].append(row[3]) # current - data[current_key][metric_name].append(row[4]) # baseline - data[current_key][metric_name].append(row[5]) # change % - + + category = row[0].strip() + metric_name = row[2].strip() + + if category in data and metric_name in data[category]: + # Append: [current_value, baseline_value, change_percentage] + data[category][metric_name].append(row[3]) # target (current) + data[category][metric_name].append(row[4]) # baseline + data[category][metric_name].append(row[5]) # deviation % + return data @@ -286,8 +345,8 @@ def check_thresholds( values = data[category][metric] if not values: - print(f"ERROR: {category}/{metric} has no data") - return True, f"Missing data for {category}/{metric}" + # No data for this metric 
in the CSV — skip silently + continue # Parse values: [current, baseline, change%] try: diff --git a/.github/scripts/compare_disk_index_json_output.py b/.github/scripts/compare_disk_index_json_output.py index e3fa5afce..ca9c9d26b 100644 --- a/.github/scripts/compare_disk_index_json_output.py +++ b/.github/scripts/compare_disk_index_json_output.py @@ -11,8 +11,6 @@ The output format matches the CSV structure expected by benchmark_result_parse.py: Parent Span Name, Span Name, Stat Key, Stat Value (Target), Stat Value (Baseline), Deviation (%) -Migrated from ADO: .pipelines/templates/compare_disk_index_json_output.py - Usage: python compare_disk_index_json_output.py \\ --baseline baseline/target/tmp/_benchmark_crate_baseline.json \\ diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index daf181ac6..875c00834 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -2,7 +2,6 @@ # Licensed under the MIT license. # DiskANN Benchmarks Workflow -# Migrated from ADO pipeline: .pipelines/DiskANN-Benchmarks.yml # # This workflow runs macro benchmarks comparing the current branch against a baseline. # It is manually triggered and requires a baseline reference (branch, tag, or commit). 
@@ -37,9 +36,9 @@ permissions: pull-requests: write # Required for posting PR comments jobs: - # Macro benchmark: Mimir Enron dataset - macro-benchmark-mimir-enron: - name: Macro Benchmark - Mimir Enron + # Macro benchmark: Wikipedia-100K dataset + macro-benchmark-wikipedia-100K: + name: Macro Benchmark - Wikipedia 100K runs-on: ubuntu-latest # TODO: For production benchmarks, consider using a self-hosted runner with: # - NVMe storage for consistent I/O performance @@ -155,9 +154,9 @@ jobs: baseline/target/tmp/wikipedia-100K_benchmark_crate_baseline.json retention-days: 30 - # Macro benchmark: OAI Large dataset + # Macro benchmark: OpenAI ArXiv dataset macro-benchmark-oai-large: - name: Macro Benchmark - OAI Large + name: Macro Benchmark - OAI ArXiv 100K runs-on: ubuntu-latest # TODO: For production benchmarks, consider using a self-hosted runner timeout-minutes: 120 @@ -199,57 +198,57 @@ jobs: sudo apt-get install -y openssl libssl-dev pkg-config python3-pip pip install csvtomd numpy scipy - # Download the public Wikipedia-100K dataset via big-ann-benchmarks - # Dataset: 100K Cohere Wikipedia embeddings (768-dim, float32, cosine distance) + # Download the public OpenAI ArXiv 100K dataset via big-ann-benchmarks + # Dataset: 100K OpenAI embeddings of ArXiv papers (1536-dim, float32, euclidean distance) # Source: https://github.com/harsha-simhadri/big-ann-benchmarks - name: Clone big-ann-benchmarks run: git clone --depth 1 https://github.com/harsha-simhadri/big-ann-benchmarks.git - - name: Download wikipedia-100K dataset + - name: Download openai-100K dataset working-directory: big-ann-benchmarks - run: python create_dataset.py --dataset wikipedia-100K + run: python create_dataset.py --dataset openai-100K - name: Copy dataset to benchmark directories run: | mkdir -p diskann_rust/target/tmp baseline/target/tmp - cp -r big-ann-benchmarks/data/wikipedia_cohere diskann_rust/target/tmp/ - cp -r big-ann-benchmarks/data/wikipedia_cohere baseline/target/tmp/ + cp -r 
big-ann-benchmarks/data/OpenAIArXiv diskann_rust/target/tmp/ + cp -r big-ann-benchmarks/data/OpenAIArXiv baseline/target/tmp/ - name: Run baseline benchmark working-directory: baseline run: | cargo run -p diskann-benchmark --features disk-index --release -- \ - run --input-file diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json \ - --output-file target/tmp/wikipedia-100K_benchmark_crate_baseline.json + run --input-file diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json \ + --output-file target/tmp/openai-100K_benchmark_crate_baseline.json - name: Run current branch benchmark working-directory: diskann_rust run: | cargo run -p diskann-benchmark --features disk-index --release -- \ - run --input-file diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json \ - --output-file target/tmp/wikipedia-100K_benchmark_crate_target.json + run --input-file diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json \ + --output-file target/tmp/openai-100K_benchmark_crate_target.json - name: Generate diff stats (baseline vs target) continue-on-error: true run: | python diskann_rust/.github/scripts/compare_disk_index_json_output.py \ - --baseline baseline/target/tmp/wikipedia-100K_benchmark_crate_baseline.json \ - --branch diskann_rust/target/tmp/wikipedia-100K_benchmark_crate_target.json \ - --out diskann_rust/target/tmp/wikipedia-100K_change.csv + --baseline baseline/target/tmp/openai-100K_benchmark_crate_baseline.json \ + --branch diskann_rust/target/tmp/openai-100K_benchmark_crate_target.json \ + --out diskann_rust/target/tmp/openai-100K_change.csv - name: Convert results to Markdown working-directory: diskann_rust run: | - csvtomd target/tmp/wikipedia-100K_change.csv > target/tmp/wikipedia-100K_change.md - echo "### Benchmark Results: Wikipedia-100K Dataset" >> $GITHUB_STEP_SUMMARY - cat target/tmp/wikipedia-100K_change.md >> $GITHUB_STEP_SUMMARY + csvtomd target/tmp/openai-100K_change.csv > target/tmp/openai-100K_change.md + echo "### 
Benchmark Results: OpenAI ArXiv 100K Dataset" >> $GITHUB_STEP_SUMMARY + cat target/tmp/openai-100K_change.md >> $GITHUB_STEP_SUMMARY - name: Validate benchmark results working-directory: diskann_rust run: | python .github/scripts/benchmark_result_parse.py \ --mode pr \ - --file target/tmp/wikipedia-100K_change.csv + --file target/tmp/openai-100K_change.csv env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} GITHUB_REPOSITORY: ${{ github.repository }} @@ -260,15 +259,15 @@ jobs: uses: actions/upload-artifact@v4 if: always() # Upload even if validation fails with: - name: benchmark-results-oai-wikipedia-100K + name: benchmark-results-openai-100K path: | - diskann_rust/target/tmp/wikipedia-100K_change.csv - diskann_rust/target/tmp/wikipedia-100K_change.md - diskann_rust/target/tmp/wikipedia-100K_benchmark_crate_target.json - baseline/target/tmp/wikipedia-100K_benchmark_crate_baseline.json + diskann_rust/target/tmp/openai-100K_change.csv + diskann_rust/target/tmp/openai-100K_change.md + diskann_rust/target/tmp/openai-100K_benchmark_crate_target.json + baseline/target/tmp/openai-100K_benchmark_crate_baseline.json retention-days: 30 - # NOTE: IAI micro-benchmarks are temporarily disabled in the ADO pipeline + # NOTE: IAI micro-benchmarks are temporarily disabled # due to callgrind not running with Rust version `ms-1.86.0`. 
# Uncomment when ready to enable: # diff --git a/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json b/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json new file mode 100644 index 000000000..969724cae --- /dev/null +++ b/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json @@ -0,0 +1,40 @@ +{ + "search_directories": [ + "target/tmp" + ], + "jobs": [ + { + "type": "disk-index", + "content": { + "source": { + "disk-index-source": "Build", + "data_type": "float32", + "data": "OpenAIArXiv/openai_base.bin.crop_nb_100000", + "distance": "squared_l2", + "dim": 1536, + "max_degree": 32, + "l_build": 50, + "num_threads": 8, + "build_ram_limit_gb": 4.0, + "num_pq_chunks": 192, + "quantization_type": "FP", + "save_path": "openai_100k_benchmark_index" + }, + "search_phase": { + "queries": "OpenAIArXiv/openai_query.bin", + "groundtruth": "OpenAIArXiv/openai-100K", + "search_list": [ + 100, + 200 + ], + "beam_width": 4, + "recall_at": 100, + "num_threads": 4, + "is_flat_search": false, + "distance": "squared_l2", + "vector_filters_file": null + } + } + } + ] +} From 6b7b250e579999ce48a2db4aaebc13f3f1e7b7ec Mon Sep 17 00:00:00 2001 From: "Yuanyuan Tian (from Dev Box)" Date: Fri, 20 Mar 2026 13:47:49 +0800 Subject: [PATCH 03/31] Add push trigger to benchmarks workflow for pre-merge testing --- .github/workflows/benchmarks.yml | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index 875c00834..337560116 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -16,6 +16,14 @@ on: required: true default: 'main' type: string + push: + branches: + - 'user/tianyuanyuan/add-benchmark-pipeline' + paths: + - 'diskann-benchmark/perf_test_inputs/**-disk-index.json' + - '.github/workflows/benchmarks.yml' + - '.github/scripts/compare_disk_index_json_output.py' + - '.github/scripts/benchmark_result_parse.py' # Cancel in-progress 
runs when a new run is triggered concurrency: @@ -53,10 +61,10 @@ jobs: path: diskann_rust lfs: true - - name: Checkout baseline (${{ inputs.baseline_ref }}) + - name: Checkout baseline (${{ inputs.baseline_ref || 'main' }}) uses: actions/checkout@v4 with: - ref: ${{ inputs.baseline_ref }} + ref: ${{ inputs.baseline_ref || 'main' }} path: baseline lfs: true @@ -168,10 +176,10 @@ jobs: path: diskann_rust lfs: true - - name: Checkout baseline (${{ inputs.baseline_ref }}) + - name: Checkout baseline (${{ inputs.baseline_ref || 'main' }}) uses: actions/checkout@v4 with: - ref: ${{ inputs.baseline_ref }} + ref: ${{ inputs.baseline_ref || 'main' }} path: baseline lfs: true From d4a6abd68f097cbb562f34be3f41c977353bb87d Mon Sep 17 00:00:00 2001 From: "Yuanyuan Tian (from Dev Box)" Date: Fri, 20 Mar 2026 13:58:51 +0800 Subject: [PATCH 04/31] Fix baseline run: use input config from current branch checkout --- .github/workflows/benchmarks.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index 337560116..a4df8213f 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -113,7 +113,7 @@ jobs: # Note: For accurate benchmarks, consider using CPU pinning on self-hosted runners: # sudo taskset -c 0,2,4,6 ionice -c 1 -n 0 cargo run ... 
cargo run -p diskann-benchmark --features disk-index --release -- \ - run --input-file diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json \ + run --input-file ../diskann_rust/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json \ --output-file target/tmp/wikipedia-100K_benchmark_crate_baseline.json - name: Run current branch benchmark @@ -226,7 +226,7 @@ jobs: working-directory: baseline run: | cargo run -p diskann-benchmark --features disk-index --release -- \ - run --input-file diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json \ + run --input-file ../diskann_rust/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json \ --output-file target/tmp/openai-100K_benchmark_crate_baseline.json - name: Run current branch benchmark From c6d54a9b0a8b61864b6c79db9e57dc9889b5561a Mon Sep 17 00:00:00 2001 From: "Yuanyuan Tian (from Dev Box)" Date: Fri, 20 Mar 2026 14:17:29 +0800 Subject: [PATCH 05/31] Fix markdown conversion: replace broken csvtomd with inline Python --- .github/workflows/benchmarks.yml | 34 ++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index a4df8213f..0670af06e 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -89,7 +89,7 @@ jobs: run: | sudo apt-get update sudo apt-get install -y openssl libssl-dev pkg-config python3-pip - pip install csvtomd numpy scipy + pip install numpy scipy # Download the public Wikipedia-100K dataset via big-ann-benchmarks # Dataset: 100K Cohere Wikipedia embeddings (768-dim, float32, cosine distance) @@ -134,9 +134,18 @@ jobs: - name: Convert results to Markdown working-directory: diskann_rust run: | - csvtomd target/tmp/wikipedia-100K_change.csv > target/tmp/wikipedia-100K_change.md - echo "### Benchmark Results: Wikipedia-100K Dataset" >> $GITHUB_STEP_SUMMARY - cat target/tmp/wikipedia-100K_change.md >> $GITHUB_STEP_SUMMARY + 
python3 -c " + import csv, os + rows = list(csv.reader(open('target/tmp/wikipedia-100K_change.csv'))) + if len(rows) < 2: + print('No data'); exit(0) + header = rows[0] + sep = ['---'] * len(header) + md = '\n'.join(' | '.join(r) for r in [header, sep] + rows[1:]) + open('target/tmp/wikipedia-100K_change.md', 'w').write(md + '\n') + " + echo '### Benchmark Results: Wikipedia-100K Dataset' >> "$GITHUB_STEP_SUMMARY" + cat target/tmp/wikipedia-100K_change.md >> "$GITHUB_STEP_SUMMARY" - name: Validate benchmark results working-directory: diskann_rust @@ -204,7 +213,7 @@ jobs: run: | sudo apt-get update sudo apt-get install -y openssl libssl-dev pkg-config python3-pip - pip install csvtomd numpy scipy + pip install numpy scipy # Download the public OpenAI ArXiv 100K dataset via big-ann-benchmarks # Dataset: 100K OpenAI embeddings of ArXiv papers (1536-dim, float32, euclidean distance) @@ -247,9 +256,18 @@ jobs: - name: Convert results to Markdown working-directory: diskann_rust run: | - csvtomd target/tmp/openai-100K_change.csv > target/tmp/openai-100K_change.md - echo "### Benchmark Results: OpenAI ArXiv 100K Dataset" >> $GITHUB_STEP_SUMMARY - cat target/tmp/openai-100K_change.md >> $GITHUB_STEP_SUMMARY + python3 -c " + import csv, os + rows = list(csv.reader(open('target/tmp/openai-100K_change.csv'))) + if len(rows) < 2: + print('No data'); exit(0) + header = rows[0] + sep = ['---'] * len(header) + md = '\n'.join(' | '.join(r) for r in [header, sep] + rows[1:]) + open('target/tmp/openai-100K_change.md', 'w').write(md + '\n') + " + echo '### Benchmark Results: OpenAI ArXiv 100K Dataset' >> "$GITHUB_STEP_SUMMARY" + cat target/tmp/openai-100K_change.md >> "$GITHUB_STEP_SUMMARY" - name: Validate benchmark results working-directory: diskann_rust From 5775198a10473ae82b0864a7a168e3357f4d422d Mon Sep 17 00:00:00 2001 From: "Yuanyuan Tian (from Dev Box)" Date: Fri, 20 Mar 2026 16:51:59 +0800 Subject: [PATCH 06/31] Update benchmark configs: align build/search params to fix low 
recall --- .../perf_test_inputs/openai-100K-disk-index.json | 13 ++++++------- .../perf_test_inputs/wikipedia-100K-disk-index.json | 13 ++++++------- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json b/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json index 969724cae..9ae7e148b 100644 --- a/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json +++ b/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json @@ -12,11 +12,11 @@ "data": "OpenAIArXiv/openai_base.bin.crop_nb_100000", "distance": "squared_l2", "dim": 1536, - "max_degree": 32, - "l_build": 50, + "max_degree": 59, + "l_build": 64, "num_threads": 8, - "build_ram_limit_gb": 4.0, - "num_pq_chunks": 192, + "build_ram_limit_gb": 10.0, + "num_pq_chunks": 384, "quantization_type": "FP", "save_path": "openai_100k_benchmark_index" }, @@ -24,11 +24,10 @@ "queries": "OpenAIArXiv/openai_query.bin", "groundtruth": "OpenAIArXiv/openai-100K", "search_list": [ - 100, - 200 + 2000 ], "beam_width": 4, - "recall_at": 100, + "recall_at": 1000, "num_threads": 4, "is_flat_search": false, "distance": "squared_l2", diff --git a/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json b/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json index 1557a594a..7deaf788e 100644 --- a/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json +++ b/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json @@ -12,11 +12,11 @@ "data": "wikipedia_cohere/wikipedia_base.bin.crop_nb_100000", "distance": "cosine_normalized", "dim": 768, - "max_degree": 32, - "l_build": 50, + "max_degree": 59, + "l_build": 72, "num_threads": 4, - "build_ram_limit_gb": 4.0, - "num_pq_chunks": 96, + "build_ram_limit_gb": 10.0, + "num_pq_chunks": 192, "quantization_type": "FP", "save_path": "wikipedia_100k_benchmark_index" }, @@ -24,11 +24,10 @@ "queries": "wikipedia_cohere/wikipedia_query.bin", "groundtruth": 
"wikipedia_cohere/wikipedia-100K", "search_list": [ - 100, - 200 + 2000 ], "beam_width": 4, - "recall_at": 100, + "recall_at": 1000, "num_threads": 4, "is_flat_search": false, "distance": "cosine_normalized", From 7ac841e952a3355f3f40d738af5b46b23372162a Mon Sep 17 00:00:00 2001 From: "Yuanyuan Tian (from Dev Box)" Date: Mon, 23 Mar 2026 13:13:34 +0800 Subject: [PATCH 07/31] Fix recall_at: set to 100 to match groundtruth file K=100 --- diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json | 2 +- .../perf_test_inputs/wikipedia-100K-disk-index.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json b/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json index 9ae7e148b..93c1358ba 100644 --- a/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json +++ b/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json @@ -27,7 +27,7 @@ 2000 ], "beam_width": 4, - "recall_at": 1000, + "recall_at": 100, "num_threads": 4, "is_flat_search": false, "distance": "squared_l2", diff --git a/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json b/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json index 7deaf788e..1c0af41b7 100644 --- a/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json +++ b/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json @@ -27,7 +27,7 @@ 2000 ], "beam_width": 4, - "recall_at": 1000, + "recall_at": 100, "num_threads": 4, "is_flat_search": false, "distance": "cosine_normalized", From 7d5a72eddfd94965a082b50672f285971025f908 Mon Sep 17 00:00:00 2001 From: "Yuanyuan Tian (from Dev Box)" Date: Mon, 23 Mar 2026 13:54:31 +0800 Subject: [PATCH 08/31] Remove stale absolute contracts (qps/recall/total_time): calibrated for ADO mimir-enron, not applicable to public datasets on GitHub runners. Threshold calibration tracked in PBI. 
--- .github/scripts/benchmark_result_parse.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/.github/scripts/benchmark_result_parse.py b/.github/scripts/benchmark_result_parse.py index 0bbede446..ab3549dc1 100644 --- a/.github/scripts/benchmark_result_parse.py +++ b/.github/scripts/benchmark_result_parse.py @@ -162,7 +162,8 @@ "total_duration_seconds": [10, 'LT', ""], }, "index-build statistics": { - "total_time": [10, 'LT', 1206], + # total_time contract TBD: requires baseline run on target hardware (see PBI: threshold calibration) + "total_time": [10, 'LT', ""], "total_comparisons": [1, 'LT', ""], "search_hops": [1, 'LT', ""] }, @@ -171,11 +172,14 @@ "mean_latency": [10, 'LT', ""], "mean_io_time": [10, 'LT', ""], "mean_cpus": [10, 'LT', ""], - "qps": [10, 'GT', 29], - "mean_ios": [1, 'LT', 2026], - "mean_comps": [1, 'LT', 50000], + # qps/recall/mean_ios/mean_comps contracts TBD: prior values were calibrated for + # internal mimir-enron 1M-vector dataset on production hardware, not applicable here. + # See PBI: define alert thresholds for public dataset benchmarks. 
+ "qps": [10, 'GT', ""], + "mean_ios": [1, 'LT', ""], + "mean_comps": [1, 'LT', ""], "mean_hops": [1, 'LT', ""], - "recall": [1, 'GT', 95.1] + "recall": [1, 'GT', ""] }, "search-with-L=100-bw=4": { "latency_95": [10, 'LT', ""], From c294c16c59a85c1ab6a234aae12f0c5837dccced Mon Sep 17 00:00:00 2001 From: "Yuanyuan Tian (from Dev Box)" Date: Mon, 23 Mar 2026 13:55:32 +0800 Subject: [PATCH 09/31] remove comments --- .github/scripts/benchmark_result_parse.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/scripts/benchmark_result_parse.py b/.github/scripts/benchmark_result_parse.py index ab3549dc1..600a66f35 100644 --- a/.github/scripts/benchmark_result_parse.py +++ b/.github/scripts/benchmark_result_parse.py @@ -162,7 +162,6 @@ "total_duration_seconds": [10, 'LT', ""], }, "index-build statistics": { - # total_time contract TBD: requires baseline run on target hardware (see PBI: threshold calibration) "total_time": [10, 'LT', ""], "total_comparisons": [1, 'LT', ""], "search_hops": [1, 'LT', ""] @@ -172,9 +171,6 @@ "mean_latency": [10, 'LT', ""], "mean_io_time": [10, 'LT', ""], "mean_cpus": [10, 'LT', ""], - # qps/recall/mean_ios/mean_comps contracts TBD: prior values were calibrated for - # internal mimir-enron 1M-vector dataset on production hardware, not applicable here. - # See PBI: define alert thresholds for public dataset benchmarks. 
"qps": [10, 'GT', ""], "mean_ios": [1, 'LT', ""], "mean_comps": [1, 'LT', ""], From b8e1d6deb484799a4bc14daa4ddf49cde96f2fe8 Mon Sep 17 00:00:00 2001 From: "Yuanyuan Tian (from Dev Box)" Date: Mon, 23 Mar 2026 14:47:43 +0800 Subject: [PATCH 10/31] Fix build_ram_limit_gb: reduce 10->4 to fit GitHub runner RAM (7GB limit) --- diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json | 2 +- .../perf_test_inputs/wikipedia-100K-disk-index.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json b/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json index 93c1358ba..940269195 100644 --- a/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json +++ b/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json @@ -15,7 +15,7 @@ "max_degree": 59, "l_build": 64, "num_threads": 8, - "build_ram_limit_gb": 10.0, + "build_ram_limit_gb": 4.0, "num_pq_chunks": 384, "quantization_type": "FP", "save_path": "openai_100k_benchmark_index" diff --git a/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json b/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json index 1c0af41b7..d15026e63 100644 --- a/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json +++ b/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json @@ -15,7 +15,7 @@ "max_degree": 59, "l_build": 72, "num_threads": 4, - "build_ram_limit_gb": 10.0, + "build_ram_limit_gb": 4.0, "num_pq_chunks": 192, "quantization_type": "FP", "save_path": "wikipedia_100k_benchmark_index" From 137cae094ef7c3f039de29d7f60bd62f57ec1ae3 Mon Sep 17 00:00:00 2001 From: "Yuanyuan Tian (from Dev Box)" Date: Mon, 23 Mar 2026 15:21:22 +0800 Subject: [PATCH 11/31] Fix wikipedia distance: cosine_normalized->cosine (vectors are not L2-normalized, metric is inner product) --- .../perf_test_inputs/wikipedia-100K-disk-index.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json b/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json index d15026e63..c4131e720 100644 --- a/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json +++ b/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json @@ -10,7 +10,7 @@ "disk-index-source": "Build", "data_type": "float32", "data": "wikipedia_cohere/wikipedia_base.bin.crop_nb_100000", - "distance": "cosine_normalized", + "distance": "cosine", "dim": 768, "max_degree": 59, "l_build": 72, @@ -30,7 +30,7 @@ "recall_at": 100, "num_threads": 4, "is_flat_search": false, - "distance": "cosine_normalized", + "distance": "cosine", "vector_filters_file": null } } From 96b63a8779a3d296af080ce16f86151aff5dae15 Mon Sep 17 00:00:00 2001 From: "Yuanyuan Tian (from Dev Box)" Date: Mon, 23 Mar 2026 15:51:15 +0800 Subject: [PATCH 12/31] Fix wikipedia distance: cosine->inner_product (groundtruth uses raw ip, not cosine similarity) --- .../perf_test_inputs/wikipedia-100K-disk-index.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json b/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json index c4131e720..6a52b1e32 100644 --- a/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json +++ b/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json @@ -10,7 +10,7 @@ "disk-index-source": "Build", "data_type": "float32", "data": "wikipedia_cohere/wikipedia_base.bin.crop_nb_100000", - "distance": "cosine", + "distance": "inner_product", "dim": 768, "max_degree": 59, "l_build": 72, @@ -30,7 +30,7 @@ "recall_at": 100, "num_threads": 4, "is_flat_search": false, - "distance": "cosine", + "distance": "inner_product", "vector_filters_file": null } } From 1b4cfc87dfe19c693a5155b20bee92c351fdd016 Mon Sep 17 00:00:00 2001 From: "Yuanyuan Tian (from Dev Box)" Date: Tue, 24 Mar 2026 15:09:03 +0800 Subject: [PATCH 13/31] Align 
build threads --- diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json | 2 +- .../perf_test_inputs/wikipedia-100K-disk-index.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json b/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json index 940269195..b9f3e195d 100644 --- a/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json +++ b/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json @@ -14,7 +14,7 @@ "dim": 1536, "max_degree": 59, "l_build": 64, - "num_threads": 8, + "num_threads": 1, "build_ram_limit_gb": 4.0, "num_pq_chunks": 384, "quantization_type": "FP", diff --git a/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json b/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json index 6a52b1e32..5093eaf4d 100644 --- a/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json +++ b/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json @@ -14,7 +14,7 @@ "dim": 768, "max_degree": 59, "l_build": 72, - "num_threads": 4, + "num_threads": 1, "build_ram_limit_gb": 4.0, "num_pq_chunks": 192, "quantization_type": "FP", From 5e6a6e0fb5e5897621b0ae3566d09b7569a25381 Mon Sep 17 00:00:00 2001 From: "Yuanyuan Tian (from Dev Box)" Date: Tue, 24 Mar 2026 15:51:14 +0800 Subject: [PATCH 14/31] Speed up benchmarks: build threads 1->4, openai pq_chunks 384->192 --- .../perf_test_inputs/openai-100K-disk-index.json | 4 ++-- .../perf_test_inputs/wikipedia-100K-disk-index.json | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json b/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json index b9f3e195d..3723d66b6 100644 --- a/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json +++ b/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json @@ -14,9 +14,9 @@ "dim": 1536, "max_degree": 59, "l_build": 64, - "num_threads": 1, + 
"num_threads": 4, "build_ram_limit_gb": 4.0, - "num_pq_chunks": 384, + "num_pq_chunks": 192, "quantization_type": "FP", "save_path": "openai_100k_benchmark_index" }, diff --git a/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json b/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json index 5093eaf4d..6a52b1e32 100644 --- a/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json +++ b/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json @@ -14,7 +14,7 @@ "dim": 768, "max_degree": 59, "l_build": 72, - "num_threads": 1, + "num_threads": 4, "build_ram_limit_gb": 4.0, "num_pq_chunks": 192, "quantization_type": "FP", From 59d25b966e465a3fda520ca1d2792fdbf8d323b8 Mon Sep 17 00:00:00 2001 From: "Yuanyuan Tian (from Dev Box)" Date: Tue, 24 Mar 2026 16:24:44 +0800 Subject: [PATCH 15/31] Temp: disable concurrency cancellation for A/A batch testing --- .github/workflows/benchmarks.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index 0670af06e..fc0de7c07 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -27,8 +27,10 @@ on: # Cancel in-progress runs when a new run is triggered concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }} - cancel-in-progress: true + # Use unique group per run for A/A testing (allows parallel runs). 
+ # For production, change back to: github.event.pull_request.number || github.sha + group: ${{ github.workflow }}-${{ github.run_id }} + cancel-in-progress: false env: RUST_BACKTRACE: 1 From 751e7756ec368963dbb3f7edec996e05c39e0f90 Mon Sep 17 00:00:00 2001 From: "Yuanyuan Tian (from Dev Box)" Date: Thu, 26 Mar 2026 13:56:35 +0800 Subject: [PATCH 16/31] revert A/A test settings, update OpenAI config to SQ_1_2.0 --- .github/workflows/benchmarks.yml | 10 ++++------ .../perf_test_inputs/openai-100K-disk-index.json | 6 +++--- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index fc0de7c07..aa659dd09 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -27,10 +27,8 @@ on: # Cancel in-progress runs when a new run is triggered concurrency: - # Use unique group per run for A/A testing (allows parallel runs). - # For production, change back to: github.event.pull_request.number || github.sha - group: ${{ github.workflow }}-${{ github.run_id }} - cancel-in-progress: false + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }} + cancel-in-progress: true env: RUST_BACKTRACE: 1 @@ -91,7 +89,7 @@ jobs: run: | sudo apt-get update sudo apt-get install -y openssl libssl-dev pkg-config python3-pip - pip install numpy scipy + pip install csvtomd numpy scipy # Download the public Wikipedia-100K dataset via big-ann-benchmarks # Dataset: 100K Cohere Wikipedia embeddings (768-dim, float32, cosine distance) @@ -215,7 +213,7 @@ jobs: run: | sudo apt-get update sudo apt-get install -y openssl libssl-dev pkg-config python3-pip - pip install numpy scipy + pip install csvtomd numpy scipy # Download the public OpenAI ArXiv 100K dataset via big-ann-benchmarks # Dataset: 100K OpenAI embeddings of ArXiv papers (1536-dim, float32, euclidean distance) diff --git a/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json 
b/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json index 3723d66b6..3a2a1d9e2 100644 --- a/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json +++ b/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json @@ -13,11 +13,11 @@ "distance": "squared_l2", "dim": 1536, "max_degree": 59, - "l_build": 64, + "l_build": 80, "num_threads": 4, "build_ram_limit_gb": 4.0, - "num_pq_chunks": 192, - "quantization_type": "FP", + "num_pq_chunks": 384, + "quantization_type": "SQ_1_2.0", "save_path": "openai_100k_benchmark_index" }, "search_phase": { From 2c4d2353096b996aef9b92c3abff0dfc845532bb Mon Sep 17 00:00:00 2001 From: "Yuanyuan Tian (from Dev Box)" Date: Fri, 27 Mar 2026 11:09:16 +0800 Subject: [PATCH 17/31] Remove micro-benchmark-iai comments --- .github/workflows/benchmarks.yml | 53 +------------------------------- 1 file changed, 1 insertion(+), 52 deletions(-) diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index aa659dd09..324bb1cda 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -124,7 +124,6 @@ jobs: --output-file target/tmp/wikipedia-100K_benchmark_crate_target.json - name: Generate diff stats (baseline vs target) - continue-on-error: true run: | python diskann_rust/.github/scripts/compare_disk_index_json_output.py \ --baseline baseline/target/tmp/wikipedia-100K_benchmark_crate_baseline.json \ @@ -246,7 +245,6 @@ jobs: --output-file target/tmp/openai-100K_benchmark_crate_target.json - name: Generate diff stats (baseline vs target) - continue-on-error: true run: | python diskann_rust/.github/scripts/compare_disk_index_json_output.py \ --baseline baseline/target/tmp/openai-100K_benchmark_crate_baseline.json \ @@ -291,53 +289,4 @@ jobs: diskann_rust/target/tmp/openai-100K_change.md diskann_rust/target/tmp/openai-100K_benchmark_crate_target.json baseline/target/tmp/openai-100K_benchmark_crate_baseline.json - retention-days: 30 - - # NOTE: IAI micro-benchmarks are 
temporarily disabled - # due to callgrind not running with Rust version `ms-1.86.0`. - # Uncomment when ready to enable: - # - # micro-benchmark-iai: - # name: Micro Benchmark - IAI - # runs-on: ubuntu-latest - # timeout-minutes: 120 - # - # steps: - # - name: Checkout current branch - # uses: actions/checkout@v4 - # with: - # path: diskann_rust - # - # - name: Checkout baseline (${{ inputs.baseline_ref }}) - # uses: actions/checkout@v4 - # with: - # ref: ${{ inputs.baseline_ref }} - # path: baseline - # - # - name: Install Rust ${{ env.rust_stable }} - # uses: dtolnay/rust-toolchain@master - # with: - # toolchain: ${{ env.rust_stable }} - # - # - name: Install valgrind and iai-callgrind-runner - # run: | - # sudo apt-get update - # sudo apt-get install -y valgrind - # cargo install --version 0.14.0 iai-callgrind-runner - # - # - name: Run baseline IAI benchmarks - # working-directory: baseline - # run: | - # cargo bench --bench bench_main_iai - # cargo bench --bench bench_main_vector_iai - # - # - name: Copy IAI baseline files - # run: | - # mkdir -p diskann_rust/target - # cp -R baseline/target/iai diskann_rust/target/ - # - # - name: Run current branch IAI benchmarks - # working-directory: diskann_rust - # run: | - # cargo bench --bench bench_main_iai - # cargo bench --bench bench_main_vector_iai + retention-days: 30 \ No newline at end of file From f58e0846b4e9eda6126f6f12afccff0b483102aa Mon Sep 17 00:00:00 2001 From: "Yuanyuan Tian (from Dev Box)" Date: Fri, 27 Mar 2026 11:47:59 +0800 Subject: [PATCH 18/31] use GitHub Release assets for benchmark datasets --- .github/workflows/benchmarks.yml | 34 +++++++++++--------------------- 1 file changed, 12 insertions(+), 22 deletions(-) diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index 324bb1cda..a2fd6ad34 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -91,21 +91,16 @@ jobs: sudo apt-get install -y openssl libssl-dev pkg-config python3-pip 
pip install csvtomd numpy scipy - # Download the public Wikipedia-100K dataset via big-ann-benchmarks + # Download pre-packaged Wikipedia-100K dataset from GitHub Release # Dataset: 100K Cohere Wikipedia embeddings (768-dim, float32, cosine distance) - # Source: https://github.com/harsha-simhadri/big-ann-benchmarks - - name: Clone big-ann-benchmarks - run: git clone --depth 1 https://github.com/harsha-simhadri/big-ann-benchmarks.git - - name: Download wikipedia-100K dataset - working-directory: big-ann-benchmarks - run: python create_dataset.py --dataset wikipedia-100K - - - name: Copy dataset to benchmark directories + env: + GH_TOKEN: ${{ github.token }} run: | mkdir -p diskann_rust/target/tmp baseline/target/tmp - cp -r big-ann-benchmarks/data/wikipedia_cohere diskann_rust/target/tmp/ - cp -r big-ann-benchmarks/data/wikipedia_cohere baseline/target/tmp/ + gh release download benchmark-data-v1 --repo ${{ github.repository }} --pattern 'wikipedia-100K.tar.gz' --dir . + tar xzf wikipedia-100K.tar.gz -C diskann_rust/target/tmp/ + cp -r diskann_rust/target/tmp/wikipedia_cohere baseline/target/tmp/ - name: Run baseline benchmark working-directory: baseline @@ -214,21 +209,16 @@ jobs: sudo apt-get install -y openssl libssl-dev pkg-config python3-pip pip install csvtomd numpy scipy - # Download the public OpenAI ArXiv 100K dataset via big-ann-benchmarks + # Download pre-packaged OpenAI ArXiv 100K dataset from GitHub Release # Dataset: 100K OpenAI embeddings of ArXiv papers (1536-dim, float32, euclidean distance) - # Source: https://github.com/harsha-simhadri/big-ann-benchmarks - - name: Clone big-ann-benchmarks - run: git clone --depth 1 https://github.com/harsha-simhadri/big-ann-benchmarks.git - - name: Download openai-100K dataset - working-directory: big-ann-benchmarks - run: python create_dataset.py --dataset openai-100K - - - name: Copy dataset to benchmark directories + env: + GH_TOKEN: ${{ github.token }} run: | mkdir -p diskann_rust/target/tmp 
baseline/target/tmp - cp -r big-ann-benchmarks/data/OpenAIArXiv diskann_rust/target/tmp/ - cp -r big-ann-benchmarks/data/OpenAIArXiv baseline/target/tmp/ + gh release download benchmark-data-v1 --repo ${{ github.repository }} --pattern 'openai-100K.tar.gz' --dir . + tar xzf openai-100K.tar.gz -C diskann_rust/target/tmp/ + cp -r diskann_rust/target/tmp/OpenAIArXiv baseline/target/tmp/ - name: Run baseline benchmark working-directory: baseline From 0f5c277396413a3c79cd4e1e622d9d9134d3edff Mon Sep 17 00:00:00 2001 From: "Yuanyuan Tian (from Dev Box)" Date: Fri, 27 Mar 2026 14:09:01 +0800 Subject: [PATCH 19/31] extract csv-to-markdown into reusable script --- .github/scripts/csv_to_markdown.py | 50 ++++++++++++++++++++++++++++++ .github/workflows/benchmarks.yml | 32 +++++-------------- 2 files changed, 58 insertions(+), 24 deletions(-) create mode 100644 .github/scripts/csv_to_markdown.py diff --git a/.github/scripts/csv_to_markdown.py b/.github/scripts/csv_to_markdown.py new file mode 100644 index 000000000..885a20208 --- /dev/null +++ b/.github/scripts/csv_to_markdown.py @@ -0,0 +1,50 @@ +#!/usr/bin/env python3 +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT license. 
+ +"""Convert a CSV file to a Markdown table and optionally append to GitHub Step Summary.""" + +import argparse +import csv +import os +import sys + + +def csv_to_markdown(csv_path: str) -> str: + """Convert a CSV file to a Markdown table string.""" + with open(csv_path) as f: + rows = list(csv.reader(f)) + if len(rows) < 2: + return "" + header = rows[0] + sep = ["---"] * len(header) + return "\n".join(" | ".join(r) for r in [header, sep] + rows[1:]) + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--csv", required=True, help="Input CSV file path") + parser.add_argument("--md", required=True, help="Output Markdown file path") + parser.add_argument("--title", default="", help="Section title for GitHub Step Summary") + args = parser.parse_args() + + md = csv_to_markdown(args.csv) + if not md: + print("No data") + return 0 + + with open(args.md, "w") as f: + f.write(md + "\n") + + # Append to GitHub Step Summary if available + summary_path = os.environ.get("GITHUB_STEP_SUMMARY") + if summary_path and args.title: + with open(summary_path, "a") as f: + f.write(f"### {args.title}\n") + f.write(md + "\n") + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index a2fd6ad34..a5c5c3a03 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -128,18 +128,10 @@ jobs: - name: Convert results to Markdown working-directory: diskann_rust run: | - python3 -c " - import csv, os - rows = list(csv.reader(open('target/tmp/wikipedia-100K_change.csv'))) - if len(rows) < 2: - print('No data'); exit(0) - header = rows[0] - sep = ['---'] * len(header) - md = '\n'.join(' | '.join(r) for r in [header, sep] + rows[1:]) - open('target/tmp/wikipedia-100K_change.md', 'w').write(md + '\n') - " - echo '### Benchmark Results: Wikipedia-100K Dataset' >> "$GITHUB_STEP_SUMMARY" - cat target/tmp/wikipedia-100K_change.md >> 
"$GITHUB_STEP_SUMMARY" + python .github/scripts/csv_to_markdown.py \ + --csv target/tmp/wikipedia-100K_change.csv \ + --md target/tmp/wikipedia-100K_change.md \ + --title 'Benchmark Results: Wikipedia-100K Dataset' - name: Validate benchmark results working-directory: diskann_rust @@ -244,18 +236,10 @@ jobs: - name: Convert results to Markdown working-directory: diskann_rust run: | - python3 -c " - import csv, os - rows = list(csv.reader(open('target/tmp/openai-100K_change.csv'))) - if len(rows) < 2: - print('No data'); exit(0) - header = rows[0] - sep = ['---'] * len(header) - md = '\n'.join(' | '.join(r) for r in [header, sep] + rows[1:]) - open('target/tmp/openai-100K_change.md', 'w').write(md + '\n') - " - echo '### Benchmark Results: OpenAI ArXiv 100K Dataset' >> "$GITHUB_STEP_SUMMARY" - cat target/tmp/openai-100K_change.md >> "$GITHUB_STEP_SUMMARY" + python .github/scripts/csv_to_markdown.py \ + --csv target/tmp/openai-100K_change.csv \ + --md target/tmp/openai-100K_change.md \ + --title 'Benchmark Results: OpenAI ArXiv 100K Dataset' - name: Validate benchmark results working-directory: diskann_rust From b8cddb5a28e4593180c3cb53ea3d9ab9ca7887c9 Mon Sep 17 00:00:00 2001 From: "Yuanyuan Tian (from Dev Box)" Date: Fri, 27 Mar 2026 15:03:00 +0800 Subject: [PATCH 20/31] calibrate contract thresholds from GitHub runner data --- .github/scripts/benchmark_result_parse.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/.github/scripts/benchmark_result_parse.py b/.github/scripts/benchmark_result_parse.py index 600a66f35..c51036056 100644 --- a/.github/scripts/benchmark_result_parse.py +++ b/.github/scripts/benchmark_result_parse.py @@ -162,20 +162,28 @@ "total_duration_seconds": [10, 'LT', ""], }, "index-build statistics": { - "total_time": [10, 'LT', ""], + # Calibrated from 5 GitHub runner runs (10 observations): + # Wikipedia: 35.9–37.2s, OpenAI: 23.0–76.4s (SQ_1_2.0 variance) + # Contract: worst × 1.5 to absorb shared-runner 
variance + "total_time": [10, 'LT', 115], "total_comparisons": [1, 'LT', ""], "search_hops": [1, 'LT', ""] }, "search-with-L=2000-bw=4": { + # Calibrated from 5 GitHub runner runs (10 observations): + # QPS: 9.56–9.75 (both datasets) + # Recall: wiki 99.87%, oai 99.67–99.91% + # mean_ios: ~2007 (deterministic) + # mean_comps: wiki ~27609, oai 21618–24733 "latency_95": [10, 'LT', ""], "mean_latency": [10, 'LT', ""], "mean_io_time": [10, 'LT', ""], "mean_cpus": [10, 'LT', ""], - "qps": [10, 'GT', ""], - "mean_ios": [1, 'LT', ""], - "mean_comps": [1, 'LT', ""], + "qps": [10, 'GT', 6.5], + "mean_ios": [1, 'LT', 2410], + "mean_comps": [1, 'LT', 33200], "mean_hops": [1, 'LT', ""], - "recall": [1, 'GT', ""] + "recall": [1, 'GT', 98.0] }, "search-with-L=100-bw=4": { "latency_95": [10, 'LT', ""], From accdf2cca9bfe511c0dcef96babfd879aa6d3fba Mon Sep 17 00:00:00 2001 From: "Yuanyuan Tian (from Dev Box)" Date: Fri, 27 Mar 2026 15:10:12 +0800 Subject: [PATCH 21/31] add daily A/A benchmark stability test with failure notification --- .github/workflows/benchmarks-aa.yml | 276 ++++++++++++++++++++++++++++ 1 file changed, 276 insertions(+) create mode 100644 .github/workflows/benchmarks-aa.yml diff --git a/.github/workflows/benchmarks-aa.yml b/.github/workflows/benchmarks-aa.yml new file mode 100644 index 000000000..b272f7c69 --- /dev/null +++ b/.github/workflows/benchmarks-aa.yml @@ -0,0 +1,276 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT license. + +# DiskANN Daily A/A Benchmark Stability Test +# +# Runs main vs main at 9 AM UTC every day to detect environment noise. +# If any threshold is breached, a GitHub issue is created to notify @microsoft/diskann-admin. +# Can also be triggered manually for debugging. 
+ +name: Benchmarks (A/A) + +on: + schedule: + # Daily at 9 AM UTC + - cron: '0 9 * * *' + workflow_dispatch: # Allow manual trigger for debugging + +concurrency: + group: ${{ github.workflow }} + cancel-in-progress: true + +env: + RUST_BACKTRACE: 1 + rust_stable: "1.92" + +defaults: + run: + shell: bash + +permissions: + contents: read + issues: write # Required for creating failure notification issues + +jobs: + # A/A benchmark: Wikipedia-100K dataset (main vs main) + aa-wikipedia-100K: + name: A/A - Wikipedia 100K + runs-on: ubuntu-latest + timeout-minutes: 120 + + steps: + - name: Checkout main (target) + uses: actions/checkout@v4 + with: + ref: main + path: diskann_rust + lfs: true + + - name: Checkout main (baseline) + uses: actions/checkout@v4 + with: + ref: main + path: baseline + lfs: true + + - name: Install Rust ${{ env.rust_stable }} + uses: dtolnay/rust-toolchain@master + with: + toolchain: ${{ env.rust_stable }} + + - name: Cache Rust dependencies (target) + uses: Swatinem/rust-cache@v2 + with: + workspaces: diskann_rust -> target + key: aa-target + + - name: Cache Rust dependencies (baseline) + uses: Swatinem/rust-cache@v2 + with: + workspaces: baseline -> target + key: aa-baseline + + - name: Install system dependencies + run: | + sudo apt-get update + sudo apt-get install -y openssl libssl-dev pkg-config + + # Download pre-packaged Wikipedia-100K dataset from GitHub Release + - name: Download wikipedia-100K dataset + env: + GH_TOKEN: ${{ github.token }} + run: | + mkdir -p diskann_rust/target/tmp baseline/target/tmp + gh release download benchmark-data-v1 --repo ${{ github.repository }} --pattern 'wikipedia-100K.tar.gz' --dir . 
+ tar xzf wikipedia-100K.tar.gz -C diskann_rust/target/tmp/ + cp -r diskann_rust/target/tmp/wikipedia_cohere baseline/target/tmp/ + + - name: Run baseline benchmark + working-directory: baseline + run: | + cargo run -p diskann-benchmark --features disk-index --release -- \ + run --input-file ../diskann_rust/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json \ + --output-file target/tmp/wikipedia-100K_benchmark_crate_baseline.json + + - name: Run target benchmark + working-directory: diskann_rust + run: | + cargo run -p diskann-benchmark --features disk-index --release -- \ + run --input-file diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json \ + --output-file target/tmp/wikipedia-100K_benchmark_crate_target.json + + - name: Generate diff stats (baseline vs target) + run: | + python diskann_rust/.github/scripts/compare_disk_index_json_output.py \ + --baseline baseline/target/tmp/wikipedia-100K_benchmark_crate_baseline.json \ + --branch diskann_rust/target/tmp/wikipedia-100K_benchmark_crate_target.json \ + --out diskann_rust/target/tmp/wikipedia-100K_change.csv + + - name: Convert results to Markdown + working-directory: diskann_rust + run: | + python .github/scripts/csv_to_markdown.py \ + --csv target/tmp/wikipedia-100K_change.csv \ + --md target/tmp/wikipedia-100K_change.md \ + --title 'A/A Results: Wikipedia-100K Dataset' + + - name: Validate benchmark results + working-directory: diskann_rust + run: | + python .github/scripts/benchmark_result_parse.py \ + --mode aa \ + --file target/tmp/wikipedia-100K_change.csv + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GITHUB_REPOSITORY: ${{ github.repository }} + GITHUB_RUN_ID: ${{ github.run_id }} + + - name: Upload benchmark results + uses: actions/upload-artifact@v4 + if: always() + with: + name: aa-results-wikipedia-100K + path: | + diskann_rust/target/tmp/wikipedia-100K_change.csv + diskann_rust/target/tmp/wikipedia-100K_change.md + 
diskann_rust/target/tmp/wikipedia-100K_benchmark_crate_target.json + baseline/target/tmp/wikipedia-100K_benchmark_crate_baseline.json + retention-days: 30 + + # A/A benchmark: OpenAI ArXiv 100K dataset (main vs main) + aa-openai-100K: + name: A/A - OAI ArXiv 100K + runs-on: ubuntu-latest + timeout-minutes: 120 + + steps: + - name: Checkout main (target) + uses: actions/checkout@v4 + with: + ref: main + path: diskann_rust + lfs: true + + - name: Checkout main (baseline) + uses: actions/checkout@v4 + with: + ref: main + path: baseline + lfs: true + + - name: Install Rust ${{ env.rust_stable }} + uses: dtolnay/rust-toolchain@master + with: + toolchain: ${{ env.rust_stable }} + + - name: Cache Rust dependencies (target) + uses: Swatinem/rust-cache@v2 + with: + workspaces: diskann_rust -> target + key: aa-target + + - name: Cache Rust dependencies (baseline) + uses: Swatinem/rust-cache@v2 + with: + workspaces: baseline -> target + key: aa-baseline + + - name: Install system dependencies + run: | + sudo apt-get update + sudo apt-get install -y openssl libssl-dev pkg-config + + # Download pre-packaged OpenAI ArXiv 100K dataset from GitHub Release + - name: Download openai-100K dataset + env: + GH_TOKEN: ${{ github.token }} + run: | + mkdir -p diskann_rust/target/tmp baseline/target/tmp + gh release download benchmark-data-v1 --repo ${{ github.repository }} --pattern 'openai-100K.tar.gz' --dir . 
+ tar xzf openai-100K.tar.gz -C diskann_rust/target/tmp/ + cp -r diskann_rust/target/tmp/OpenAIArXiv baseline/target/tmp/ + + - name: Run baseline benchmark + working-directory: baseline + run: | + cargo run -p diskann-benchmark --features disk-index --release -- \ + run --input-file ../diskann_rust/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json \ + --output-file target/tmp/openai-100K_benchmark_crate_baseline.json + + - name: Run target benchmark + working-directory: diskann_rust + run: | + cargo run -p diskann-benchmark --features disk-index --release -- \ + run --input-file diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json \ + --output-file target/tmp/openai-100K_benchmark_crate_target.json + + - name: Generate diff stats (baseline vs target) + run: | + python diskann_rust/.github/scripts/compare_disk_index_json_output.py \ + --baseline baseline/target/tmp/openai-100K_benchmark_crate_baseline.json \ + --branch diskann_rust/target/tmp/openai-100K_benchmark_crate_target.json \ + --out diskann_rust/target/tmp/openai-100K_change.csv + + - name: Convert results to Markdown + working-directory: diskann_rust + run: | + python .github/scripts/csv_to_markdown.py \ + --csv target/tmp/openai-100K_change.csv \ + --md target/tmp/openai-100K_change.md \ + --title 'A/A Results: OpenAI ArXiv 100K Dataset' + + - name: Validate benchmark results + working-directory: diskann_rust + run: | + python .github/scripts/benchmark_result_parse.py \ + --mode aa \ + --file target/tmp/openai-100K_change.csv + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GITHUB_REPOSITORY: ${{ github.repository }} + GITHUB_RUN_ID: ${{ github.run_id }} + + - name: Upload benchmark results + uses: actions/upload-artifact@v4 + if: always() + with: + name: aa-results-openai-100K + path: | + diskann_rust/target/tmp/openai-100K_change.csv + diskann_rust/target/tmp/openai-100K_change.md + diskann_rust/target/tmp/openai-100K_benchmark_crate_target.json + 
baseline/target/tmp/openai-100K_benchmark_crate_baseline.json + retention-days: 30 + + # Notify diskann-admin on A/A failure + notify-on-failure: + name: Notify on A/A Failure + needs: [aa-wikipedia-100K, aa-openai-100K] + runs-on: ubuntu-latest + if: failure() + steps: + - name: Create GitHub issue for A/A failure + uses: actions/github-script@v7 + with: + script: | + const date = new Date().toISOString().split('T')[0]; + const runUrl = `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`; + await github.rest.issues.create({ + owner: context.repo.owner, + repo: context.repo.repo, + title: `[Benchmark A/A] Daily stability test failed – ${date}`, + body: [ + `## Daily A/A Benchmark Failure`, + ``, + `The scheduled A/A benchmark run (main vs main) **failed** on ${date}.`, + `This indicates environment noise exceeded the configured thresholds.`, + ``, + `**Run:** ${runUrl}`, + ``, + `Please review the benchmark artifacts and determine if thresholds need tuning`, + `or if there is a runner environment issue.`, + ``, + `/cc @microsoft/diskann-admin`, + ].join('\n'), + labels: ['benchmark', 'A/A-failure'], + }); From 5b0e7f72770cb283b32e3c231f2d53fbd679efe9 Mon Sep 17 00:00:00 2001 From: "Yuanyuan Tian (from Dev Box)" Date: Fri, 27 Mar 2026 15:30:03 +0800 Subject: [PATCH 22/31] widen mean_cpus threshold to 15% for shared-runner CPU noise --- .github/scripts/benchmark_result_parse.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/scripts/benchmark_result_parse.py b/.github/scripts/benchmark_result_parse.py index c51036056..27385406d 100644 --- a/.github/scripts/benchmark_result_parse.py +++ b/.github/scripts/benchmark_result_parse.py @@ -178,7 +178,7 @@ "latency_95": [10, 'LT', ""], "mean_latency": [10, 'LT', ""], "mean_io_time": [10, 'LT', ""], - "mean_cpus": [10, 'LT', ""], + "mean_cpus": [15, 'LT', ""], # wider threshold — CPU time is noisy on shared runners "qps": [10, 'GT', 6.5], "mean_ios": 
[1, 'LT', 2410], "mean_comps": [1, 'LT', 33200], @@ -189,7 +189,7 @@ "latency_95": [10, 'LT', ""], "mean_latency": [10, 'LT', ""], "mean_io_time": [10, 'LT', ""], - "mean_cpus": [10, 'LT', ""], + "mean_cpus": [15, 'LT', ""], # wider threshold — CPU time is noisy on shared runners "qps": [10, 'GT', ""], "mean_ios": [10, 'LT', ""], "mean_comps": [10, 'LT', ""], @@ -200,7 +200,7 @@ "latency_95": [10, 'LT', ""], "mean_latency": [10, 'LT', ""], "mean_io_time": [10, 'LT', ""], - "mean_cpus": [10, 'LT', ""], + "mean_cpus": [15, 'LT', ""], # wider threshold — CPU time is noisy on shared runners "qps": [10, 'GT', ""], "mean_ios": [10, 'LT', ""], "mean_comps": [10, 'LT', ""], From 639d4bb2c19cb34bc978834546ee0aee2c115fdf Mon Sep 17 00:00:00 2001 From: "Yuanyuan Tian (from Dev Box)" Date: Mon, 30 Mar 2026 15:32:20 +0800 Subject: [PATCH 23/31] move benchmark datasets to separate repo (YuanyuanTian-hh/diskann-benchmark-data) --- .github/workflows/benchmarks-aa.yml | 6 ++++-- .github/workflows/benchmarks.yml | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/.github/workflows/benchmarks-aa.yml b/.github/workflows/benchmarks-aa.yml index b272f7c69..c1fd45ad8 100644 --- a/.github/workflows/benchmarks-aa.yml +++ b/.github/workflows/benchmarks-aa.yml @@ -76,12 +76,13 @@ jobs: sudo apt-get install -y openssl libssl-dev pkg-config # Download pre-packaged Wikipedia-100K dataset from GitHub Release + # Source: https://github.com/harsha-simhadri/big-ann-benchmarks - name: Download wikipedia-100K dataset env: GH_TOKEN: ${{ github.token }} run: | mkdir -p diskann_rust/target/tmp baseline/target/tmp - gh release download benchmark-data-v1 --repo ${{ github.repository }} --pattern 'wikipedia-100K.tar.gz' --dir . + gh release download v1 --repo YuanyuanTian-hh/diskann-benchmark-data --pattern 'wikipedia-100K.tar.gz' --dir . 
tar xzf wikipedia-100K.tar.gz -C diskann_rust/target/tmp/ cp -r diskann_rust/target/tmp/wikipedia_cohere baseline/target/tmp/ @@ -181,12 +182,13 @@ jobs: sudo apt-get install -y openssl libssl-dev pkg-config # Download pre-packaged OpenAI ArXiv 100K dataset from GitHub Release + # Source: https://github.com/harsha-simhadri/big-ann-benchmarks - name: Download openai-100K dataset env: GH_TOKEN: ${{ github.token }} run: | mkdir -p diskann_rust/target/tmp baseline/target/tmp - gh release download benchmark-data-v1 --repo ${{ github.repository }} --pattern 'openai-100K.tar.gz' --dir . + gh release download v1 --repo YuanyuanTian-hh/diskann-benchmark-data --pattern 'openai-100K.tar.gz' --dir . tar xzf openai-100K.tar.gz -C diskann_rust/target/tmp/ cp -r diskann_rust/target/tmp/OpenAIArXiv baseline/target/tmp/ diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index a5c5c3a03..8cdf767a9 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -93,12 +93,13 @@ jobs: # Download pre-packaged Wikipedia-100K dataset from GitHub Release # Dataset: 100K Cohere Wikipedia embeddings (768-dim, float32, cosine distance) + # Source: https://github.com/harsha-simhadri/big-ann-benchmarks - name: Download wikipedia-100K dataset env: GH_TOKEN: ${{ github.token }} run: | mkdir -p diskann_rust/target/tmp baseline/target/tmp - gh release download benchmark-data-v1 --repo ${{ github.repository }} --pattern 'wikipedia-100K.tar.gz' --dir . + gh release download v1 --repo YuanyuanTian-hh/diskann-benchmark-data --pattern 'wikipedia-100K.tar.gz' --dir . 
tar xzf wikipedia-100K.tar.gz -C diskann_rust/target/tmp/ cp -r diskann_rust/target/tmp/wikipedia_cohere baseline/target/tmp/ @@ -203,12 +204,13 @@ jobs: # Download pre-packaged OpenAI ArXiv 100K dataset from GitHub Release # Dataset: 100K OpenAI embeddings of ArXiv papers (1536-dim, float32, euclidean distance) + # Source: https://github.com/harsha-simhadri/big-ann-benchmarks - name: Download openai-100K dataset env: GH_TOKEN: ${{ github.token }} run: | mkdir -p diskann_rust/target/tmp baseline/target/tmp - gh release download benchmark-data-v1 --repo ${{ github.repository }} --pattern 'openai-100K.tar.gz' --dir . + gh release download v1 --repo YuanyuanTian-hh/diskann-benchmark-data --pattern 'openai-100K.tar.gz' --dir . tar xzf openai-100K.tar.gz -C diskann_rust/target/tmp/ cp -r diskann_rust/target/tmp/OpenAIArXiv baseline/target/tmp/ From 1b52de23ded0fdf6b570d9481e13831a67c95f42 Mon Sep 17 00:00:00 2001 From: "Yuanyuan Tian (from Dev Box)" Date: Mon, 30 Mar 2026 15:57:22 +0800 Subject: [PATCH 24/31] consolidate 3 benchmark scripts into single benchmark_validate.py Replaces the previous 3-step pipeline (JSON->CSV->Markdown->validate) with a single script that reads both JSONs directly, compares metrics, writes Markdown to step summary, checks thresholds, and posts PR comments. Removed: - compare_disk_index_json_output.py (JSON diff -> CSV) - csv_to_markdown.py (CSV -> Markdown) - benchmark_result_parse.py (CSV -> threshold check) Also removes pip install csvtomd/numpy/scipy - all scripts now use stdlib only.
--- .github/scripts/benchmark_result_parse.py | 574 ------------------ .github/scripts/benchmark_validate.py | 425 +++++++++++++ .../scripts/compare_disk_index_json_output.py | 256 -------- .github/scripts/csv_to_markdown.py | 50 -- .github/workflows/benchmarks-aa.yml | 48 +- .github/workflows/benchmarks.yml | 57 +- 6 files changed, 444 insertions(+), 966 deletions(-) delete mode 100644 .github/scripts/benchmark_result_parse.py create mode 100644 .github/scripts/benchmark_validate.py delete mode 100644 .github/scripts/compare_disk_index_json_output.py delete mode 100644 .github/scripts/csv_to_markdown.py diff --git a/.github/scripts/benchmark_result_parse.py b/.github/scripts/benchmark_result_parse.py deleted file mode 100644 index 27385406d..000000000 --- a/.github/scripts/benchmark_result_parse.py +++ /dev/null @@ -1,574 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT license. - -""" -Benchmark Result Parser for GitHub Actions - -Parses benchmark CSV results and validates against thresholds. -Posts comments to GitHub PRs when regressions are detected. 
- -Usage: - python benchmark_result_parse.py --mode pr --file results.csv - python benchmark_result_parse.py --mode aa --file results.csv --data search - -Environment Variables (for PR comments): - GITHUB_TOKEN: GitHub token for API access - GITHUB_REPOSITORY: Owner/repo (e.g., "microsoft/DiskANN") - GITHUB_PR_NUMBER: Pull request number - GITHUB_RUN_ID: Workflow run ID for linking to logs -""" - -import csv -import os -import sys -import argparse -import json -from typing import Any - -# Optional: requests for posting PR comments -try: - import requests - HAS_REQUESTS = True -except ImportError: - HAS_REQUESTS = False - - -# ============================================================================= -# Data Structures -# ============================================================================= - -# Template for full benchmark data (build + search) -DATA_TEMPLATE_FULL = { - "DiskIndexBuild-PqConstruction": { - "duration_seconds": [], - "peak_memory_usage": [] - }, - "DiskIndexBuild-InmemIndexBuild": { - "duration_seconds": [], - "peak_memory_usage": [] - }, - "search_disk_index-search_completed": { - "duration_seconds": [], - "peak_memory_usage": [] - }, - "disk_index_perf_test": { - "total_duration_seconds": [], - }, - "index-build statistics": { - "total_time": [], - "total_comparisons": [], - "search_hops": [] - }, - "search-with-L=2000-bw=4": { - "latency_95": [], - "mean_latency": [], - "mean_io_time": [], - "mean_cpus": [], - "qps": [], - "mean_ios": [], - "mean_comps": [], - "mean_hops": [], - "recall": [] - }, - "search-with-L=100-bw=4": { - "latency_95": [], - "mean_latency": [], - "mean_io_time": [], - "mean_cpus": [], - "qps": [], - "mean_ios": [], - "mean_comps": [], - "mean_hops": [], - "recall": [] - }, - "search-with-L=200-bw=4": { - "latency_95": [], - "mean_latency": [], - "mean_io_time": [], - "mean_cpus": [], - "qps": [], - "mean_ios": [], - "mean_comps": [], - "mean_hops": [], - "recall": [] - } -} -DATA_TEMPLATE_SEARCH = { - 
"search_disk_index-search_completed": { - "duration_seconds": [], - "peak_memory_usage": [] - }, - "disk_index_perf_test": { - "total_duration_seconds": [], - }, - "search-with-L=2000-bw=4": { - "latency_95": [], - "mean_latency": [], - "mean_io_time": [], - "mean_cpus": [], - "qps": [], - "mean_ios": [], - "mean_comps": [], - "mean_hops": [], - "recall": [] - }, - "search-with-L=100-bw=4": { - "latency_95": [], - "mean_latency": [], - "mean_io_time": [], - "mean_cpus": [], - "qps": [], - "mean_ios": [], - "mean_comps": [], - "mean_hops": [], - "recall": [] - }, - "search-with-L=200-bw=4": { - "latency_95": [], - "mean_latency": [], - "mean_io_time": [], - "mean_cpus": [], - "qps": [], - "mean_ios": [], - "mean_comps": [], - "mean_hops": [], - "recall": [] - } -} - -# Thresholds for benchmark values -# Format: [threshold_percentage, direction, contract_value] -# - threshold_percentage: Maximum allowed deviation percentage -# - direction: 'GT' = higher is better, 'LT' = lower is better -# - contract_value: Promised performance value (empty string if none) -# -# For 'GT' metrics (like QPS, recall): regression if value decreases beyond threshold -# For 'LT' metrics (like latency, memory): regression if value increases beyond threshold -DATA_THRESHOLDS = { - "DiskIndexBuild-PqConstruction": { - "duration_seconds": [10, 'LT', ""], - "peak_memory_usage": [10, 'LT', ""] - }, - "DiskIndexBuild-InmemIndexBuild": { - "duration_seconds": [10, 'LT', ""], - "peak_memory_usage": [10, 'LT', ""] - }, - "search_disk_index-search_completed": { - "duration_seconds": [10, 'LT', ""], - "peak_memory_usage": [10, 'LT', 1.42] - }, - "disk_index_perf_test": { - "total_duration_seconds": [10, 'LT', ""], - }, - "index-build statistics": { - # Calibrated from 5 GitHub runner runs (10 observations): - # Wikipedia: 35.9–37.2s, OpenAI: 23.0–76.4s (SQ_1_2.0 variance) - # Contract: worst × 1.5 to absorb shared-runner variance - "total_time": [10, 'LT', 115], - "total_comparisons": [1, 'LT', ""], - 
"search_hops": [1, 'LT', ""] - }, - "search-with-L=2000-bw=4": { - # Calibrated from 5 GitHub runner runs (10 observations): - # QPS: 9.56–9.75 (both datasets) - # Recall: wiki 99.87%, oai 99.67–99.91% - # mean_ios: ~2007 (deterministic) - # mean_comps: wiki ~27609, oai 21618–24733 - "latency_95": [10, 'LT', ""], - "mean_latency": [10, 'LT', ""], - "mean_io_time": [10, 'LT', ""], - "mean_cpus": [15, 'LT', ""], # wider threshold — CPU time is noisy on shared runners - "qps": [10, 'GT', 6.5], - "mean_ios": [1, 'LT', 2410], - "mean_comps": [1, 'LT', 33200], - "mean_hops": [1, 'LT', ""], - "recall": [1, 'GT', 98.0] - }, - "search-with-L=100-bw=4": { - "latency_95": [10, 'LT', ""], - "mean_latency": [10, 'LT', ""], - "mean_io_time": [10, 'LT', ""], - "mean_cpus": [15, 'LT', ""], # wider threshold — CPU time is noisy on shared runners - "qps": [10, 'GT', ""], - "mean_ios": [10, 'LT', ""], - "mean_comps": [10, 'LT', ""], - "mean_hops": [10, 'LT', ""], - "recall": [1, 'GT', ""] - }, - "search-with-L=200-bw=4": { - "latency_95": [10, 'LT', ""], - "mean_latency": [10, 'LT', ""], - "mean_io_time": [10, 'LT', ""], - "mean_cpus": [15, 'LT', ""], # wider threshold — CPU time is noisy on shared runners - "qps": [10, 'GT', ""], - "mean_ios": [10, 'LT', ""], - "mean_comps": [10, 'LT', ""], - "mean_hops": [10, 'LT', ""], - "recall": [1, 'GT', ""] - } -} - - -# ============================================================================= -# CSV Parsing -# ============================================================================= - -def parse_csv(file_path: str, data: dict[str, dict[str, list]]) -> dict[str, dict[str, list]]: - """ - Parse benchmark CSV file and populate data structure. - - CSV format produced by compare_disk_index_json_output.py: - Column 0: Parent Span Name (category, e.g. "index-build statistics") - Column 1: Span Name (display name, unused for matching) - Column 2: Stat Key (metric key, e.g. 
"qps") - Column 3: Stat Value (Target) - Column 4: Stat Value (Baseline) - Column 5: Deviation (%) - """ - with open(file_path, 'r', encoding='utf-8') as f: - reader = csv.reader(f) - next(reader) # Skip header row - - for row in reader: - if len(row) < 6: - continue - - category = row[0].strip() - metric_name = row[2].strip() - - if category in data and metric_name in data[category]: - # Append: [current_value, baseline_value, change_percentage] - data[category][metric_name].append(row[3]) # target (current) - data[category][metric_name].append(row[4]) # baseline - data[category][metric_name].append(row[5]) # deviation % - - return data - - -def get_data_template(data_type: str) -> dict[str, dict[str, list]]: - """Get a fresh copy of the data template.""" - import copy - if data_type == 'search': - return copy.deepcopy(DATA_TEMPLATE_SEARCH) - return copy.deepcopy(DATA_TEMPLATE_FULL) - - -# ============================================================================= -# Threshold Checking -# ============================================================================= - -def get_target_change_range(threshold: float, direction: str, mode: str) -> tuple[float, float]: - """ - Calculate acceptable change range based on threshold and direction. 
- - Args: - threshold: Maximum allowed deviation percentage - direction: 'GT' (higher is better) or 'LT' (lower is better) - mode: 'aa' (A/A test, symmetric) or 'pr' (PR test, directional) - - Returns: - Tuple of (min_allowed, max_allowed) change percentages - """ - if mode == 'aa': - # A/A test: symmetric threshold - return (-threshold, threshold) - else: - # PR test: directional threshold - if direction == 'GT': - # Higher is better: allow any improvement, flag regressions - return (-threshold, float('inf')) - else: - # Lower is better: allow any improvement (negative change), flag increases - return (float('-inf'), threshold) - - -def format_interval(start: float, end: float) -> str: - """Format a numeric interval as a string.""" - start_str = '-inf' if start == float('-inf') else f"{start}%" - end_str = 'inf' if end == float('inf') else f"{end}%" - return f"({start_str} - {end_str})" - - -def is_change_threshold_failed(change: float, target_range: tuple[float, float]) -> bool: - """Check if the change exceeds the allowed threshold range.""" - return change < target_range[0] or change > target_range[1] - - -def is_promise_broken(current_value: float, target_value: Any, direction: str) -> tuple[bool, str]: - """ - Check if the current value violates a promised contract value. 
- - Returns: - Tuple of (is_broken, formatted_target_value) - """ - if target_value == "": - return False, "N/A" - - target_value = float(target_value) - - if direction == 'GT': - # Higher is better: current should be >= target - if current_value < target_value: - return True, f"> {target_value}" - else: - # Lower is better: current should be <= target - if current_value > target_value: - return True, f"< {target_value}" - - return False, str(target_value) - - -def get_outcome_message(threshold_failed: bool, promise_broken: bool) -> str: - """Generate human-readable outcome message.""" - if threshold_failed and promise_broken: - return 'Regression detected, Promise broken' - elif promise_broken: - return 'Promise broken' - elif threshold_failed: - return 'Regression detected' - return 'OK' - - -def check_thresholds( - data: dict[str, dict[str, list]], - thresholds: dict[str, dict[str, list]], - mode: str, - run_id: str | None = None -) -> tuple[bool, str]: - """ - Check all metrics against their thresholds. 
- - Returns: - Tuple of (has_failures, failure_report_markdown) - """ - failed_rows = [] - - for category in data: - for metric in data[category]: - # Skip metrics without thresholds defined - if category not in thresholds or metric not in thresholds[category]: - print(f"Skipping {category}/{metric} - no threshold defined") - continue - - values = data[category][metric] - if not values: - # No data for this metric in the CSV — skip silently - continue - - # Parse values: [current, baseline, change%] - try: - value_current = float(values[0]) - value_baseline = float(values[1]) - change = float(values[2]) if values[2] else 0.0 - except (ValueError, IndexError) as e: - print(f"ERROR: Failed to parse {category}/{metric}: {e}") - return True, f"Parse error for {category}/{metric}" - - # Get threshold config - threshold_config = thresholds[category][metric] - threshold_pct = threshold_config[0] - direction = threshold_config[1] - contract_value = threshold_config[2] - - # Check thresholds - target_range = get_target_change_range(threshold_pct, direction, mode) - threshold_failed = is_change_threshold_failed(change, target_range) - promise_broken, target_formatted = is_promise_broken(value_current, contract_value, direction) - - if threshold_failed: - print(f"THRESHOLD FAILED: {category}/{metric} change={change}% allowed={format_interval(*target_range)}") - if promise_broken: - print(f"PROMISE BROKEN: {category}/{metric} value={value_current} required={target_formatted}") - - if threshold_failed or promise_broken: - outcome = get_outcome_message(threshold_failed, promise_broken) - failed_rows.append( - f"| {category}/{metric} | {value_baseline} | {value_current} | " - f"{target_formatted} | {change}% | {format_interval(*target_range)} | {outcome} |" - ) - - if failed_rows: - # Build failure report - logs_link = "" - if run_id: - repo = os.getenv('GITHUB_REPOSITORY', 'microsoft/DiskANN') - logs_link = f"https://github.com/{repo}/actions/runs/{run_id}" - - report = "### ❌ 
Benchmark Check Failed\n\n" - if logs_link: - report += f"Please investigate the [workflow logs]({logs_link}) to determine if the failure is due to your changes.\n\n" - - report += "| Metric | Baseline | Current | Contract | Change | Allowed | Outcome |\n" - report += "|--------|----------|---------|----------|--------|---------|--------|\n" - report += "\n".join(failed_rows) - - return True, report - - return False, "" - - -# ============================================================================= -# GitHub Integration -# ============================================================================= - -def post_github_pr_comment(comment: str) -> bool: - """ - Post a comment to a GitHub pull request. - - Requires environment variables: - GITHUB_TOKEN: Personal access token or GitHub Actions token - GITHUB_REPOSITORY: Owner/repo format - GITHUB_PR_NUMBER: Pull request number - """ - if not HAS_REQUESTS: - print("WARNING: 'requests' module not available, cannot post PR comment") - return False - - token = os.getenv('GITHUB_TOKEN') - repo = os.getenv('GITHUB_REPOSITORY') - pr_number = os.getenv('GITHUB_PR_NUMBER') - - if not all([token, repo, pr_number]): - print("WARNING: Missing GitHub environment variables for PR comment") - print(f" GITHUB_TOKEN: {'set' if token else 'missing'}") - print(f" GITHUB_REPOSITORY: {repo or 'missing'}") - print(f" GITHUB_PR_NUMBER: {pr_number or 'missing'}") - return False - - url = f"https://api.github.com/repos/{repo}/issues/{pr_number}/comments" - headers = { - "Accept": "application/vnd.github+json", - "Authorization": f"Bearer {token}", - "X-GitHub-Api-Version": "2022-11-28" - } - body = {"body": comment} - - try: - response = requests.post(url, headers=headers, json=body, timeout=30) - response.raise_for_status() - print(f"Successfully posted comment to PR #{pr_number}") - return True - except requests.RequestException as e: - print(f"ERROR: Failed to post PR comment: {e}") - return False - - -def 
write_github_step_summary(content: str) -> None: - """Write content to GitHub Actions step summary.""" - summary_file = os.getenv('GITHUB_STEP_SUMMARY') - if summary_file: - with open(summary_file, 'a', encoding='utf-8') as f: - f.write(content) - f.write("\n") - - -def write_github_output(name: str, value: str) -> None: - """Write an output variable for GitHub Actions.""" - output_file = os.getenv('GITHUB_OUTPUT') - if output_file: - with open(output_file, 'a', encoding='utf-8') as f: - f.write(f"{name}={value}\n") - - -# ============================================================================= -# Main -# ============================================================================= - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser( - description='Parse benchmark results and validate against thresholds.', - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=""" -Examples: - # Check PR benchmark results - python benchmark_result_parse.py --mode pr --file results_change.csv - - # Check A/A test results (symmetric thresholds) - python benchmark_result_parse.py --mode aa --file results_change.csv - - # Check search-only benchmarks - python benchmark_result_parse.py --mode pr --file results_change.csv --data search - """ - ) - parser.add_argument( - '--mode', - type=str, - default='aa', - choices=['aa', 'pr', 'lkg'], - help='Benchmark mode: aa=A/A test (symmetric), pr=PR test (directional), lkg=last known good' - ) - parser.add_argument( - '--data', - type=str, - default='both', - choices=['both', 'search'], - help='Type of benchmark data: both=full benchmark, search=search-only' - ) - parser.add_argument( - '--file', - type=str, - default=None, - help='Path to CSV file (overrides FILE_PATH env var)' - ) - parser.add_argument( - '--no-comment', - action='store_true', - help='Skip posting PR comment even in pr mode' - ) - return parser.parse_args() - - -def main() -> int: - args = parse_args() - - # Get file path - file_path = 
args.file or os.getenv('FILE_PATH') - if not file_path: - print("ERROR: No input file specified. Use --file or set FILE_PATH env var.") - return 1 - - if not os.path.exists(file_path): - print(f"ERROR: File not found: {file_path}") - return 1 - - print(f"Benchmark mode: {args.mode}") - print(f"Data type: {args.data}") - print(f"Input file: {file_path}") - - # Parse CSV - data_template = get_data_template(args.data) - data = parse_csv(file_path, data_template) - - # Debug output - print("\nParsed data:") - print(json.dumps({k: {sk: sv for sk, sv in v.items() if sv} for k, v in data.items() if any(v.values())}, indent=2)) - - # Check thresholds - run_id = os.getenv('GITHUB_RUN_ID') - has_failures, report = check_thresholds(data, DATA_THRESHOLDS, args.mode, run_id) - - if has_failures: - print("\n" + report) - - # Write to GitHub step summary - write_github_step_summary(report) - - # Post PR comment if in pr mode - if args.mode == 'pr' and not args.no_comment: - post_github_pr_comment(report) - - # Set output for downstream steps - write_github_output('benchmark_failed', 'true') - - return 1 - - print("\n✅ All benchmark values passed!") - write_github_step_summary("### ✅ Benchmark Check Passed\n\nAll metrics within acceptable thresholds.") - write_github_output('benchmark_failed', 'false') - - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/.github/scripts/benchmark_validate.py b/.github/scripts/benchmark_validate.py new file mode 100644 index 000000000..cb69f6054 --- /dev/null +++ b/.github/scripts/benchmark_validate.py @@ -0,0 +1,425 @@ +#!/usr/bin/env python3 +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT license. + +""" +Benchmark Validator for GitHub Actions + +Compares two benchmark JSON outputs (baseline vs target), checks thresholds, +writes a Markdown summary, and optionally posts a PR comment on failure. 
+ +This single script replaces the previous three-step pipeline: + compare_disk_index_json_output.py → csv_to_markdown.py → benchmark_result_parse.py + +Usage: + # PR mode (directional thresholds, posts PR comment on failure) + python benchmark_validate.py --mode pr --baseline baseline.json --target target.json + + # A/A mode (symmetric thresholds) + python benchmark_validate.py --mode aa --baseline baseline.json --target target.json + +Environment Variables (for PR comments): + GITHUB_TOKEN: GitHub token for API access + GITHUB_REPOSITORY: Owner/repo (e.g., "microsoft/DiskANN") + GITHUB_PR_NUMBER: Pull request number + GITHUB_RUN_ID: Workflow run ID for linking to logs + GITHUB_STEP_SUMMARY: Path to step summary file +""" + +import json +import os +import sys +import argparse +from typing import Any +from urllib.request import urlopen, Request +from urllib.error import URLError + + +# ============================================================================= +# JSON Extraction +# ============================================================================= + +def load_json(path: str) -> list[dict[str, Any]]: + with open(path, "r", encoding="utf-8") as f: + return json.load(f) + + +def extract_build_metrics(results: dict) -> dict[str, float]: + build = results.get("build", {}) + if not build: + return {} + + metrics: dict[str, float] = {} + + build_time = build.get("build_time") + if build_time: + metrics["total_time"] = build_time / 1e6 # μs → s + + for span in build.get("span_metrics", {}).get("spans", []): + name = span.get("span_name", "") + data = span.get("metrics", {}) + if name == "DiskIndexBuild-PqConstruction": + metrics["pq_construction_time"] = data.get("duration_seconds", 0) + elif name == "DiskIndexBuild-InmemIndexBuild": + metrics["inmem_index_build_time"] = data.get("duration_seconds", 0) + elif name == "DiskIndexBuild-DiskLayout": + metrics["disk_layout_time"] = data.get("duration_seconds", 0) + + return metrics + + +def 
extract_search_metrics(results: dict, search_l: int, beam_width: int) -> dict[str, float]: + search = results.get("search", {}) + if not search: + return {} + + metrics: dict[str, float] = {} + + # From search_results_per_l + for sr in search.get("search_results_per_l", []): + if sr.get("search_l") == search_l: + metrics["qps"] = sr.get("qps", 0) + metrics["recall"] = sr.get("recall", 0) + metrics["mean_latency"] = sr.get("mean_latency", 0) + metrics["mean_ios"] = sr.get("mean_ios", 0) + metrics["mean_comps"] = sr.get("mean_comparisons", 0) + metrics["mean_hops"] = sr.get("mean_hops", 0) + metrics["mean_io_time"] = sr.get("mean_io_time", 0) + metrics["mean_cpus"] = sr.get("mean_cpu_time", 0) + metrics["latency_95"] = sr.get("p999_latency", 0) + break + + # Override with span metrics if available + span_name = f"search-with-L={search_l}-bw={beam_width}" + for span in search.get("span_metrics", {}).get("spans", []): + if span.get("span_name") == span_name: + data = span.get("metrics", {}) + for key in ("qps", "recall", "mean_latency", "mean_ios", "mean_comps", + "mean_hops", "mean_io_time", "mean_cpus"): + if key in data: + metrics[key] = data[key] + break + + return metrics + + +def compute_diff(baseline_json: list[dict], target_json: list[dict]) -> list[dict]: + """ + Compare baseline and target JSONs. + Returns a flat list of metric diffs: + [{category, metric, baseline, target, deviation}, ...] 
+ """ + rows = [] + + for baseline, target in zip(baseline_json, target_json): + b_results = baseline.get("results", {}) + t_results = target.get("results", {}) + + inp = target.get("input", {}) + search_phase = inp.get("content", {}).get("search_phase", {}) + search_list = search_phase.get("search_list", [2000]) + beam_width = search_phase.get("beam_width", 4) + primary_l = search_list[0] if search_list else 2000 + + # Build metrics + b_build = extract_build_metrics(b_results) + t_build = extract_build_metrics(t_results) + + for key in ("total_time", "pq_construction_time", "inmem_index_build_time", "disk_layout_time"): + if key in t_build or key in b_build: + bv = b_build.get(key, 0) + tv = t_build.get(key, 0) + rows.append({ + "category": "index-build statistics", + "metric": key, + "baseline": bv, + "target": tv, + "deviation": ((tv - bv) / bv * 100) if bv else 0, + }) + + # Search metrics + b_search = extract_search_metrics(b_results, primary_l, beam_width) + t_search = extract_search_metrics(t_results, primary_l, beam_width) + span_cat = f"search-with-L={primary_l}-bw={beam_width}" + + for key in ("qps", "recall", "mean_latency", "latency_95", "mean_ios", + "mean_comps", "mean_hops", "mean_io_time", "mean_cpus"): + if key in t_search or key in b_search: + bv = b_search.get(key, 0) + tv = t_search.get(key, 0) + rows.append({ + "category": span_cat, + "metric": key, + "baseline": bv, + "target": tv, + "deviation": ((tv - bv) / bv * 100) if bv else 0, + }) + + return rows + + +# ============================================================================= +# Thresholds +# ============================================================================= + +# Format: [max_deviation_%, direction, contract_value] +# direction: 'GT' = higher is better, 'LT' = lower is better +# contract_value: absolute limit (empty string = none) +THRESHOLDS: dict[str, dict[str, list]] = { + "DiskIndexBuild-PqConstruction": { + "duration_seconds": [10, "LT", ""], + "peak_memory_usage": 
[10, "LT", ""], + }, + "DiskIndexBuild-InmemIndexBuild": { + "duration_seconds": [10, "LT", ""], + "peak_memory_usage": [10, "LT", ""], + }, + "search_disk_index-search_completed": { + "duration_seconds": [10, "LT", ""], + "peak_memory_usage": [10, "LT", 1.42], + }, + "disk_index_perf_test": { + "total_duration_seconds": [10, "LT", ""], + }, + "index-build statistics": { + # Calibrated from 5 GitHub runner runs (10 observations): + # Wikipedia: 35.9–37.2s, OpenAI: 23.0–76.4s (SQ_1_2.0 variance) + # Contract: worst × 1.5 to absorb shared-runner variance + "total_time": [10, "LT", 115], + "total_comparisons": [1, "LT", ""], + "search_hops": [1, "LT", ""], + }, + "search-with-L=2000-bw=4": { + # Calibrated from 5 GitHub runner runs (10 observations) + "latency_95": [10, "LT", ""], + "mean_latency": [10, "LT", ""], + "mean_io_time": [10, "LT", ""], + "mean_cpus": [15, "LT", ""], # wider — CPU time is noisy on shared runners + "qps": [10, "GT", 6.5], + "mean_ios": [1, "LT", 2410], + "mean_comps": [1, "LT", 33200], + "mean_hops": [1, "LT", ""], + "recall": [1, "GT", 98.0], + }, + "search-with-L=100-bw=4": { + "latency_95": [10, "LT", ""], + "mean_latency": [10, "LT", ""], + "mean_io_time": [10, "LT", ""], + "mean_cpus": [15, "LT", ""], + "qps": [10, "GT", ""], + "mean_ios": [10, "LT", ""], + "mean_comps": [10, "LT", ""], + "mean_hops": [10, "LT", ""], + "recall": [1, "GT", ""], + }, + "search-with-L=200-bw=4": { + "latency_95": [10, "LT", ""], + "mean_latency": [10, "LT", ""], + "mean_io_time": [10, "LT", ""], + "mean_cpus": [15, "LT", ""], + "qps": [10, "GT", ""], + "mean_ios": [10, "LT", ""], + "mean_comps": [10, "LT", ""], + "mean_hops": [10, "LT", ""], + "recall": [1, "GT", ""], + }, +} + + +def allowed_range(threshold: float, direction: str, mode: str) -> tuple[float, float]: + """Acceptable change range (in %).""" + if mode == "aa": + return (-threshold, threshold) + if direction == "GT": + return (-threshold, float("inf")) + return (float("-inf"), threshold) + + 
+def fmt_range(lo: float, hi: float) -> str: + lo_s = "-inf" if lo == float("-inf") else f"{lo}%" + hi_s = "inf" if hi == float("inf") else f"{hi}%" + return f"({lo_s} – {hi_s})" + + +def check_contract(value: float, contract: Any, direction: str) -> tuple[bool, str]: + """Check if value violates a hard contract. Returns (broken, formatted_contract).""" + if contract == "": + return False, "N/A" + contract = float(contract) + if direction == "GT" and value < contract: + return True, f"> {contract}" + if direction == "LT" and value > contract: + return True, f"< {contract}" + return False, str(contract) + + +# ============================================================================= +# Validation +# ============================================================================= + +def validate(diffs: list[dict], mode: str, run_id: str | None) -> tuple[bool, str]: + """ + Check all diffs against thresholds. + Returns (has_failures, markdown_report). + """ + failed_rows: list[str] = [] + + for d in diffs: + cat, metric = d["category"], d["metric"] + if cat not in THRESHOLDS or metric not in THRESHOLDS[cat]: + continue + + pct, direction, contract = THRESHOLDS[cat][metric] + rng = allowed_range(pct, direction, mode) + dev = d["deviation"] + + threshold_failed = dev < rng[0] or dev > rng[1] + contract_broken, contract_fmt = check_contract(d["target"], contract, direction) + + if threshold_failed: + print(f"THRESHOLD FAILED: {cat}/{metric} change={dev:.2f}% allowed={fmt_range(*rng)}") + if contract_broken: + print(f"CONTRACT BROKEN: {cat}/{metric} value={d['target']} required={contract_fmt}") + + if threshold_failed or contract_broken: + outcome = [] + if threshold_failed: + outcome.append("Regression detected") + if contract_broken: + outcome.append("Contract broken") + failed_rows.append( + f"| {cat}/{metric} | {d['baseline']:.4g} | {d['target']:.4g} | " + f"{contract_fmt} | {dev:.2f}% | {fmt_range(*rng)} | {', '.join(outcome)} |" + ) + + if not failed_rows: + return 
False, "" + + logs_link = "" + if run_id: + repo = os.getenv("GITHUB_REPOSITORY", "microsoft/DiskANN") + logs_link = f"https://github.com/{repo}/actions/runs/{run_id}" + + report = "### ❌ Benchmark Check Failed\n\n" + if logs_link: + report += f"Please investigate the [workflow logs]({logs_link}) to determine if the failure is due to your changes.\n\n" + report += "| Metric | Baseline | Current | Contract | Change | Allowed | Outcome |\n" + report += "|--------|----------|---------|----------|--------|---------|--------|\n" + report += "\n".join(failed_rows) + + return True, report + + +# ============================================================================= +# Markdown output +# ============================================================================= + +def diffs_to_markdown(diffs: list[dict], title: str) -> str: + """Render diffs as a Markdown table.""" + lines = [ + f"### {title}", + "", + "| Category | Metric | Baseline | Current | Change |", + "|----------|--------|----------|---------|--------|", + ] + for d in diffs: + lines.append( + f"| {d['category']} | {d['metric']} | {d['baseline']:.4g} | " + f"{d['target']:.4g} | {d['deviation']:+.2f}% |" + ) + return "\n".join(lines) + + +# ============================================================================= +# GitHub helpers (stdlib only — no requests dependency) +# ============================================================================= + +def post_pr_comment(body: str) -> bool: + token = os.getenv("GITHUB_TOKEN") + repo = os.getenv("GITHUB_REPOSITORY") + pr = os.getenv("GITHUB_PR_NUMBER") + if not all([token, repo, pr]): + print("WARNING: Missing GitHub env vars for PR comment " + f"(TOKEN={'set' if token else 'missing'}, REPO={repo or 'missing'}, PR={pr or 'missing'})") + return False + + url = f"https://api.github.com/repos/{repo}/issues/{pr}/comments" + data = json.dumps({"body": body}).encode() + req = Request(url, data=data, method="POST", headers={ + "Accept": 
"application/vnd.github+json", + "Authorization": f"Bearer {token}", + "X-GitHub-Api-Version": "2022-11-28", + "Content-Type": "application/json", + }) + try: + with urlopen(req, timeout=30) as resp: + if resp.status < 300: + print(f"Posted comment to PR #{pr}") + return True + except URLError as e: + print(f"ERROR posting PR comment: {e}") + return False + + +def write_step_summary(content: str) -> None: + path = os.getenv("GITHUB_STEP_SUMMARY") + if path: + with open(path, "a", encoding="utf-8") as f: + f.write(content + "\n") + + +# ============================================================================= +# Main +# ============================================================================= + +def main() -> int: + parser = argparse.ArgumentParser( + description="Compare two benchmark JSONs, validate thresholds, output Markdown." + ) + parser.add_argument("--mode", choices=["aa", "pr"], default="aa", + help="aa = symmetric thresholds, pr = directional") + parser.add_argument("--baseline", required=True, help="Baseline JSON path") + parser.add_argument("--target", required=True, help="Target JSON path") + parser.add_argument("--title", default="Benchmark Results", + help="Title for the Markdown summary table") + parser.add_argument("--no-comment", action="store_true", + help="Skip posting PR comment on failure") + args = parser.parse_args() + + print(f"Mode: {args.mode}") + print(f"Baseline: {args.baseline}") + print(f"Target: {args.target}") + + baseline = load_json(args.baseline) + target = load_json(args.target) + + if len(baseline) != len(target): + print(f"ERROR: JSON arrays differ in length: {len(baseline)} vs {len(target)}") + return 1 + + # Compare + diffs = compute_diff(baseline, target) + print(f"\nCompared {len(diffs)} metrics") + + # Write Markdown summary + md = diffs_to_markdown(diffs, args.title) + write_step_summary(md) + + # Validate thresholds + run_id = os.getenv("GITHUB_RUN_ID") + has_failures, report = validate(diffs, args.mode, run_id) 
+ + if has_failures: + print("\n" + report) + write_step_summary(report) + if args.mode == "pr" and not args.no_comment: + post_pr_comment(report) + return 1 + + print("\n✅ All metrics within thresholds") + write_step_summary("### ✅ Benchmark Check Passed\n\nAll metrics within acceptable thresholds.") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/.github/scripts/compare_disk_index_json_output.py b/.github/scripts/compare_disk_index_json_output.py deleted file mode 100644 index ca9c9d26b..000000000 --- a/.github/scripts/compare_disk_index_json_output.py +++ /dev/null @@ -1,256 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT license. - -""" -Compare two disk-index benchmark JSON files and emit a diff CSV. - -This script takes baseline and branch (target) JSON files from the benchmark crate's -disk-index benchmarks and produces a CSV file comparing the metrics with deviation percentages. - -The output format matches the CSV structure expected by benchmark_result_parse.py: - Parent Span Name, Span Name, Stat Key, Stat Value (Target), Stat Value (Baseline), Deviation (%) - -Usage: - python compare_disk_index_json_output.py \\ - --baseline baseline/target/tmp/_benchmark_crate_baseline.json \\ - --branch diskann_rust/target/tmp/_benchmark_crate_target.json \\ - --out diskann_rust/target/tmp/_change.csv -""" - -import json -import csv -import argparse -from typing import List, Dict, Any, Optional - - -def load_json(path: str) -> List[Dict[str, Any]]: - """Load JSON file and return the parsed content.""" - with open(path, "r", encoding="utf-8") as f: - return json.load(f) - - -def calc_deviation(baseline: float, target: float) -> str: - """Calculate the percentage deviation from baseline to target.""" - try: - if baseline != 0: - dev = ((target - baseline) / baseline) * 100 - return f"{dev:.2f}" - return "" - except Exception: - return "" - - -def 
extract_build_metrics(results: Dict[str, Any]) -> Dict[str, Any]: - """Extract build metrics from the results structure.""" - if not results: - return {} - - build = results.get("build", {}) - if not build: - return {} - - metrics = {} - - # Total build time (in seconds) - build_time = build.get("build_time") - if build_time: - # build_time is in microseconds, convert to seconds - metrics["total_time"] = build_time / 1e6 - - # Extract span metrics - span_metrics = build.get("span_metrics", {}) - spans = span_metrics.get("spans", []) - - for span in spans: - span_name = span.get("span_name", "") - span_data = span.get("metrics", {}) - - if span_name == "DiskIndexBuild-PqConstruction": - metrics["pq_construction_time"] = span_data.get("duration_seconds", 0) - elif span_name == "DiskIndexBuild-InmemIndexBuild": - metrics["inmem_index_build_time"] = span_data.get("duration_seconds", 0) - elif span_name == "DiskIndexBuild-DiskLayout": - metrics["disk_layout_time"] = span_data.get("duration_seconds", 0) - elif span_name == "disk-index-build": - metrics["total_build_duration"] = span_data.get("duration_seconds", 0) - - return metrics - - -def extract_search_metrics(results: Dict[str, Any], search_l: int, beam_width: int) -> Dict[str, Any]: - """Extract search metrics for a specific search_l value.""" - if not results: - return {} - - search = results.get("search", {}) - if not search: - return {} - - metrics = {} - - # Find the search result for the specified search_l - search_results = search.get("search_results_per_l", []) - for sr in search_results: - if sr.get("search_l") == search_l: - metrics["qps"] = sr.get("qps", 0) - metrics["recall"] = sr.get("recall", 0) - metrics["mean_latency"] = sr.get("mean_latency", 0) - metrics["mean_ios"] = sr.get("mean_ios", 0) - metrics["mean_comps"] = sr.get("mean_comparisons", 0) - metrics["mean_hops"] = sr.get("mean_hops", 0) - metrics["mean_io_time"] = sr.get("mean_io_time", 0) - metrics["mean_cpus"] = sr.get("mean_cpu_time", 0) - 
metrics["latency_95"] = sr.get("p999_latency", 0) # Use p999 as proxy for 95th percentile - break - - # Also try span metrics - span_metrics = search.get("span_metrics", {}) - spans = span_metrics.get("spans", []) - - search_span_name = f"search-with-L={search_l}-bw={beam_width}" - for span in spans: - if span.get("span_name") == search_span_name: - span_data = span.get("metrics", {}) - # Override with span metrics if they exist - if "qps" in span_data: - metrics["qps"] = span_data["qps"] - if "recall" in span_data: - metrics["recall"] = span_data["recall"] - if "mean_latency" in span_data: - metrics["mean_latency"] = span_data["mean_latency"] - if "mean_ios" in span_data: - metrics["mean_ios"] = span_data["mean_ios"] - if "mean_comps" in span_data: - metrics["mean_comps"] = span_data["mean_comps"] - if "mean_hops" in span_data: - metrics["mean_hops"] = span_data["mean_hops"] - if "mean_io_time" in span_data: - metrics["mean_io_time"] = span_data["mean_io_time"] - if "mean_cpus" in span_data: - metrics["mean_cpus"] = span_data["mean_cpus"] - break - - return metrics - - -def make_rows(baseline_list: List[Dict], target_list: List[Dict]) -> List[List[str]]: - """Generate comparison rows for the CSV output.""" - rows = [] - - for baseline, target in zip(baseline_list, target_list): - baseline_results = baseline.get("results", {}) - target_results = target.get("results", {}) - - # Get input info for context - inp = target.get("input", {}) - content = inp.get("content", {}) - search_phase = content.get("search_phase", {}) - - # Determine search_l and beam_width for search metrics - search_list = search_phase.get("search_list", [2000]) - beam_width = search_phase.get("beam_width", 4) - - # Use the first (or primary) search_l value - primary_search_l = search_list[0] if search_list else 2000 - - # Extract build metrics - baseline_build = extract_build_metrics(baseline_results) - target_build = extract_build_metrics(target_results) - - # Build metrics rows - build_metrics 
= [ - ("total_time", "total build time (s)"), - ("pq_construction_time", "PQ construction (s)"), - ("inmem_index_build_time", "in-memory index build (s)"), - ("disk_layout_time", "disk layout (s)"), - ] - - for key, display_name in build_metrics: - if key in target_build or key in baseline_build: - target_val = target_build.get(key, 0) - baseline_val = baseline_build.get(key, 0) - rows.append([ - "index-build statistics", - display_name, - key, - str(target_val), - str(baseline_val), - calc_deviation(baseline_val, target_val) - ]) - - # Extract search metrics for the primary search_l - baseline_search = extract_search_metrics(baseline_results, primary_search_l, beam_width) - target_search = extract_search_metrics(target_results, primary_search_l, beam_width) - - search_span_name = f"search-with-L={primary_search_l}-bw={beam_width}" - - # Search metrics rows - search_metrics = [ - ("qps", "queries per second"), - ("recall", "recall (%)"), - ("mean_latency", "mean latency (μs)"), - ("latency_95", "p999 latency (μs)"), - ("mean_ios", "mean IOs"), - ("mean_comps", "mean comparisons"), - ("mean_hops", "mean hops"), - ("mean_io_time", "mean IO time (μs)"), - ("mean_cpus", "mean CPU time (μs)"), - ] - - for key, display_name in search_metrics: - if key in target_search or key in baseline_search: - target_val = target_search.get(key, 0) - baseline_val = baseline_search.get(key, 0) - rows.append([ - search_span_name, - display_name, - key, - str(target_val), - str(baseline_val), - calc_deviation(baseline_val, target_val) - ]) - - return rows - - -def write_csv(rows: List[List[str]], out_path: str): - """Write the comparison rows to a CSV file.""" - header = [ - "Parent Span Name", - "Span Name", - "Stat Key", - "Stat Value (Target)", - "Stat Value (Baseline)", - "Deviation (%)" - ] - with open(out_path, "w", newline="", encoding="utf-8") as f: - writer = csv.writer(f) - writer.writerow(header) - writer.writerows(rows) - - -def main(): - parser = argparse.ArgumentParser( - 
description="Compare two disk-index benchmark JSONs and emit a diff CSV." - ) - parser.add_argument("--baseline", "-b", required=True, help="Path to baseline JSON") - parser.add_argument("--branch", "-r", required=True, help="Path to branch/target JSON") - parser.add_argument("--out", "-o", required=True, help="Where to write output CSV") - args = parser.parse_args() - - baseline_list = load_json(args.baseline) - target_list = load_json(args.branch) - - if len(baseline_list) != len(target_list): - raise ValueError( - f"baseline/branch JSON arrays differ in length: {len(baseline_list)} vs {len(target_list)}" - ) - - rows = make_rows(baseline_list, target_list) - write_csv(rows, args.out) - print(f"✓ Written diff CSV to {args.out}") - - -if __name__ == "__main__": - main() diff --git a/.github/scripts/csv_to_markdown.py b/.github/scripts/csv_to_markdown.py deleted file mode 100644 index 885a20208..000000000 --- a/.github/scripts/csv_to_markdown.py +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT license. 
- -"""Convert a CSV file to a Markdown table and optionally append to GitHub Step Summary.""" - -import argparse -import csv -import os -import sys - - -def csv_to_markdown(csv_path: str) -> str: - """Convert a CSV file to a Markdown table string.""" - with open(csv_path) as f: - rows = list(csv.reader(f)) - if len(rows) < 2: - return "" - header = rows[0] - sep = ["---"] * len(header) - return "\n".join(" | ".join(r) for r in [header, sep] + rows[1:]) - - -def main() -> int: - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("--csv", required=True, help="Input CSV file path") - parser.add_argument("--md", required=True, help="Output Markdown file path") - parser.add_argument("--title", default="", help="Section title for GitHub Step Summary") - args = parser.parse_args() - - md = csv_to_markdown(args.csv) - if not md: - print("No data") - return 0 - - with open(args.md, "w") as f: - f.write(md + "\n") - - # Append to GitHub Step Summary if available - summary_path = os.environ.get("GITHUB_STEP_SUMMARY") - if summary_path and args.title: - with open(summary_path, "a") as f: - f.write(f"### {args.title}\n") - f.write(md + "\n") - - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/.github/workflows/benchmarks-aa.yml b/.github/workflows/benchmarks-aa.yml index c1fd45ad8..d03f123f8 100644 --- a/.github/workflows/benchmarks-aa.yml +++ b/.github/workflows/benchmarks-aa.yml @@ -100,27 +100,13 @@ jobs: run --input-file diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json \ --output-file target/tmp/wikipedia-100K_benchmark_crate_target.json - - name: Generate diff stats (baseline vs target) - run: | - python diskann_rust/.github/scripts/compare_disk_index_json_output.py \ - --baseline baseline/target/tmp/wikipedia-100K_benchmark_crate_baseline.json \ - --branch diskann_rust/target/tmp/wikipedia-100K_benchmark_crate_target.json \ - --out diskann_rust/target/tmp/wikipedia-100K_change.csv - - - name: Convert 
results to Markdown - working-directory: diskann_rust - run: | - python .github/scripts/csv_to_markdown.py \ - --csv target/tmp/wikipedia-100K_change.csv \ - --md target/tmp/wikipedia-100K_change.md \ - --title 'A/A Results: Wikipedia-100K Dataset' - - name: Validate benchmark results - working-directory: diskann_rust run: | - python .github/scripts/benchmark_result_parse.py \ + python diskann_rust/.github/scripts/benchmark_validate.py \ --mode aa \ - --file target/tmp/wikipedia-100K_change.csv + --baseline baseline/target/tmp/wikipedia-100K_benchmark_crate_baseline.json \ + --target diskann_rust/target/tmp/wikipedia-100K_benchmark_crate_target.json \ + --title 'A/A Results: Wikipedia-100K Dataset' env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} GITHUB_REPOSITORY: ${{ github.repository }} @@ -132,8 +118,6 @@ jobs: with: name: aa-results-wikipedia-100K path: | - diskann_rust/target/tmp/wikipedia-100K_change.csv - diskann_rust/target/tmp/wikipedia-100K_change.md diskann_rust/target/tmp/wikipedia-100K_benchmark_crate_target.json baseline/target/tmp/wikipedia-100K_benchmark_crate_baseline.json retention-days: 30 @@ -206,27 +190,13 @@ jobs: run --input-file diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json \ --output-file target/tmp/openai-100K_benchmark_crate_target.json - - name: Generate diff stats (baseline vs target) - run: | - python diskann_rust/.github/scripts/compare_disk_index_json_output.py \ - --baseline baseline/target/tmp/openai-100K_benchmark_crate_baseline.json \ - --branch diskann_rust/target/tmp/openai-100K_benchmark_crate_target.json \ - --out diskann_rust/target/tmp/openai-100K_change.csv - - - name: Convert results to Markdown - working-directory: diskann_rust - run: | - python .github/scripts/csv_to_markdown.py \ - --csv target/tmp/openai-100K_change.csv \ - --md target/tmp/openai-100K_change.md \ - --title 'A/A Results: OpenAI ArXiv 100K Dataset' - - name: Validate benchmark results - working-directory: diskann_rust run: | - python 
.github/scripts/benchmark_result_parse.py \ + python diskann_rust/.github/scripts/benchmark_validate.py \ --mode aa \ - --file target/tmp/openai-100K_change.csv + --baseline baseline/target/tmp/openai-100K_benchmark_crate_baseline.json \ + --target diskann_rust/target/tmp/openai-100K_benchmark_crate_target.json \ + --title 'A/A Results: OpenAI ArXiv 100K Dataset' env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} GITHUB_REPOSITORY: ${{ github.repository }} @@ -238,8 +208,6 @@ jobs: with: name: aa-results-openai-100K path: | - diskann_rust/target/tmp/openai-100K_change.csv - diskann_rust/target/tmp/openai-100K_change.md diskann_rust/target/tmp/openai-100K_benchmark_crate_target.json baseline/target/tmp/openai-100K_benchmark_crate_baseline.json retention-days: 30 diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index 8cdf767a9..a7cc47753 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -22,8 +22,7 @@ on: paths: - 'diskann-benchmark/perf_test_inputs/**-disk-index.json' - '.github/workflows/benchmarks.yml' - - '.github/scripts/compare_disk_index_json_output.py' - - '.github/scripts/benchmark_result_parse.py' + - '.github/scripts/benchmark_validate.py' # Cancel in-progress runs when a new run is triggered concurrency: @@ -88,8 +87,7 @@ jobs: - name: Install system dependencies run: | sudo apt-get update - sudo apt-get install -y openssl libssl-dev pkg-config python3-pip - pip install csvtomd numpy scipy + sudo apt-get install -y openssl libssl-dev pkg-config # Download pre-packaged Wikipedia-100K dataset from GitHub Release # Dataset: 100K Cohere Wikipedia embeddings (768-dim, float32, cosine distance) @@ -119,27 +117,13 @@ jobs: run --input-file diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json \ --output-file target/tmp/wikipedia-100K_benchmark_crate_target.json - - name: Generate diff stats (baseline vs target) - run: | - python 
diskann_rust/.github/scripts/compare_disk_index_json_output.py \ - --baseline baseline/target/tmp/wikipedia-100K_benchmark_crate_baseline.json \ - --branch diskann_rust/target/tmp/wikipedia-100K_benchmark_crate_target.json \ - --out diskann_rust/target/tmp/wikipedia-100K_change.csv - - - name: Convert results to Markdown - working-directory: diskann_rust - run: | - python .github/scripts/csv_to_markdown.py \ - --csv target/tmp/wikipedia-100K_change.csv \ - --md target/tmp/wikipedia-100K_change.md \ - --title 'Benchmark Results: Wikipedia-100K Dataset' - - name: Validate benchmark results - working-directory: diskann_rust run: | - python .github/scripts/benchmark_result_parse.py \ + python diskann_rust/.github/scripts/benchmark_validate.py \ --mode pr \ - --file target/tmp/wikipedia-100K_change.csv + --baseline baseline/target/tmp/wikipedia-100K_benchmark_crate_baseline.json \ + --target diskann_rust/target/tmp/wikipedia-100K_benchmark_crate_target.json \ + --title 'Benchmark Results: Wikipedia-100K Dataset' env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} GITHUB_REPOSITORY: ${{ github.repository }} @@ -152,8 +136,6 @@ jobs: with: name: benchmark-results-wikipedia-100K path: | - diskann_rust/target/tmp/wikipedia-100K_change.csv - diskann_rust/target/tmp/wikipedia-100K_change.md diskann_rust/target/tmp/wikipedia-100K_benchmark_crate_target.json baseline/target/tmp/wikipedia-100K_benchmark_crate_baseline.json retention-days: 30 @@ -199,8 +181,7 @@ jobs: - name: Install system dependencies run: | sudo apt-get update - sudo apt-get install -y openssl libssl-dev pkg-config python3-pip - pip install csvtomd numpy scipy + sudo apt-get install -y openssl libssl-dev pkg-config # Download pre-packaged OpenAI ArXiv 100K dataset from GitHub Release # Dataset: 100K OpenAI embeddings of ArXiv papers (1536-dim, float32, euclidean distance) @@ -228,27 +209,13 @@ jobs: run --input-file diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json \ --output-file 
target/tmp/openai-100K_benchmark_crate_target.json - - name: Generate diff stats (baseline vs target) - run: | - python diskann_rust/.github/scripts/compare_disk_index_json_output.py \ - --baseline baseline/target/tmp/openai-100K_benchmark_crate_baseline.json \ - --branch diskann_rust/target/tmp/openai-100K_benchmark_crate_target.json \ - --out diskann_rust/target/tmp/openai-100K_change.csv - - - name: Convert results to Markdown - working-directory: diskann_rust - run: | - python .github/scripts/csv_to_markdown.py \ - --csv target/tmp/openai-100K_change.csv \ - --md target/tmp/openai-100K_change.md \ - --title 'Benchmark Results: OpenAI ArXiv 100K Dataset' - - name: Validate benchmark results - working-directory: diskann_rust run: | - python .github/scripts/benchmark_result_parse.py \ + python diskann_rust/.github/scripts/benchmark_validate.py \ --mode pr \ - --file target/tmp/openai-100K_change.csv + --baseline baseline/target/tmp/openai-100K_benchmark_crate_baseline.json \ + --target diskann_rust/target/tmp/openai-100K_benchmark_crate_target.json \ + --title 'Benchmark Results: OpenAI ArXiv 100K Dataset' env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} GITHUB_REPOSITORY: ${{ github.repository }} @@ -261,8 +228,6 @@ jobs: with: name: benchmark-results-openai-100K path: | - diskann_rust/target/tmp/openai-100K_change.csv - diskann_rust/target/tmp/openai-100K_change.md diskann_rust/target/tmp/openai-100K_benchmark_crate_target.json baseline/target/tmp/openai-100K_benchmark_crate_baseline.json retention-days: 30 \ No newline at end of file From 9779e3cfe458d2776b506a1f3403e9825486e294 Mon Sep 17 00:00:00 2001 From: "Yuanyuan Tian (from Dev Box)" Date: Tue, 31 Mar 2026 09:57:01 +0800 Subject: [PATCH 25/31] switch benchmark jobs to self-hosted 1ES runner pool (diskann-github) --- .github/workflows/benchmarks-aa.yml | 4 ++-- .github/workflows/benchmarks.yml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/benchmarks-aa.yml 
b/.github/workflows/benchmarks-aa.yml index d03f123f8..efd299564 100644 --- a/.github/workflows/benchmarks-aa.yml +++ b/.github/workflows/benchmarks-aa.yml @@ -35,7 +35,7 @@ jobs: # A/A benchmark: Wikipedia-100K dataset (main vs main) aa-wikipedia-100K: name: A/A - Wikipedia 100K - runs-on: ubuntu-latest + runs-on: [ self-hosted, 1ES.Pool=diskann-github, ubuntu-latest ] timeout-minutes: 120 steps: @@ -125,7 +125,7 @@ jobs: # A/A benchmark: OpenAI ArXiv 100K dataset (main vs main) aa-openai-100K: name: A/A - OAI ArXiv 100K - runs-on: ubuntu-latest + runs-on: [ self-hosted, 1ES.Pool=diskann-github, ubuntu-latest ] timeout-minutes: 120 steps: diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index a7cc47753..9f4cf2fc5 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -46,7 +46,7 @@ jobs: # Macro benchmark: Wikipedia-100K dataset macro-benchmark-wikipedia-100K: name: Macro Benchmark - Wikipedia 100K - runs-on: ubuntu-latest + runs-on: [ self-hosted, 1ES.Pool=diskann-github, ubuntu-latest ] # TODO: For production benchmarks, consider using a self-hosted runner with: # - NVMe storage for consistent I/O performance # - CPU pinning (taskset) for reduced variance @@ -143,7 +143,7 @@ jobs: # Macro benchmark: OpenAI ArXiv dataset macro-benchmark-oai-large: name: Macro Benchmark - OAI ArXiv 100K - runs-on: ubuntu-latest + runs-on: [ self-hosted, 1ES.Pool=diskann-github, ubuntu-latest ] # TODO: For production benchmarks, consider using a self-hosted runner timeout-minutes: 120 From db7e97ac23e55606f6405b5c65f45549d3da7087 Mon Sep 17 00:00:00 2001 From: "Yuanyuan Tian (from Dev Box)" Date: Tue, 31 Mar 2026 10:05:07 +0800 Subject: [PATCH 26/31] replace gh CLI with curl for dataset downloads (gh not available on 1ES runners) --- .github/workflows/benchmarks-aa.yml | 8 ++------ .github/workflows/benchmarks.yml | 8 ++------ 2 files changed, 4 insertions(+), 12 deletions(-) diff --git 
a/.github/workflows/benchmarks-aa.yml b/.github/workflows/benchmarks-aa.yml index efd299564..ed7783233 100644 --- a/.github/workflows/benchmarks-aa.yml +++ b/.github/workflows/benchmarks-aa.yml @@ -78,11 +78,9 @@ jobs: # Download pre-packaged Wikipedia-100K dataset from GitHub Release # Source: https://github.com/harsha-simhadri/big-ann-benchmarks - name: Download wikipedia-100K dataset - env: - GH_TOKEN: ${{ github.token }} run: | mkdir -p diskann_rust/target/tmp baseline/target/tmp - gh release download v1 --repo YuanyuanTian-hh/diskann-benchmark-data --pattern 'wikipedia-100K.tar.gz' --dir . + curl -L -o wikipedia-100K.tar.gz https://github.com/YuanyuanTian-hh/diskann-benchmark-data/releases/download/v1/wikipedia-100K.tar.gz tar xzf wikipedia-100K.tar.gz -C diskann_rust/target/tmp/ cp -r diskann_rust/target/tmp/wikipedia_cohere baseline/target/tmp/ @@ -168,11 +166,9 @@ jobs: # Download pre-packaged OpenAI ArXiv 100K dataset from GitHub Release # Source: https://github.com/harsha-simhadri/big-ann-benchmarks - name: Download openai-100K dataset - env: - GH_TOKEN: ${{ github.token }} run: | mkdir -p diskann_rust/target/tmp baseline/target/tmp - gh release download v1 --repo YuanyuanTian-hh/diskann-benchmark-data --pattern 'openai-100K.tar.gz' --dir . 
+ curl -L -o openai-100K.tar.gz https://github.com/YuanyuanTian-hh/diskann-benchmark-data/releases/download/v1/openai-100K.tar.gz tar xzf openai-100K.tar.gz -C diskann_rust/target/tmp/ cp -r diskann_rust/target/tmp/OpenAIArXiv baseline/target/tmp/ diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index 9f4cf2fc5..d5015df88 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -93,11 +93,9 @@ jobs: # Dataset: 100K Cohere Wikipedia embeddings (768-dim, float32, cosine distance) # Source: https://github.com/harsha-simhadri/big-ann-benchmarks - name: Download wikipedia-100K dataset - env: - GH_TOKEN: ${{ github.token }} run: | mkdir -p diskann_rust/target/tmp baseline/target/tmp - gh release download v1 --repo YuanyuanTian-hh/diskann-benchmark-data --pattern 'wikipedia-100K.tar.gz' --dir . + curl -L -o wikipedia-100K.tar.gz https://github.com/YuanyuanTian-hh/diskann-benchmark-data/releases/download/v1/wikipedia-100K.tar.gz tar xzf wikipedia-100K.tar.gz -C diskann_rust/target/tmp/ cp -r diskann_rust/target/tmp/wikipedia_cohere baseline/target/tmp/ @@ -187,11 +185,9 @@ jobs: # Dataset: 100K OpenAI embeddings of ArXiv papers (1536-dim, float32, euclidean distance) # Source: https://github.com/harsha-simhadri/big-ann-benchmarks - name: Download openai-100K dataset - env: - GH_TOKEN: ${{ github.token }} run: | mkdir -p diskann_rust/target/tmp baseline/target/tmp - gh release download v1 --repo YuanyuanTian-hh/diskann-benchmark-data --pattern 'openai-100K.tar.gz' --dir . 
+ curl -L -o openai-100K.tar.gz https://github.com/YuanyuanTian-hh/diskann-benchmark-data/releases/download/v1/openai-100K.tar.gz tar xzf openai-100K.tar.gz -C diskann_rust/target/tmp/ cp -r diskann_rust/target/tmp/OpenAIArXiv baseline/target/tmp/ From ea91c62f060ce69e3e2aa1413394c5993a2d78ed Mon Sep 17 00:00:00 2001 From: "Yuanyuan Tian (from Dev Box)" Date: Tue, 31 Mar 2026 10:21:47 +0800 Subject: [PATCH 27/31] fix latency_95: read p95_latency instead of p999_latency from benchmark JSON --- .github/scripts/benchmark_validate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/benchmark_validate.py b/.github/scripts/benchmark_validate.py index cb69f6054..ebcb95332 100644 --- a/.github/scripts/benchmark_validate.py +++ b/.github/scripts/benchmark_validate.py @@ -86,7 +86,7 @@ def extract_search_metrics(results: dict, search_l: int, beam_width: int) -> dic metrics["mean_hops"] = sr.get("mean_hops", 0) metrics["mean_io_time"] = sr.get("mean_io_time", 0) metrics["mean_cpus"] = sr.get("mean_cpu_time", 0) - metrics["latency_95"] = sr.get("p999_latency", 0) + metrics["latency_95"] = sr.get("p95_latency", 0) break # Override with span metrics if available From 360cdc70fbbd33d7127ec11093b2ab9b6fcf06ac Mon Sep 17 00:00:00 2001 From: "Yuanyuan Tian (from Dev Box)" Date: Wed, 1 Apr 2026 11:29:20 +0800 Subject: [PATCH 28/31] revert to ubuntu-latest runners, switch dataset source to BAB v0.4.0 --- .github/workflows/benchmarks-aa.yml | 8 ++++---- .github/workflows/benchmarks.yml | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/benchmarks-aa.yml b/.github/workflows/benchmarks-aa.yml index ed7783233..9161400af 100644 --- a/.github/workflows/benchmarks-aa.yml +++ b/.github/workflows/benchmarks-aa.yml @@ -35,7 +35,7 @@ jobs: # A/A benchmark: Wikipedia-100K dataset (main vs main) aa-wikipedia-100K: name: A/A - Wikipedia 100K - runs-on: [ self-hosted, 1ES.Pool=diskann-github, ubuntu-latest ] + runs-on: 
ubuntu-latest timeout-minutes: 120 steps: @@ -80,7 +80,7 @@ jobs: - name: Download wikipedia-100K dataset run: | mkdir -p diskann_rust/target/tmp baseline/target/tmp - curl -L -o wikipedia-100K.tar.gz https://github.com/YuanyuanTian-hh/diskann-benchmark-data/releases/download/v1/wikipedia-100K.tar.gz + curl -L -o wikipedia-100K.tar.gz https://github.com/harsha-simhadri/big-ann-benchmarks/releases/download/v0.4.0/wikipedia-100K.tar.gz tar xzf wikipedia-100K.tar.gz -C diskann_rust/target/tmp/ cp -r diskann_rust/target/tmp/wikipedia_cohere baseline/target/tmp/ @@ -123,7 +123,7 @@ jobs: # A/A benchmark: OpenAI ArXiv 100K dataset (main vs main) aa-openai-100K: name: A/A - OAI ArXiv 100K - runs-on: [ self-hosted, 1ES.Pool=diskann-github, ubuntu-latest ] + runs-on: ubuntu-latest timeout-minutes: 120 steps: @@ -168,7 +168,7 @@ jobs: - name: Download openai-100K dataset run: | mkdir -p diskann_rust/target/tmp baseline/target/tmp - curl -L -o openai-100K.tar.gz https://github.com/YuanyuanTian-hh/diskann-benchmark-data/releases/download/v1/openai-100K.tar.gz + curl -L -o openai-100K.tar.gz https://github.com/harsha-simhadri/big-ann-benchmarks/releases/download/v0.4.0/openai-100K.tar.gz tar xzf openai-100K.tar.gz -C diskann_rust/target/tmp/ cp -r diskann_rust/target/tmp/OpenAIArXiv baseline/target/tmp/ diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index d5015df88..aa832b094 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -46,7 +46,7 @@ jobs: # Macro benchmark: Wikipedia-100K dataset macro-benchmark-wikipedia-100K: name: Macro Benchmark - Wikipedia 100K - runs-on: [ self-hosted, 1ES.Pool=diskann-github, ubuntu-latest ] + runs-on: ubuntu-latest # TODO: For production benchmarks, consider using a self-hosted runner with: # - NVMe storage for consistent I/O performance # - CPU pinning (taskset) for reduced variance @@ -95,7 +95,7 @@ jobs: - name: Download wikipedia-100K dataset run: | mkdir -p 
diskann_rust/target/tmp baseline/target/tmp - curl -L -o wikipedia-100K.tar.gz https://github.com/YuanyuanTian-hh/diskann-benchmark-data/releases/download/v1/wikipedia-100K.tar.gz + curl -L -o wikipedia-100K.tar.gz https://github.com/harsha-simhadri/big-ann-benchmarks/releases/download/v0.4.0/wikipedia-100K.tar.gz tar xzf wikipedia-100K.tar.gz -C diskann_rust/target/tmp/ cp -r diskann_rust/target/tmp/wikipedia_cohere baseline/target/tmp/ @@ -141,7 +141,7 @@ jobs: # Macro benchmark: OpenAI ArXiv dataset macro-benchmark-oai-large: name: Macro Benchmark - OAI ArXiv 100K - runs-on: [ self-hosted, 1ES.Pool=diskann-github, ubuntu-latest ] + runs-on: ubuntu-latest # TODO: For production benchmarks, consider using a self-hosted runner timeout-minutes: 120 @@ -187,7 +187,7 @@ jobs: - name: Download openai-100K dataset run: | mkdir -p diskann_rust/target/tmp baseline/target/tmp - curl -L -o openai-100K.tar.gz https://github.com/YuanyuanTian-hh/diskann-benchmark-data/releases/download/v1/openai-100K.tar.gz + curl -L -o openai-100K.tar.gz https://github.com/harsha-simhadri/big-ann-benchmarks/releases/download/v0.4.0/openai-100K.tar.gz tar xzf openai-100K.tar.gz -C diskann_rust/target/tmp/ cp -r diskann_rust/target/tmp/OpenAIArXiv baseline/target/tmp/ From 9d635be150fa7bd62f07338db8d2db2bf6d91b90 Mon Sep 17 00:00:00 2001 From: "Yuanyuan Tian (from Dev Box)" Date: Thu, 2 Apr 2026 11:17:30 +0800 Subject: [PATCH 29/31] address PR review: reduce search_list to 200, remove hardcoded Rust version, fix missing-field handling, clean up orphaned thresholds, switch data source to BAB v0.4.0 --- .github/scripts/benchmark_validate.py | 93 +++++++------------ .github/workflows/benchmarks-aa.yml | 9 +- .github/workflows/benchmarks.yml | 10 +- .../openai-100K-disk-index.json | 2 +- .../wikipedia-100K-disk-index.json | 2 +- 5 files changed, 46 insertions(+), 70 deletions(-) diff --git a/.github/scripts/benchmark_validate.py b/.github/scripts/benchmark_validate.py index ebcb95332..9081de76c 
100644 --- a/.github/scripts/benchmark_validate.py +++ b/.github/scripts/benchmark_validate.py @@ -59,11 +59,11 @@ def extract_build_metrics(results: dict) -> dict[str, float]: name = span.get("span_name", "") data = span.get("metrics", {}) if name == "DiskIndexBuild-PqConstruction": - metrics["pq_construction_time"] = data.get("duration_seconds", 0) + metrics["pq_construction_time"] = data.get("duration_seconds") elif name == "DiskIndexBuild-InmemIndexBuild": - metrics["inmem_index_build_time"] = data.get("duration_seconds", 0) + metrics["inmem_index_build_time"] = data.get("duration_seconds") elif name == "DiskIndexBuild-DiskLayout": - metrics["disk_layout_time"] = data.get("duration_seconds", 0) + metrics["disk_layout_time"] = data.get("duration_seconds") return metrics @@ -78,15 +78,15 @@ def extract_search_metrics(results: dict, search_l: int, beam_width: int) -> dic # From search_results_per_l for sr in search.get("search_results_per_l", []): if sr.get("search_l") == search_l: - metrics["qps"] = sr.get("qps", 0) - metrics["recall"] = sr.get("recall", 0) - metrics["mean_latency"] = sr.get("mean_latency", 0) - metrics["mean_ios"] = sr.get("mean_ios", 0) - metrics["mean_comps"] = sr.get("mean_comparisons", 0) - metrics["mean_hops"] = sr.get("mean_hops", 0) - metrics["mean_io_time"] = sr.get("mean_io_time", 0) - metrics["mean_cpus"] = sr.get("mean_cpu_time", 0) - metrics["latency_95"] = sr.get("p95_latency", 0) + metrics["qps"] = sr.get("qps") + metrics["recall"] = sr.get("recall") + metrics["mean_latency"] = sr.get("mean_latency") + metrics["mean_ios"] = sr.get("mean_ios") + metrics["mean_comps"] = sr.get("mean_comparisons") + metrics["mean_hops"] = sr.get("mean_hops") + metrics["mean_io_time"] = sr.get("mean_io_time") + metrics["mean_cpus"] = sr.get("mean_cpu_time") + metrics["latency_95"] = sr.get("p95_latency") break # Override with span metrics if available @@ -117,25 +117,26 @@ def compute_diff(baseline_json: list[dict], target_json: list[dict]) -> list[dic 
inp = target.get("input", {}) search_phase = inp.get("content", {}).get("search_phase", {}) - search_list = search_phase.get("search_list", [2000]) + search_list = search_phase.get("search_list", [200]) beam_width = search_phase.get("beam_width", 4) - primary_l = search_list[0] if search_list else 2000 + primary_l = search_list[0] if search_list else 200 # Build metrics b_build = extract_build_metrics(b_results) t_build = extract_build_metrics(t_results) for key in ("total_time", "pq_construction_time", "inmem_index_build_time", "disk_layout_time"): - if key in t_build or key in b_build: - bv = b_build.get(key, 0) - tv = t_build.get(key, 0) - rows.append({ - "category": "index-build statistics", - "metric": key, - "baseline": bv, - "target": tv, - "deviation": ((tv - bv) / bv * 100) if bv else 0, - }) + bv = b_build.get(key) + tv = t_build.get(key) + if bv is None or tv is None: + continue # skip metrics missing from either side + rows.append({ + "category": "index-build statistics", + "metric": key, + "baseline": bv, + "target": tv, + "deviation": ((tv - bv) / bv * 100) if bv else 0, + }) # Search metrics b_search = extract_search_metrics(b_results, primary_l, beam_width) @@ -144,16 +145,17 @@ def compute_diff(baseline_json: list[dict], target_json: list[dict]) -> list[dic for key in ("qps", "recall", "mean_latency", "latency_95", "mean_ios", "mean_comps", "mean_hops", "mean_io_time", "mean_cpus"): - if key in t_search or key in b_search: - bv = b_search.get(key, 0) - tv = t_search.get(key, 0) - rows.append({ - "category": span_cat, - "metric": key, - "baseline": bv, - "target": tv, - "deviation": ((tv - bv) / bv * 100) if bv else 0, - }) + bv = b_search.get(key) + tv = t_search.get(key) + if bv is None or tv is None: + continue # skip metrics missing from either side + rows.append({ + "category": span_cat, + "metric": key, + "baseline": bv, + "target": tv, + "deviation": ((tv - bv) / bv * 100) if bv else 0, + }) return rows @@ -189,29 +191,6 @@ def 
compute_diff(baseline_json: list[dict], target_json: list[dict]) -> list[dic "total_comparisons": [1, "LT", ""], "search_hops": [1, "LT", ""], }, - "search-with-L=2000-bw=4": { - # Calibrated from 5 GitHub runner runs (10 observations) - "latency_95": [10, "LT", ""], - "mean_latency": [10, "LT", ""], - "mean_io_time": [10, "LT", ""], - "mean_cpus": [15, "LT", ""], # wider — CPU time is noisy on shared runners - "qps": [10, "GT", 6.5], - "mean_ios": [1, "LT", 2410], - "mean_comps": [1, "LT", 33200], - "mean_hops": [1, "LT", ""], - "recall": [1, "GT", 98.0], - }, - "search-with-L=100-bw=4": { - "latency_95": [10, "LT", ""], - "mean_latency": [10, "LT", ""], - "mean_io_time": [10, "LT", ""], - "mean_cpus": [15, "LT", ""], - "qps": [10, "GT", ""], - "mean_ios": [10, "LT", ""], - "mean_comps": [10, "LT", ""], - "mean_hops": [10, "LT", ""], - "recall": [1, "GT", ""], - }, "search-with-L=200-bw=4": { "latency_95": [10, "LT", ""], "mean_latency": [10, "LT", ""], diff --git a/.github/workflows/benchmarks-aa.yml b/.github/workflows/benchmarks-aa.yml index 9161400af..c8256bd59 100644 --- a/.github/workflows/benchmarks-aa.yml +++ b/.github/workflows/benchmarks-aa.yml @@ -21,7 +21,6 @@ concurrency: env: RUST_BACKTRACE: 1 - rust_stable: "1.92" defaults: run: @@ -53,10 +52,10 @@ jobs: path: baseline lfs: true - - name: Install Rust ${{ env.rust_stable }} + - name: Install Rust uses: dtolnay/rust-toolchain@master with: - toolchain: ${{ env.rust_stable }} + toolchain: stable - name: Cache Rust dependencies (target) uses: Swatinem/rust-cache@v2 @@ -141,10 +140,10 @@ jobs: path: baseline lfs: true - - name: Install Rust ${{ env.rust_stable }} + - name: Install Rust uses: dtolnay/rust-toolchain@master with: - toolchain: ${{ env.rust_stable }} + toolchain: stable - name: Cache Rust dependencies (target) uses: Swatinem/rust-cache@v2 diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index aa832b094..194267355 100644 --- a/.github/workflows/benchmarks.yml 
+++ b/.github/workflows/benchmarks.yml @@ -31,8 +31,6 @@ concurrency: env: RUST_BACKTRACE: 1 - # Use the Rust version specified in rust-toolchain.toml - rust_stable: "1.92" defaults: run: @@ -67,10 +65,10 @@ jobs: path: baseline lfs: true - - name: Install Rust ${{ env.rust_stable }} + - name: Install Rust uses: dtolnay/rust-toolchain@master with: - toolchain: ${{ env.rust_stable }} + toolchain: stable - name: Cache Rust dependencies (current) uses: Swatinem/rust-cache@v2 @@ -159,10 +157,10 @@ jobs: path: baseline lfs: true - - name: Install Rust ${{ env.rust_stable }} + - name: Install Rust uses: dtolnay/rust-toolchain@master with: - toolchain: ${{ env.rust_stable }} + toolchain: stable - name: Cache Rust dependencies (current) uses: Swatinem/rust-cache@v2 diff --git a/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json b/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json index 3a2a1d9e2..d021640fc 100644 --- a/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json +++ b/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json @@ -24,7 +24,7 @@ "queries": "OpenAIArXiv/openai_query.bin", "groundtruth": "OpenAIArXiv/openai-100K", "search_list": [ - 2000 + 200 ], "beam_width": 4, "recall_at": 100, diff --git a/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json b/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json index 6a52b1e32..e5f06aa1b 100644 --- a/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json +++ b/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json @@ -24,7 +24,7 @@ "queries": "wikipedia_cohere/wikipedia_query.bin", "groundtruth": "wikipedia_cohere/wikipedia-100K", "search_list": [ - 2000 + 200 ], "beam_width": 4, "recall_at": 100, From 4ca80d101b7117afb8046bb4d6a7cc4efe3a1d8d Mon Sep 17 00:00:00 2001 From: "Yuanyuan Tian (from Dev Box)" Date: Thu, 2 Apr 2026 11:30:38 +0800 Subject: [PATCH 30/31] widen latency_95 threshold to 15% for shared-runner noise --- 
.github/scripts/benchmark_validate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/benchmark_validate.py b/.github/scripts/benchmark_validate.py index 9081de76c..9dd5de14b 100644 --- a/.github/scripts/benchmark_validate.py +++ b/.github/scripts/benchmark_validate.py @@ -192,7 +192,7 @@ def compute_diff(baseline_json: list[dict], target_json: list[dict]) -> list[dic "search_hops": [1, "LT", ""], }, "search-with-L=200-bw=4": { - "latency_95": [10, "LT", ""], + "latency_95": [15, "LT", ""], # wider — p95 latency is noisy on shared runners "mean_latency": [10, "LT", ""], "mean_io_time": [10, "LT", ""], "mean_cpus": [15, "LT", ""], From d168932ec0e26ee814f8bbd3d59e4155bff0403c Mon Sep 17 00:00:00 2001 From: "Yuanyuan Tian (from Dev Box)" Date: Thu, 2 Apr 2026 12:12:57 +0800 Subject: [PATCH 31/31] replace push trigger with pull_request trigger targeting main --- .github/workflows/benchmarks.yml | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index 194267355..d75e0efe7 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -16,11 +16,21 @@ on: required: true default: 'main' type: string - push: + pull_request: branches: - - 'user/tianyuanyuan/add-benchmark-pipeline' + - main paths: - - 'diskann-benchmark/perf_test_inputs/**-disk-index.json' + - 'diskann/**' + - 'diskann-disk/**' + - 'diskann-linalg/**' + - 'diskann-providers/**' + - 'diskann-quantization/**' + - 'diskann-vector/**' + - 'diskann-wide/**' + - 'diskann-utils/**' + - 'diskann-platform/**' + - 'diskann-label-filter/**' + - 'diskann-benchmark/**' - '.github/workflows/benchmarks.yml' - '.github/scripts/benchmark_validate.py'