From c6402fb9a28408718a9f6768f6acf574bb6d6f42 Mon Sep 17 00:00:00 2001
From: "Yuanyuan Tian (from Dev Box)" <tianyuanyuan@microsoft.com>
Date: Thu, 19 Mar 2026 16:08:32 +0800
Subject: [PATCH 01/31] Migrate DiskANN benchmark pipeline from ADO to GitHub
 Actions

- Add benchmarks.yml workflow using workflow_dispatch, comparing current
  branch against a configurable baseline ref
- Add compare_disk_index_json_output.py to diff benchmark crate JSON outputs
  into a CSV suitable for benchmark_result_parse.py
- Add benchmark_result_parse.py for validating results and posting PR comments
- Add wikipedia-100K-disk-index.json benchmark config using the public
  Wikipedia-100K dataset from big-ann-benchmarks (100K Cohere embeddings,
  768-dim, cosine distance) to replace internal ADO datasets
---
 .github/scripts/benchmark_result_parse.py     | 507 ++++++++++++++++++
 .../scripts/compare_disk_index_json_output.py | 258 +++++++++
 .github/workflows/benchmarks.yml              | 318 +++++++++++
 .../wikipedia-100K-disk-index.json            |  40 ++
 4 files changed, 1123 insertions(+)
 create mode 100644 .github/scripts/benchmark_result_parse.py
 create mode 100644 .github/scripts/compare_disk_index_json_output.py
 create mode 100644 .github/workflows/benchmarks.yml
 create mode 100644 diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json

diff --git a/.github/scripts/benchmark_result_parse.py b/.github/scripts/benchmark_result_parse.py
new file mode 100644
index 000000000..0308b2990
--- /dev/null
+++ b/.github/scripts/benchmark_result_parse.py
@@ -0,0 +1,507 @@
+#!/usr/bin/env python3
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT license.
+
+"""
+Benchmark Result Parser for GitHub Actions
+
+Parses benchmark CSV results and validates against thresholds.
+Posts comments to GitHub PRs when regressions are detected.
+
+Migrated from ADO: .pipelines/templates/BenchmarkResultParse.py
+
+Usage:
+    python benchmark_result_parse.py --mode pr --file results.csv
+    python benchmark_result_parse.py --mode aa --file results.csv --data search
+
+Environment Variables (for PR comments):
+    GITHUB_TOKEN: GitHub token for API access
+    GITHUB_REPOSITORY: Owner/repo (e.g., "microsoft/DiskANN")
+    GITHUB_PR_NUMBER: Pull request number
+    GITHUB_RUN_ID: Workflow run ID for linking to logs
+"""
+
+import csv
+import os
+import sys
+import argparse
+import json
+from typing import Any
+
+# Optional: requests for posting PR comments
+try:
+    import requests
+    HAS_REQUESTS = True
+except ImportError:
+    HAS_REQUESTS = False
+
+
+# =============================================================================
+# Data Structures
+# =============================================================================
+
+# Template for full benchmark data (build + search)
+DATA_TEMPLATE_FULL = {
+    "DiskIndexBuild-PqConstruction": {
+        "duration_seconds": [],
+        "peak_memory_usage": []
+    },
+    "DiskIndexBuild-InmemIndexBuild": {
+        "duration_seconds": [],
+        "peak_memory_usage": []
+    },
+    "search_disk_index-search_completed": {
+        "duration_seconds": [],
+        "peak_memory_usage": []
+    },
+    "disk_index_perf_test": {
+        "total_duration_seconds": [],
+    },
+    "index-build statistics": {
+        "total_time": [],
+        "total_comparisons": [],
+        "search_hops": []
+    },
+    "search-with-L=2000-bw=4": {
+        "latency_95": [],
+        "mean_latency": [],
+        "mean_io_time": [],
+        "mean_cpus": [],
+        "qps": [],
+        "mean_ios": [],
+        "mean_comps": [],
+        "mean_hops": [],
+        "recall": []
+    }
+}
+
+# Template for search-only benchmark data
+DATA_TEMPLATE_SEARCH = {
+    "search_disk_index-search_completed": {
+        "duration_seconds": [],
+        "peak_memory_usage": []
+    },
+    "disk_index_perf_test": {
+        "total_duration_seconds": [],
+    },
+    "search-with-L=2000-bw=4": {
+        "latency_95": [],
+        "mean_latency": [],
+        "mean_io_time": [],
+        "mean_cpus": [],
+        "qps": [],
+        "mean_ios": [],
+        "mean_comps": [],
+        "mean_hops": [],
+        "recall": []
+    }
+}
+
+# Thresholds for benchmark values
+# Format: [threshold_percentage, direction, contract_value]
+# - threshold_percentage: Maximum allowed deviation percentage
+# - direction: 'GT' = higher is better, 'LT' = lower is better
+# - contract_value: Promised performance value (empty string if none)
+#
+# For 'GT' metrics (like QPS, recall): regression if value decreases beyond threshold
+# For 'LT' metrics (like latency, memory): regression if value increases beyond threshold
+DATA_THRESHOLDS = {
+    "DiskIndexBuild-PqConstruction": {
+        "duration_seconds": [10, 'LT', ""],
+        "peak_memory_usage": [10, 'LT', ""]
+    },
+    "DiskIndexBuild-InmemIndexBuild": {
+        "duration_seconds": [10, 'LT', ""],
+        "peak_memory_usage": [10, 'LT', ""]
+    },
+    "search_disk_index-search_completed": {
+        "duration_seconds": [10, 'LT', ""],
+        "peak_memory_usage": [10, 'LT', 1.42]
+    },
+    "disk_index_perf_test": {
+        "total_duration_seconds": [10, 'LT', ""],
+    },
+    "index-build statistics": {
+        "total_time": [10, 'LT', 1206],
+        "total_comparisons": [1, 'LT', ""],
+        "search_hops": [1, 'LT', ""]
+    },
+    "search-with-L=2000-bw=4": {
+        "latency_95": [10, 'LT', ""],
+        "mean_latency": [10, 'LT', ""],
+        "mean_io_time": [10, 'LT', ""],
+        "mean_cpus": [10, 'LT', ""],
+        "qps": [10, 'GT', 29],
+        "mean_ios": [1, 'LT', 2026],
+        "mean_comps": [1, 'LT', 50000],
+        "mean_hops": [1, 'LT', ""],
+        "recall": [1, 'GT', 95.1]
+    }
+}
+
+
+# =============================================================================
+# CSV Parsing
+# =============================================================================
+
+def parse_csv(file_path: str, data: dict[str, dict[str, list]]) -> dict[str, dict[str, list]]:
+    """
+    Parse a benchmark CSV into `data` (mutated in place and returned).
+    
+    Columns: 0 unused, 1 category (e.g., "search-with-L=2000-bw=4"), 2 metric
+    (e.g., "qps"), 3 current value, 4 baseline value, 5 change percentage.
+    The header row and rows with fewer than 6 columns are skipped. A row with
+    a non-empty column 1 only selects the category; metrics are appended only
+    from following rows whose column 1 is empty. NOTE(review): the CSV written
+    by compare_disk_index_json_output.py has the category in column 0 and a
+    non-empty display name in column 1 on every row -- confirm the layouts agree.
+    """
+    with open(file_path, 'r', encoding='utf-8') as f:
+        reader = csv.reader(f)
+        next(reader)  # Skip header row
+        
+        current_key = None
+        for row in reader:
+            if len(row) < 6:
+                continue
+                
+            # A non-empty column 1 starts a new category; that row itself carries no metric
+            if row[1]:
+                current_key = row[1]
+            elif current_key and current_key in data:
+                metric_name = row[2]
+                if metric_name in data[current_key]:
+                    # Append as raw strings: [current_value, baseline_value, change_percentage]
+                    data[current_key][metric_name].append(row[3])  # current
+                    data[current_key][metric_name].append(row[4])  # baseline
+                    data[current_key][metric_name].append(row[5])  # change %
+    
+    return data
+
+
+def get_data_template(data_type: str) -> dict[str, dict[str, list]]:
+    """Return a deep copy of the search-only template for 'search', otherwise of the full template."""
+    import copy
+    if data_type == 'search':
+        return copy.deepcopy(DATA_TEMPLATE_SEARCH)
+    return copy.deepcopy(DATA_TEMPLATE_FULL)
+
+
+# =============================================================================
+# Threshold Checking
+# =============================================================================
+
+def get_target_change_range(threshold: float, direction: str, mode: str) -> tuple[float, float]:
+    """
+    Calculate acceptable change range based on threshold and direction.
+    
+    Args:
+        threshold: Maximum allowed deviation percentage
+        direction: 'GT' (higher is better); any other value is handled as 'LT' (lower is better)
+        mode: 'aa' (A/A test, symmetric); any other value (e.g. 'pr', 'lkg') is directional
+    
+    Returns:
+        Tuple of (min_allowed, max_allowed) change percentages
+    """
+    if mode == 'aa':
+        # A/A test: symmetric threshold
+        return (-threshold, threshold)
+    else:
+        # Every non-'aa' mode (e.g. 'pr', 'lkg'): directional threshold
+        if direction == 'GT':
+            # Higher is better: allow any improvement, flag regressions
+            return (-threshold, float('inf'))
+        else:
+            # Lower is better: allow any improvement (negative change), flag increases
+            return (float('-inf'), threshold)
+
+
+def format_interval(start: float, end: float) -> str:
+    """Render the range as '(<start>% - <end>%)'; infinite bounds appear as '-inf' / 'inf' with no percent sign."""
+    start_str = '-inf' if start == float('-inf') else f"{start}%"
+    end_str = 'inf' if end == float('inf') else f"{end}%"
+    return f"({start_str} - {end_str})"
+
+
+def is_change_threshold_failed(change: float, target_range: tuple[float, float]) -> bool:
+    """Return True when change lies outside target_range; both bounds themselves are allowed."""
+    return change < target_range[0] or change > target_range[1]
+
+
+def is_promise_broken(current_value: float, target_value: Any, direction: str) -> tuple[bool, str]:
+    """
+    Check if the current value violates a promised contract value.
+    
+    Returns:
+        (False, "N/A") without a contract, (True, "> t" or "< t") when violated, else (False, str(t))
+    """
+    if target_value == "":
+        return False, "N/A"
+    
+    target_value = float(target_value)
+    
+    if direction == 'GT':
+        # Higher is better: current should be >= target
+        if current_value < target_value:
+            return True, f"> {target_value}"
+    else:
+        # Lower is better: current should be <= target
+        if current_value > target_value:
+            return True, f"< {target_value}"
+    
+    return False, str(target_value)
+
+
+def get_outcome_message(threshold_failed: bool, promise_broken: bool) -> str:
+    """Generate human-readable outcome message."""
+    if threshold_failed and promise_broken:
+        return 'Regression detected, Promise broken'
+    elif promise_broken:
+        return 'Promise broken'
+    elif threshold_failed:
+        return 'Regression detected'
+    return 'OK'
+
+
+def check_thresholds(
+    data: dict[str, dict[str, list]],
+    thresholds: dict[str, dict[str, list]],
+    mode: str,
+    run_id: str | None = None
+) -> tuple[bool, str]:
+    """
+    Check every metric in data against thresholds and build a markdown failure report.
+    
+    Returns (has_failures, report). Stops at the first metric that has no values or
+    cannot be parsed and returns (True, <plain-text message>) for that metric alone.
+    """
+    failed_rows = []
+    
+    for category in data:
+        for metric in data[category]:
+            # Skip metrics without thresholds defined
+            if category not in thresholds or metric not in thresholds[category]:
+                print(f"Skipping {category}/{metric} - no threshold defined")
+                continue
+            
+            values = data[category][metric]
+            if not values:
+                print(f"ERROR: {category}/{metric} has no data")
+                return True, f"Missing data for {category}/{metric}"
+            
+            # Parse values: [current, baseline, change%]; an empty change% cell is read as 0.0
+            try:
+                value_current = float(values[0])
+                value_baseline = float(values[1])
+                change = float(values[2]) if values[2] else 0.0
+            except (ValueError, IndexError) as e:
+                print(f"ERROR: Failed to parse {category}/{metric}: {e}")
+                return True, f"Parse error for {category}/{metric}"
+            
+            # Get threshold config
+            threshold_config = thresholds[category][metric]
+            threshold_pct = threshold_config[0]
+            direction = threshold_config[1]
+            contract_value = threshold_config[2]
+            
+            # Check thresholds
+            target_range = get_target_change_range(threshold_pct, direction, mode)
+            threshold_failed = is_change_threshold_failed(change, target_range)
+            promise_broken, target_formatted = is_promise_broken(value_current, contract_value, direction)
+            
+            if threshold_failed:
+                print(f"THRESHOLD FAILED: {category}/{metric} change={change}% allowed={format_interval(*target_range)}")
+            if promise_broken:
+                print(f"PROMISE BROKEN: {category}/{metric} value={value_current} required={target_formatted}")
+            
+            if threshold_failed or promise_broken:
+                outcome = get_outcome_message(threshold_failed, promise_broken)
+                failed_rows.append(
+                    f"| {category}/{metric} | {value_baseline} | {value_current} | "
+                    f"{target_formatted} | {change}% | {format_interval(*target_range)} | {outcome} |"
+                )
+    
+    if failed_rows:
+        # Build failure report
+        logs_link = ""
+        if run_id:
+            repo = os.getenv('GITHUB_REPOSITORY', 'microsoft/DiskANN')
+            logs_link = f"https://github.com/{repo}/actions/runs/{run_id}"
+        
+        report = "### ❌ Benchmark Check Failed\n\n"
+        if logs_link:
+            report += f"Please investigate the [workflow logs]({logs_link}) to determine if the failure is due to your changes.\n\n"
+        
+        report += "| Metric | Baseline | Current | Contract | Change | Allowed | Outcome |\n"
+        report += "|--------|----------|---------|----------|--------|---------|--------|\n"
+        report += "\n".join(failed_rows)
+        
+        return True, report
+    
+    return False, ""
+
+
+# =============================================================================
+# GitHub Integration
+# =============================================================================
+
+def post_github_pr_comment(comment: str) -> bool:
+    """
+    Post a comment to a GitHub pull request.
+    
+    Requires environment variables:
+        GITHUB_TOKEN: Personal access token or GitHub Actions token
+        GITHUB_REPOSITORY: Owner/repo format
+        GITHUB_PR_NUMBER: Pull request number
+    """
+    if not HAS_REQUESTS:
+        print("WARNING: 'requests' module not available, cannot post PR comment")
+        return False
+    
+    token = os.getenv('GITHUB_TOKEN')
+    repo = os.getenv('GITHUB_REPOSITORY')
+    pr_number = os.getenv('GITHUB_PR_NUMBER')
+    
+    if not all([token, repo, pr_number]):
+        print("WARNING: Missing GitHub environment variables for PR comment")
+        print(f"  GITHUB_TOKEN: {'set' if token else 'missing'}")
+        print(f"  GITHUB_REPOSITORY: {repo or 'missing'}")
+        print(f"  GITHUB_PR_NUMBER: {pr_number or 'missing'}")
+        return False
+    
+    url = f"https://api.github.com/repos/{repo}/issues/{pr_number}/comments"
+    headers = {
+        "Accept": "application/vnd.github+json",
+        "Authorization": f"Bearer {token}",
+        "X-GitHub-Api-Version": "2022-11-28"
+    }
+    body = {"body": comment}
+    
+    try:
+        response = requests.post(url, headers=headers, json=body, timeout=30)
+        response.raise_for_status()
+        print(f"Successfully posted comment to PR #{pr_number}")
+        return True
+    except requests.RequestException as e:
+        print(f"ERROR: Failed to post PR comment: {e}")
+        return False
+
+
+def write_github_step_summary(content: str) -> None:
+    """Append content plus a newline to the file named by GITHUB_STEP_SUMMARY; do nothing if it is unset."""
+    summary_file = os.getenv('GITHUB_STEP_SUMMARY')
+    if summary_file:
+        with open(summary_file, 'a', encoding='utf-8') as f:
+            f.write(content)
+            f.write("\n")
+
+
+def write_github_output(name: str, value: str) -> None:
+    """Append a name=value line to the file named by GITHUB_OUTPUT (no-op if unset); value is written verbatim, so it should not contain newlines."""
+    output_file = os.getenv('GITHUB_OUTPUT')
+    if output_file:
+        with open(output_file, 'a', encoding='utf-8') as f:
+            f.write(f"{name}={value}\n")
+
+
+# =============================================================================
+# Main
+# =============================================================================
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description='Parse benchmark results and validate against thresholds.',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+    # Check PR benchmark results
+    python benchmark_result_parse.py --mode pr --file results_change.csv
+    
+    # Check A/A test results (symmetric thresholds)
+    python benchmark_result_parse.py --mode aa --file results_change.csv
+    
+    # Check search-only benchmarks
+    python benchmark_result_parse.py --mode pr --file results_change.csv --data search
+        """
+    )
+    parser.add_argument(
+        '--mode',
+        type=str,
+        default='aa',
+        choices=['aa', 'pr', 'lkg'],
+        help='Benchmark mode: aa=A/A test (symmetric), pr=PR test (directional), lkg=last known good'
+    )
+    parser.add_argument(
+        '--data',
+        type=str,
+        default='both',
+        choices=['both', 'search'],
+        help='Type of benchmark data: both=full benchmark, search=search-only'
+    )
+    parser.add_argument(
+        '--file',
+        type=str,
+        default=None,
+        help='Path to CSV file (overrides FILE_PATH env var)'
+    )
+    parser.add_argument(
+        '--no-comment',
+        action='store_true',
+        help='Skip posting PR comment even in pr mode'
+    )
+    return parser.parse_args()
+
+
+def main() -> int:
+    args = parse_args()
+    
+    # Get file path
+    file_path = args.file or os.getenv('FILE_PATH')
+    if not file_path:
+        print("ERROR: No input file specified. Use --file or set FILE_PATH env var.")
+        return 1
+    
+    if not os.path.exists(file_path):
+        print(f"ERROR: File not found: {file_path}")
+        return 1
+    
+    print(f"Benchmark mode: {args.mode}")
+    print(f"Data type: {args.data}")
+    print(f"Input file: {file_path}")
+    
+    # Parse CSV
+    data_template = get_data_template(args.data)
+    data = parse_csv(file_path, data_template)
+    
+    # Debug output
+    print("\nParsed data:")
+    print(json.dumps({k: {sk: sv for sk, sv in v.items() if sv} for k, v in data.items() if any(v.values())}, indent=2))
+    
+    # Check thresholds
+    run_id = os.getenv('GITHUB_RUN_ID')
+    has_failures, report = check_thresholds(data, DATA_THRESHOLDS, args.mode, run_id)
+    
+    if has_failures:
+        print("\n" + report)
+        
+        # Write to GitHub step summary
+        write_github_step_summary(report)
+        
+        # Post PR comment if in pr mode
+        if args.mode == 'pr' and not args.no_comment:
+            post_github_pr_comment(report)
+        
+        # Set output for downstream steps
+        write_github_output('benchmark_failed', 'true')
+        
+        return 1
+    
+    print("\n✅ All benchmark values passed!")
+    write_github_step_summary("### ✅ Benchmark Check Passed\n\nAll metrics within acceptable thresholds.")
+    write_github_output('benchmark_failed', 'false')
+    
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/.github/scripts/compare_disk_index_json_output.py b/.github/scripts/compare_disk_index_json_output.py
new file mode 100644
index 000000000..e3fa5afce
--- /dev/null
+++ b/.github/scripts/compare_disk_index_json_output.py
@@ -0,0 +1,258 @@
+#!/usr/bin/env python3
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT license.
+
+"""
+Compare two disk-index benchmark JSON files and emit a diff CSV.
+
+This script takes baseline and branch (target) JSON files from the benchmark crate's
+disk-index benchmarks and produces a CSV file comparing the metrics with deviation percentages.
+
+The output format matches the CSV structure expected by benchmark_result_parse.py:
+  Parent Span Name, Span Name, Stat Key, Stat Value (Target), Stat Value (Baseline), Deviation (%)
+
+Migrated from ADO: .pipelines/templates/compare_disk_index_json_output.py
+
+Usage:
+    python compare_disk_index_json_output.py \\
+        --baseline baseline/target/tmp/<dataset>_benchmark_crate_baseline.json \\
+        --branch diskann_rust/target/tmp/<dataset>_benchmark_crate_target.json \\
+        --out diskann_rust/target/tmp/<dataset>_change.csv
+"""
+
+import json
+import csv
+import argparse
+from typing import List, Dict, Any, Optional
+
+
+def load_json(path: str) -> List[Dict[str, Any]]:
+    """Load JSON file and return the parsed content."""
+    with open(path, "r", encoding="utf-8") as f:
+        return json.load(f)
+
+
+def calc_deviation(baseline: float, target: float) -> str:
+    """Return the percent change from baseline to target with two decimals, or an empty string if baseline is 0 or the computation raises."""
+    try:
+        if baseline != 0:
+            dev = ((target - baseline) / baseline) * 100
+            return f"{dev:.2f}"
+        return ""
+    except Exception:
+        return ""
+
+
+def extract_build_metrics(results: Dict[str, Any]) -> Dict[str, Any]:
+    """Extract build metrics from the results structure."""
+    if not results:
+        return {}
+
+    build = results.get("build", {})
+    if not build:
+        return {}
+
+    metrics = {}
+
+    # Total build time (in seconds)
+    build_time = build.get("build_time")
+    if build_time:
+        # build_time is in microseconds, convert to seconds
+        metrics["total_time"] = build_time / 1e6
+
+    # Extract span metrics
+    span_metrics = build.get("span_metrics", {})
+    spans = span_metrics.get("spans", [])
+
+    for span in spans:
+        span_name = span.get("span_name", "")
+        span_data = span.get("metrics", {})
+
+        if span_name == "DiskIndexBuild-PqConstruction":
+            metrics["pq_construction_time"] = span_data.get("duration_seconds", 0)
+        elif span_name == "DiskIndexBuild-InmemIndexBuild":
+            metrics["inmem_index_build_time"] = span_data.get("duration_seconds", 0)
+        elif span_name == "DiskIndexBuild-DiskLayout":
+            metrics["disk_layout_time"] = span_data.get("duration_seconds", 0)
+        elif span_name == "disk-index-build":
+            metrics["total_build_duration"] = span_data.get("duration_seconds", 0)
+
+    return metrics
+
+
+def extract_search_metrics(results: Dict[str, Any], search_l: int, beam_width: int) -> Dict[str, Any]:
+    """Collect metrics for one search_l from search_results_per_l, then override them with the matching span's values."""
+    if not results:
+        return {}
+
+    search = results.get("search", {})
+    if not search:
+        return {}
+
+    metrics = {}
+
+    # Find the search result for the specified search_l
+    search_results = search.get("search_results_per_l", [])
+    for sr in search_results:
+        if sr.get("search_l") == search_l:
+            metrics["qps"] = sr.get("qps", 0)
+            metrics["recall"] = sr.get("recall", 0)
+            metrics["mean_latency"] = sr.get("mean_latency", 0)
+            metrics["mean_ios"] = sr.get("mean_ios", 0)
+            metrics["mean_comps"] = sr.get("mean_comparisons", 0)
+            metrics["mean_hops"] = sr.get("mean_hops", 0)
+            metrics["mean_io_time"] = sr.get("mean_io_time", 0)
+            metrics["mean_cpus"] = sr.get("mean_cpu_time", 0)
+            metrics["latency_95"] = sr.get("p999_latency", 0)  # NOTE(review): p999 latency is stored under the "latency_95" key -- confirm intended
+            break
+
+    # Also try span metrics
+    span_metrics = search.get("span_metrics", {})
+    spans = span_metrics.get("spans", [])
+
+    search_span_name = f"search-with-L={search_l}-bw={beam_width}"
+    for span in spans:
+        if span.get("span_name") == search_span_name:
+            span_data = span.get("metrics", {})
+            # Override with span metrics if they exist (latency_95 is never overridden here)
+            if "qps" in span_data:
+                metrics["qps"] = span_data["qps"]
+            if "recall" in span_data:
+                metrics["recall"] = span_data["recall"]
+            if "mean_latency" in span_data:
+                metrics["mean_latency"] = span_data["mean_latency"]
+            if "mean_ios" in span_data:
+                metrics["mean_ios"] = span_data["mean_ios"]
+            if "mean_comps" in span_data:
+                metrics["mean_comps"] = span_data["mean_comps"]
+            if "mean_hops" in span_data:
+                metrics["mean_hops"] = span_data["mean_hops"]
+            if "mean_io_time" in span_data:
+                metrics["mean_io_time"] = span_data["mean_io_time"]
+            if "mean_cpus" in span_data:
+                metrics["mean_cpus"] = span_data["mean_cpus"]
+            break
+
+    return metrics
+
+
+def make_rows(baseline_list: List[Dict], target_list: List[Dict]) -> List[List[str]]:
+    """Generate comparison rows for the CSV output."""
+    rows = []
+
+    for baseline, target in zip(baseline_list, target_list):
+        baseline_results = baseline.get("results", {})
+        target_results = target.get("results", {})
+
+        # Get input info for context
+        inp = target.get("input", {})
+        content = inp.get("content", {})
+        search_phase = content.get("search_phase", {})
+
+        # Determine search_l and beam_width for search metrics
+        search_list = search_phase.get("search_list", [2000])
+        beam_width = search_phase.get("beam_width", 4)
+
+        # Use the first (or primary) search_l value
+        primary_search_l = search_list[0] if search_list else 2000
+
+        # Extract build metrics
+        baseline_build = extract_build_metrics(baseline_results)
+        target_build = extract_build_metrics(target_results)
+
+        # Build metrics rows
+        build_metrics = [
+            ("total_time", "total build time (s)"),
+            ("pq_construction_time", "PQ construction (s)"),
+            ("inmem_index_build_time", "in-memory index build (s)"),
+            ("disk_layout_time", "disk layout (s)"),
+        ]
+
+        for key, display_name in build_metrics:
+            if key in target_build or key in baseline_build:
+                target_val = target_build.get(key, 0)
+                baseline_val = baseline_build.get(key, 0)
+                rows.append([
+                    "index-build statistics",
+                    display_name,
+                    key,
+                    str(target_val),
+                    str(baseline_val),
+                    calc_deviation(baseline_val, target_val)
+                ])
+
+        # Extract search metrics for the primary search_l
+        baseline_search = extract_search_metrics(baseline_results, primary_search_l, beam_width)
+        target_search = extract_search_metrics(target_results, primary_search_l, beam_width)
+
+        search_span_name = f"search-with-L={primary_search_l}-bw={beam_width}"
+
+        # Search metrics rows
+        search_metrics = [
+            ("qps", "queries per second"),
+            ("recall", "recall (%)"),
+            ("mean_latency", "mean latency (μs)"),
+            ("latency_95", "p999 latency (μs)"),
+            ("mean_ios", "mean IOs"),
+            ("mean_comps", "mean comparisons"),
+            ("mean_hops", "mean hops"),
+            ("mean_io_time", "mean IO time (μs)"),
+            ("mean_cpus", "mean CPU time (μs)"),
+        ]
+
+        for key, display_name in search_metrics:
+            if key in target_search or key in baseline_search:
+                target_val = target_search.get(key, 0)
+                baseline_val = baseline_search.get(key, 0)
+                rows.append([
+                    search_span_name,
+                    display_name,
+                    key,
+                    str(target_val),
+                    str(baseline_val),
+                    calc_deviation(baseline_val, target_val)
+                ])
+
+    return rows
+
+
+def write_csv(rows: List[List[str]], out_path: str):
+    """Write the comparison rows to a CSV file."""
+    header = [
+        "Parent Span Name",
+        "Span Name",
+        "Stat Key",
+        "Stat Value (Target)",
+        "Stat Value (Baseline)",
+        "Deviation (%)"
+    ]
+    with open(out_path, "w", newline="", encoding="utf-8") as f:
+        writer = csv.writer(f)
+        writer.writerow(header)
+        writer.writerows(rows)
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Compare two disk-index benchmark JSONs and emit a diff CSV."
+    )
+    parser.add_argument("--baseline", "-b", required=True, help="Path to baseline JSON")
+    parser.add_argument("--branch", "-r", required=True, help="Path to branch/target JSON")
+    parser.add_argument("--out", "-o", required=True, help="Where to write output CSV")
+    args = parser.parse_args()
+
+    baseline_list = load_json(args.baseline)
+    target_list = load_json(args.branch)
+
+    if len(baseline_list) != len(target_list):
+        raise ValueError(
+            f"baseline/branch JSON arrays differ in length: {len(baseline_list)} vs {len(target_list)}"
+        )
+
+    rows = make_rows(baseline_list, target_list)
+    write_csv(rows, args.out)
+    print(f"✓ Written diff CSV to {args.out}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
new file mode 100644
index 000000000..daf181ac6
--- /dev/null
+++ b/.github/workflows/benchmarks.yml
@@ -0,0 +1,318 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT license.
+
+# DiskANN Benchmarks Workflow
+# Migrated from ADO pipeline: .pipelines/DiskANN-Benchmarks.yml
+#
+# This workflow runs macro benchmarks comparing the current branch against a baseline.
+# It is manually triggered and requires a baseline reference (branch, tag, or commit).
+
+name: Benchmarks
+
+on:
+  workflow_dispatch:
+    inputs:
+      baseline_ref:
+        description: 'A branch, commit SHA, or tag name to compare the current branch with'
+        required: true
+        default: 'main'
+        type: string
+
+# Cancel in-progress runs when a new run is triggered
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}
+  cancel-in-progress: true
+
+env:
+  RUST_BACKTRACE: 1
+  # Pinned Rust toolchain version (hard-coded here; keep in sync with rust-toolchain.toml)
+  rust_stable: "1.92"
+
+defaults:
+  run:
+    shell: bash
+
+permissions:
+  contents: read
+  pull-requests: write  # Required for posting PR comments
+
+jobs:
+  # Macro benchmark: Mimir Enron dataset
+  macro-benchmark-mimir-enron:
+    name: Macro Benchmark - Mimir Enron
+    runs-on: ubuntu-latest
+    # TODO: For production benchmarks, consider using a self-hosted runner with:
+    # - NVMe storage for consistent I/O performance
+    # - CPU pinning (taskset) for reduced variance
+    # - Dedicated hardware to avoid noisy neighbor effects
+    timeout-minutes: 120
+
+    steps:
+      - name: Checkout current branch
+        uses: actions/checkout@v4
+        with:
+          path: diskann_rust
+          lfs: true
+
+      - name: Checkout baseline (${{ inputs.baseline_ref }})
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.baseline_ref }}
+          path: baseline
+          lfs: true
+
+      - name: Install Rust ${{ env.rust_stable }}
+        uses: dtolnay/rust-toolchain@master
+        with:
+          toolchain: ${{ env.rust_stable }}
+
+      - name: Cache Rust dependencies (current)
+        uses: Swatinem/rust-cache@v2
+        with:
+          workspaces: diskann_rust -> target
+          key: benchmark-current
+
+      - name: Cache Rust dependencies (baseline)
+        uses: Swatinem/rust-cache@v2
+        with:
+          workspaces: baseline -> target
+          key: benchmark-baseline
+
+      - name: Install system dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y openssl libssl-dev pkg-config python3-pip
+          pip install csvtomd numpy scipy
+
+      # Download the public Wikipedia-100K dataset via big-ann-benchmarks
+      # Dataset: 100K Cohere Wikipedia embeddings (768-dim, float32, cosine distance)
+      # Source: https://github.com/harsha-simhadri/big-ann-benchmarks
+      - name: Clone big-ann-benchmarks
+        run: git clone --depth 1 https://github.com/harsha-simhadri/big-ann-benchmarks.git
+
+      - name: Download wikipedia-100K dataset
+        working-directory: big-ann-benchmarks
+        run: python create_dataset.py --dataset wikipedia-100K
+
+      - name: Copy dataset to benchmark directories
+        run: |
+          mkdir -p diskann_rust/target/tmp baseline/target/tmp
+          cp -r big-ann-benchmarks/data/wikipedia_cohere diskann_rust/target/tmp/
+          cp -r big-ann-benchmarks/data/wikipedia_cohere baseline/target/tmp/
+
+      - name: Run baseline benchmark
+        working-directory: baseline
+        run: |
+          # Note: For accurate benchmarks, consider using CPU pinning on self-hosted runners:
+          # sudo taskset -c 0,2,4,6 ionice -c 1 -n 0 cargo run ...
+          cargo run -p diskann-benchmark --features disk-index --release -- \
+            run --input-file diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json \
+            --output-file target/tmp/wikipedia-100K_benchmark_crate_baseline.json
+
+      - name: Run current branch benchmark
+        working-directory: diskann_rust
+        run: |
+          cargo run -p diskann-benchmark --features disk-index --release -- \
+            run --input-file diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json \
+            --output-file target/tmp/wikipedia-100K_benchmark_crate_target.json
+
+      - name: Generate diff stats (baseline vs target)
+        continue-on-error: true
+        run: |
+          python diskann_rust/.github/scripts/compare_disk_index_json_output.py \
+            --baseline baseline/target/tmp/wikipedia-100K_benchmark_crate_baseline.json \
+            --branch diskann_rust/target/tmp/wikipedia-100K_benchmark_crate_target.json \
+            --out diskann_rust/target/tmp/wikipedia-100K_change.csv
+
+      - name: Convert results to Markdown
+        working-directory: diskann_rust
+        run: |
+          csvtomd target/tmp/wikipedia-100K_change.csv > target/tmp/wikipedia-100K_change.md
+          echo "### Benchmark Results: Wikipedia-100K Dataset" >> $GITHUB_STEP_SUMMARY
+          cat target/tmp/wikipedia-100K_change.md >> $GITHUB_STEP_SUMMARY
+
+      - name: Validate benchmark results
+        working-directory: diskann_rust
+        run: |
+          python .github/scripts/benchmark_result_parse.py \
+            --mode pr \
+            --file target/tmp/wikipedia-100K_change.csv
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          GITHUB_REPOSITORY: ${{ github.repository }}
+          GITHUB_PR_NUMBER: ${{ github.event.pull_request.number }}
+          GITHUB_RUN_ID: ${{ github.run_id }}
+
+      - name: Upload benchmark results
+        uses: actions/upload-artifact@v4
+        if: always()  # Upload even if validation fails
+        with:
+          name: benchmark-results-wikipedia-100K
+          path: |
+            diskann_rust/target/tmp/wikipedia-100K_change.csv
+            diskann_rust/target/tmp/wikipedia-100K_change.md
+            diskann_rust/target/tmp/wikipedia-100K_benchmark_crate_target.json
+            baseline/target/tmp/wikipedia-100K_benchmark_crate_baseline.json
+          retention-days: 30
+
+  # Macro benchmark: OAI Large dataset
+  macro-benchmark-oai-large:
+    name: Macro Benchmark - OAI Large
+    runs-on: ubuntu-latest
+    # TODO: For production benchmarks, consider using a self-hosted runner
+    timeout-minutes: 120
+
+    steps:
+      - name: Checkout current branch
+        uses: actions/checkout@v4
+        with:
+          path: diskann_rust
+          lfs: true
+
+      - name: Checkout baseline (${{ inputs.baseline_ref }})
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.baseline_ref }}
+          path: baseline
+          lfs: true
+
+      - name: Install Rust ${{ env.rust_stable }}
+        uses: dtolnay/rust-toolchain@master
+        with:
+          toolchain: ${{ env.rust_stable }}
+
+      - name: Cache Rust dependencies (current)
+        uses: Swatinem/rust-cache@v2
+        with:
+          workspaces: diskann_rust -> target
+          key: benchmark-current
+
+      - name: Cache Rust dependencies (baseline)
+        uses: Swatinem/rust-cache@v2
+        with:
+          workspaces: baseline -> target
+          key: benchmark-baseline
+
+      - name: Install system dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y openssl libssl-dev pkg-config python3-pip
+          pip install csvtomd numpy scipy
+
+      # Download the public Wikipedia-100K dataset via big-ann-benchmarks
+      # Dataset: 100K Cohere Wikipedia embeddings (768-dim, float32, cosine distance)
+      # Source: https://github.com/harsha-simhadri/big-ann-benchmarks
+      - name: Clone big-ann-benchmarks
+        run: git clone --depth 1 https://github.com/harsha-simhadri/big-ann-benchmarks.git
+
+      - name: Download wikipedia-100K dataset
+        working-directory: big-ann-benchmarks
+        run: python create_dataset.py --dataset wikipedia-100K
+
+      - name: Copy dataset to benchmark directories
+        run: |
+          mkdir -p diskann_rust/target/tmp baseline/target/tmp
+          cp -r big-ann-benchmarks/data/wikipedia_cohere diskann_rust/target/tmp/
+          cp -r big-ann-benchmarks/data/wikipedia_cohere baseline/target/tmp/
+
+      - name: Run baseline benchmark
+        working-directory: baseline
+        run: |
+          cargo run -p diskann-benchmark --features disk-index --release -- \
+            run --input-file diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json \
+            --output-file target/tmp/wikipedia-100K_benchmark_crate_baseline.json
+
+      - name: Run current branch benchmark
+        working-directory: diskann_rust
+        run: |
+          cargo run -p diskann-benchmark --features disk-index --release -- \
+            run --input-file diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json \
+            --output-file target/tmp/wikipedia-100K_benchmark_crate_target.json
+
+      - name: Generate diff stats (baseline vs target)
+        continue-on-error: true
+        run: |
+          python diskann_rust/.github/scripts/compare_disk_index_json_output.py \
+            --baseline baseline/target/tmp/wikipedia-100K_benchmark_crate_baseline.json \
+            --branch diskann_rust/target/tmp/wikipedia-100K_benchmark_crate_target.json \
+            --out diskann_rust/target/tmp/wikipedia-100K_change.csv
+
+      - name: Convert results to Markdown
+        working-directory: diskann_rust
+        run: |
+          csvtomd target/tmp/wikipedia-100K_change.csv > target/tmp/wikipedia-100K_change.md
+          echo "### Benchmark Results: Wikipedia-100K Dataset" >> $GITHUB_STEP_SUMMARY
+          cat target/tmp/wikipedia-100K_change.md >> $GITHUB_STEP_SUMMARY
+
+      - name: Validate benchmark results
+        working-directory: diskann_rust
+        run: |
+          python .github/scripts/benchmark_result_parse.py \
+            --mode pr \
+            --file target/tmp/wikipedia-100K_change.csv
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          GITHUB_REPOSITORY: ${{ github.repository }}
+          GITHUB_PR_NUMBER: ${{ github.event.pull_request.number }}
+          GITHUB_RUN_ID: ${{ github.run_id }}
+
+      - name: Upload benchmark results
+        uses: actions/upload-artifact@v4
+        if: always()  # Upload even if validation fails
+        with:
+          name: benchmark-results-oai-wikipedia-100K
+          path: |
+            diskann_rust/target/tmp/wikipedia-100K_change.csv
+            diskann_rust/target/tmp/wikipedia-100K_change.md
+            diskann_rust/target/tmp/wikipedia-100K_benchmark_crate_target.json
+            baseline/target/tmp/wikipedia-100K_benchmark_crate_baseline.json
+          retention-days: 30
+
+  # NOTE: IAI micro-benchmarks are temporarily disabled in the ADO pipeline
+  # due to callgrind not running with Rust version `ms-1.86.0`.
+  # Uncomment when ready to enable:
+  #
+  # micro-benchmark-iai:
+  #   name: Micro Benchmark - IAI
+  #   runs-on: ubuntu-latest
+  #   timeout-minutes: 120
+  #
+  #   steps:
+  #     - name: Checkout current branch
+  #       uses: actions/checkout@v4
+  #       with:
+  #         path: diskann_rust
+  #
+  #     - name: Checkout baseline (${{ inputs.baseline_ref }})
+  #       uses: actions/checkout@v4
+  #       with:
+  #         ref: ${{ inputs.baseline_ref }}
+  #         path: baseline
+  #
+  #     - name: Install Rust ${{ env.rust_stable }}
+  #       uses: dtolnay/rust-toolchain@master
+  #       with:
+  #         toolchain: ${{ env.rust_stable }}
+  #
+  #     - name: Install valgrind and iai-callgrind-runner
+  #       run: |
+  #         sudo apt-get update
+  #         sudo apt-get install -y valgrind
+  #         cargo install --version 0.14.0 iai-callgrind-runner
+  #
+  #     - name: Run baseline IAI benchmarks
+  #       working-directory: baseline
+  #       run: |
+  #         cargo bench --bench bench_main_iai
+  #         cargo bench --bench bench_main_vector_iai
+  #
+  #     - name: Copy IAI baseline files
+  #       run: |
+  #         mkdir -p diskann_rust/target
+  #         cp -R baseline/target/iai diskann_rust/target/
+  #
+  #     - name: Run current branch IAI benchmarks
+  #       working-directory: diskann_rust
+  #       run: |
+  #         cargo bench --bench bench_main_iai
+  #         cargo bench --bench bench_main_vector_iai
diff --git a/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json b/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json
new file mode 100644
index 000000000..1557a594a
--- /dev/null
+++ b/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json
@@ -0,0 +1,40 @@
+{
+    "search_directories": [
+        "target/tmp"
+    ],
+    "jobs": [
+        {
+            "type": "disk-index",
+            "content": {
+                "source": {
+                    "disk-index-source": "Build",
+                    "data_type": "float32",
+                    "data": "wikipedia_cohere/wikipedia_base.bin.crop_nb_100000",
+                    "distance": "cosine_normalized",
+                    "dim": 768,
+                    "max_degree": 32,
+                    "l_build": 50,
+                    "num_threads": 4,
+                    "build_ram_limit_gb": 4.0,
+                    "num_pq_chunks": 96,
+                    "quantization_type": "FP",
+                    "save_path": "wikipedia_100k_benchmark_index"
+                },
+                "search_phase": {
+                    "queries": "wikipedia_cohere/wikipedia_query.bin",
+                    "groundtruth": "wikipedia_cohere/wikipedia-100K",
+                    "search_list": [
+                        100,
+                        200
+                    ],
+                    "beam_width": 4,
+                    "recall_at": 100,
+                    "num_threads": 4,
+                    "is_flat_search": false,
+                    "distance": "cosine_normalized",
+                    "vector_filters_file": null
+                }
+            }
+        }
+    ]
+}

From 221735309934177a7979590aee5b24b238fb0c3c Mon Sep 17 00:00:00 2001
From: "Yuanyuan Tian (from Dev Box)" <tianyuanyuan@microsoft.com>
Date: Fri, 20 Mar 2026 13:37:35 +0800
Subject: [PATCH 02/31] Fix openai-100K distance metric and add gitignore
 patterns

---
 .github/scripts/benchmark_result_parse.py     | 115 +++++++++++++-----
 .../scripts/compare_disk_index_json_output.py |   2 -
 .github/workflows/benchmarks.yml              |  57 +++++----
 .../openai-100K-disk-index.json               |  40 ++++++
 4 files changed, 155 insertions(+), 59 deletions(-)
 create mode 100644 diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json

diff --git a/.github/scripts/benchmark_result_parse.py b/.github/scripts/benchmark_result_parse.py
index 0308b2990..0bbede446 100644
--- a/.github/scripts/benchmark_result_parse.py
+++ b/.github/scripts/benchmark_result_parse.py
@@ -8,8 +8,6 @@
 Parses benchmark CSV results and validates against thresholds.
 Posts comments to GitHub PRs when regressions are detected.
 
-Migrated from ADO: .pipelines/templates/BenchmarkResultParse.py
-
 Usage:
     python benchmark_result_parse.py --mode pr --file results.csv
     python benchmark_result_parse.py --mode aa --file results.csv --data search
@@ -72,10 +70,30 @@
         "mean_comps": [],
         "mean_hops": [],
         "recall": []
+    },
+    "search-with-L=100-bw=4": {
+        "latency_95": [],
+        "mean_latency": [],
+        "mean_io_time": [],
+        "mean_cpus": [],
+        "qps": [],
+        "mean_ios": [],
+        "mean_comps": [],
+        "mean_hops": [],
+        "recall": []
+    },
+    "search-with-L=200-bw=4": {
+        "latency_95": [],
+        "mean_latency": [],
+        "mean_io_time": [],
+        "mean_cpus": [],
+        "qps": [],
+        "mean_ios": [],
+        "mean_comps": [],
+        "mean_hops": [],
+        "recall": []
     }
 }
-
-# Template for search-only benchmark data
 DATA_TEMPLATE_SEARCH = {
     "search_disk_index-search_completed": {
         "duration_seconds": [],
@@ -94,6 +112,28 @@
         "mean_comps": [],
         "mean_hops": [],
         "recall": []
+    },
+    "search-with-L=100-bw=4": {
+        "latency_95": [],
+        "mean_latency": [],
+        "mean_io_time": [],
+        "mean_cpus": [],
+        "qps": [],
+        "mean_ios": [],
+        "mean_comps": [],
+        "mean_hops": [],
+        "recall": []
+    },
+    "search-with-L=200-bw=4": {
+        "latency_95": [],
+        "mean_latency": [],
+        "mean_io_time": [],
+        "mean_cpus": [],
+        "qps": [],
+        "mean_ios": [],
+        "mean_comps": [],
+        "mean_hops": [],
+        "recall": []
     }
 }
 
@@ -136,6 +176,28 @@
         "mean_comps": [1, 'LT', 50000],
         "mean_hops": [1, 'LT', ""],
         "recall": [1, 'GT', 95.1]
+    },
+    "search-with-L=100-bw=4": {
+        "latency_95": [10, 'LT', ""],
+        "mean_latency": [10, 'LT', ""],
+        "mean_io_time": [10, 'LT', ""],
+        "mean_cpus": [10, 'LT', ""],
+        "qps": [10, 'GT', ""],
+        "mean_ios": [10, 'LT', ""],
+        "mean_comps": [10, 'LT', ""],
+        "mean_hops": [10, 'LT', ""],
+        "recall": [1, 'GT', ""]
+    },
+    "search-with-L=200-bw=4": {
+        "latency_95": [10, 'LT', ""],
+        "mean_latency": [10, 'LT', ""],
+        "mean_io_time": [10, 'LT', ""],
+        "mean_cpus": [10, 'LT', ""],
+        "qps": [10, 'GT', ""],
+        "mean_ios": [10, 'LT', ""],
+        "mean_comps": [10, 'LT', ""],
+        "mean_hops": [10, 'LT', ""],
+        "recall": [1, 'GT', ""]
     }
 }
 
@@ -147,35 +209,32 @@
 def parse_csv(file_path: str, data: dict[str, dict[str, list]]) -> dict[str, dict[str, list]]:
     """
     Parse benchmark CSV file and populate data structure.
-    
-    CSV format expected:
-        Column 0: (unused)
-        Column 1: Category name (e.g., "search-with-L=2000-bw=4")
-        Column 2: Metric name (e.g., "qps")
-        Column 3: Current value
-        Column 4: Baseline value
-        Column 5: Change percentage
+
+    CSV format produced by compare_disk_index_json_output.py:
+        Column 0: Parent Span Name  (category, e.g. "index-build statistics")
+        Column 1: Span Name         (display name, unused for matching)
+        Column 2: Stat Key          (metric key, e.g. "qps")
+        Column 3: Stat Value (Target)
+        Column 4: Stat Value (Baseline)
+        Column 5: Deviation (%)
     """
     with open(file_path, 'r', encoding='utf-8') as f:
         reader = csv.reader(f)
         next(reader)  # Skip header row
-        
-        current_key = None
+
         for row in reader:
             if len(row) < 6:
                 continue
-                
-            # Column 1 contains category name (only set on first row of category)
-            if row[1]:
-                current_key = row[1]
-            elif current_key and current_key in data:
-                metric_name = row[2]
-                if metric_name in data[current_key]:
-                    # Append: [current_value, baseline_value, change_percentage]
-                    data[current_key][metric_name].append(row[3])  # current
-                    data[current_key][metric_name].append(row[4])  # baseline
-                    data[current_key][metric_name].append(row[5])  # change %
-    
+
+            category = row[0].strip()
+            metric_name = row[2].strip()
+
+            if category in data and metric_name in data[category]:
+                # Append: [current_value, baseline_value, change_percentage]
+                data[category][metric_name].append(row[3])  # target (current)
+                data[category][metric_name].append(row[4])  # baseline
+                data[category][metric_name].append(row[5])  # deviation %
+
     return data
 
 
@@ -286,8 +345,8 @@ def check_thresholds(
             
             values = data[category][metric]
             if not values:
-                print(f"ERROR: {category}/{metric} has no data")
-                return True, f"Missing data for {category}/{metric}"
+                # No data for this metric in the CSV — skip silently
+                continue
             
             # Parse values: [current, baseline, change%]
             try:
diff --git a/.github/scripts/compare_disk_index_json_output.py b/.github/scripts/compare_disk_index_json_output.py
index e3fa5afce..ca9c9d26b 100644
--- a/.github/scripts/compare_disk_index_json_output.py
+++ b/.github/scripts/compare_disk_index_json_output.py
@@ -11,8 +11,6 @@
 The output format matches the CSV structure expected by benchmark_result_parse.py:
   Parent Span Name, Span Name, Stat Key, Stat Value (Target), Stat Value (Baseline), Deviation (%)
 
-Migrated from ADO: .pipelines/templates/compare_disk_index_json_output.py
-
 Usage:
     python compare_disk_index_json_output.py \\
         --baseline baseline/target/tmp/<dataset>_benchmark_crate_baseline.json \\
diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
index daf181ac6..875c00834 100644
--- a/.github/workflows/benchmarks.yml
+++ b/.github/workflows/benchmarks.yml
@@ -2,7 +2,6 @@
 # Licensed under the MIT license.
 
 # DiskANN Benchmarks Workflow
-# Migrated from ADO pipeline: .pipelines/DiskANN-Benchmarks.yml
 #
 # This workflow runs macro benchmarks comparing the current branch against a baseline.
 # It is manually triggered and requires a baseline reference (branch, tag, or commit).
@@ -37,9 +36,9 @@ permissions:
   pull-requests: write  # Required for posting PR comments
 
 jobs:
-  # Macro benchmark: Mimir Enron dataset
-  macro-benchmark-mimir-enron:
-    name: Macro Benchmark - Mimir Enron
+  # Macro benchmark: Wikipedia-100K dataset
+  macro-benchmark-wikipedia-100K:
+    name: Macro Benchmark - Wikipedia 100K
     runs-on: ubuntu-latest
     # TODO: For production benchmarks, consider using a self-hosted runner with:
     # - NVMe storage for consistent I/O performance
@@ -155,9 +154,9 @@ jobs:
             baseline/target/tmp/wikipedia-100K_benchmark_crate_baseline.json
           retention-days: 30
 
-  # Macro benchmark: OAI Large dataset
+  # Macro benchmark: OpenAI ArXiv dataset
   macro-benchmark-oai-large:
-    name: Macro Benchmark - OAI Large
+    name: Macro Benchmark - OAI ArXiv 100K
     runs-on: ubuntu-latest
     # TODO: For production benchmarks, consider using a self-hosted runner
     timeout-minutes: 120
@@ -199,57 +198,57 @@ jobs:
           sudo apt-get install -y openssl libssl-dev pkg-config python3-pip
           pip install csvtomd numpy scipy
 
-      # Download the public Wikipedia-100K dataset via big-ann-benchmarks
-      # Dataset: 100K Cohere Wikipedia embeddings (768-dim, float32, cosine distance)
+      # Download the public OpenAI ArXiv 100K dataset via big-ann-benchmarks
+      # Dataset: 100K OpenAI embeddings of ArXiv papers (1536-dim, float32, euclidean distance)
       # Source: https://github.com/harsha-simhadri/big-ann-benchmarks
       - name: Clone big-ann-benchmarks
         run: git clone --depth 1 https://github.com/harsha-simhadri/big-ann-benchmarks.git
 
-      - name: Download wikipedia-100K dataset
+      - name: Download openai-100K dataset
         working-directory: big-ann-benchmarks
-        run: python create_dataset.py --dataset wikipedia-100K
+        run: python create_dataset.py --dataset openai-100K
 
       - name: Copy dataset to benchmark directories
         run: |
           mkdir -p diskann_rust/target/tmp baseline/target/tmp
-          cp -r big-ann-benchmarks/data/wikipedia_cohere diskann_rust/target/tmp/
-          cp -r big-ann-benchmarks/data/wikipedia_cohere baseline/target/tmp/
+          cp -r big-ann-benchmarks/data/OpenAIArXiv diskann_rust/target/tmp/
+          cp -r big-ann-benchmarks/data/OpenAIArXiv baseline/target/tmp/
 
       - name: Run baseline benchmark
         working-directory: baseline
         run: |
           cargo run -p diskann-benchmark --features disk-index --release -- \
-            run --input-file diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json \
-            --output-file target/tmp/wikipedia-100K_benchmark_crate_baseline.json
+            run --input-file diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json \
+            --output-file target/tmp/openai-100K_benchmark_crate_baseline.json
 
       - name: Run current branch benchmark
         working-directory: diskann_rust
         run: |
           cargo run -p diskann-benchmark --features disk-index --release -- \
-            run --input-file diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json \
-            --output-file target/tmp/wikipedia-100K_benchmark_crate_target.json
+            run --input-file diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json \
+            --output-file target/tmp/openai-100K_benchmark_crate_target.json
 
       - name: Generate diff stats (baseline vs target)
         continue-on-error: true
         run: |
           python diskann_rust/.github/scripts/compare_disk_index_json_output.py \
-            --baseline baseline/target/tmp/wikipedia-100K_benchmark_crate_baseline.json \
-            --branch diskann_rust/target/tmp/wikipedia-100K_benchmark_crate_target.json \
-            --out diskann_rust/target/tmp/wikipedia-100K_change.csv
+            --baseline baseline/target/tmp/openai-100K_benchmark_crate_baseline.json \
+            --branch diskann_rust/target/tmp/openai-100K_benchmark_crate_target.json \
+            --out diskann_rust/target/tmp/openai-100K_change.csv
 
       - name: Convert results to Markdown
         working-directory: diskann_rust
         run: |
-          csvtomd target/tmp/wikipedia-100K_change.csv > target/tmp/wikipedia-100K_change.md
-          echo "### Benchmark Results: Wikipedia-100K Dataset" >> $GITHUB_STEP_SUMMARY
-          cat target/tmp/wikipedia-100K_change.md >> $GITHUB_STEP_SUMMARY
+          csvtomd target/tmp/openai-100K_change.csv > target/tmp/openai-100K_change.md
+          echo "### Benchmark Results: OpenAI ArXiv 100K Dataset" >> $GITHUB_STEP_SUMMARY
+          cat target/tmp/openai-100K_change.md >> $GITHUB_STEP_SUMMARY
 
       - name: Validate benchmark results
         working-directory: diskann_rust
         run: |
           python .github/scripts/benchmark_result_parse.py \
             --mode pr \
-            --file target/tmp/wikipedia-100K_change.csv
+            --file target/tmp/openai-100K_change.csv
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
           GITHUB_REPOSITORY: ${{ github.repository }}
@@ -260,15 +259,15 @@ jobs:
         uses: actions/upload-artifact@v4
         if: always()  # Upload even if validation fails
         with:
-          name: benchmark-results-oai-wikipedia-100K
+          name: benchmark-results-openai-100K
           path: |
-            diskann_rust/target/tmp/wikipedia-100K_change.csv
-            diskann_rust/target/tmp/wikipedia-100K_change.md
-            diskann_rust/target/tmp/wikipedia-100K_benchmark_crate_target.json
-            baseline/target/tmp/wikipedia-100K_benchmark_crate_baseline.json
+            diskann_rust/target/tmp/openai-100K_change.csv
+            diskann_rust/target/tmp/openai-100K_change.md
+            diskann_rust/target/tmp/openai-100K_benchmark_crate_target.json
+            baseline/target/tmp/openai-100K_benchmark_crate_baseline.json
           retention-days: 30
 
-  # NOTE: IAI micro-benchmarks are temporarily disabled in the ADO pipeline
+  # NOTE: IAI micro-benchmarks are temporarily disabled
   # due to callgrind not running with Rust version `ms-1.86.0`.
   # Uncomment when ready to enable:
   #
diff --git a/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json b/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json
new file mode 100644
index 000000000..969724cae
--- /dev/null
+++ b/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json
@@ -0,0 +1,40 @@
+{
+    "search_directories": [
+        "target/tmp"
+    ],
+    "jobs": [
+        {
+            "type": "disk-index",
+            "content": {
+                "source": {
+                    "disk-index-source": "Build",
+                    "data_type": "float32",
+                    "data": "OpenAIArXiv/openai_base.bin.crop_nb_100000",
+                    "distance": "squared_l2",
+                    "dim": 1536,
+                    "max_degree": 32,
+                    "l_build": 50,
+                    "num_threads": 8,
+                    "build_ram_limit_gb": 4.0,
+                    "num_pq_chunks": 192,
+                    "quantization_type": "FP",
+                    "save_path": "openai_100k_benchmark_index"
+                },
+                "search_phase": {
+                    "queries": "OpenAIArXiv/openai_query.bin",
+                    "groundtruth": "OpenAIArXiv/openai-100K",
+                    "search_list": [
+                        100,
+                        200
+                    ],
+                    "beam_width": 4,
+                    "recall_at": 100,
+                    "num_threads": 4,
+                    "is_flat_search": false,
+                    "distance": "squared_l2",
+                    "vector_filters_file": null
+                }
+            }
+        }
+    ]
+}

From 6b7b250e579999ce48a2db4aaebc13f3f1e7b7ec Mon Sep 17 00:00:00 2001
From: "Yuanyuan Tian (from Dev Box)" <tianyuanyuan@microsoft.com>
Date: Fri, 20 Mar 2026 13:47:49 +0800
Subject: [PATCH 03/31] Add push trigger to benchmarks workflow for pre-merge
 testing

---
 .github/workflows/benchmarks.yml | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
index 875c00834..337560116 100644
--- a/.github/workflows/benchmarks.yml
+++ b/.github/workflows/benchmarks.yml
@@ -16,6 +16,14 @@ on:
         required: true
         default: 'main'
         type: string
+  push:
+    branches:
+      - 'user/tianyuanyuan/add-benchmark-pipeline'
+    paths:
+      - 'diskann-benchmark/perf_test_inputs/**-disk-index.json'
+      - '.github/workflows/benchmarks.yml'
+      - '.github/scripts/compare_disk_index_json_output.py'
+      - '.github/scripts/benchmark_result_parse.py'
 
 # Cancel in-progress runs when a new run is triggered
 concurrency:
@@ -53,10 +61,10 @@ jobs:
           path: diskann_rust
           lfs: true
 
-      - name: Checkout baseline (${{ inputs.baseline_ref }})
+      - name: Checkout baseline (${{ inputs.baseline_ref || 'main' }})
         uses: actions/checkout@v4
         with:
-          ref: ${{ inputs.baseline_ref }}
+          ref: ${{ inputs.baseline_ref || 'main' }}
           path: baseline
           lfs: true
 
@@ -168,10 +176,10 @@ jobs:
           path: diskann_rust
           lfs: true
 
-      - name: Checkout baseline (${{ inputs.baseline_ref }})
+      - name: Checkout baseline (${{ inputs.baseline_ref || 'main' }})
         uses: actions/checkout@v4
         with:
-          ref: ${{ inputs.baseline_ref }}
+          ref: ${{ inputs.baseline_ref || 'main' }}
           path: baseline
           lfs: true
 

From d4a6abd68f097cbb562f34be3f41c977353bb87d Mon Sep 17 00:00:00 2001
From: "Yuanyuan Tian (from Dev Box)" <tianyuanyuan@microsoft.com>
Date: Fri, 20 Mar 2026 13:58:51 +0800
Subject: [PATCH 04/31] Fix baseline run: use input config from current branch
 checkout

---
 .github/workflows/benchmarks.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
index 337560116..a4df8213f 100644
--- a/.github/workflows/benchmarks.yml
+++ b/.github/workflows/benchmarks.yml
@@ -113,7 +113,7 @@ jobs:
           # Note: For accurate benchmarks, consider using CPU pinning on self-hosted runners:
           # sudo taskset -c 0,2,4,6 ionice -c 1 -n 0 cargo run ...
           cargo run -p diskann-benchmark --features disk-index --release -- \
-            run --input-file diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json \
+            run --input-file ../diskann_rust/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json \
             --output-file target/tmp/wikipedia-100K_benchmark_crate_baseline.json
 
       - name: Run current branch benchmark
@@ -226,7 +226,7 @@ jobs:
         working-directory: baseline
         run: |
           cargo run -p diskann-benchmark --features disk-index --release -- \
-            run --input-file diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json \
+            run --input-file ../diskann_rust/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json \
             --output-file target/tmp/openai-100K_benchmark_crate_baseline.json
 
       - name: Run current branch benchmark

From c6d54a9b0a8b61864b6c79db9e57dc9889b5561a Mon Sep 17 00:00:00 2001
From: "Yuanyuan Tian (from Dev Box)" <tianyuanyuan@microsoft.com>
Date: Fri, 20 Mar 2026 14:17:29 +0800
Subject: [PATCH 05/31] Fix markdown conversion: replace broken csvtomd with
 inline Python

---
 .github/workflows/benchmarks.yml | 34 ++++++++++++++++++++++++--------
 1 file changed, 26 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
index a4df8213f..0670af06e 100644
--- a/.github/workflows/benchmarks.yml
+++ b/.github/workflows/benchmarks.yml
@@ -89,7 +89,7 @@ jobs:
         run: |
           sudo apt-get update
           sudo apt-get install -y openssl libssl-dev pkg-config python3-pip
-          pip install csvtomd numpy scipy
+          pip install numpy scipy
 
       # Download the public Wikipedia-100K dataset via big-ann-benchmarks
       # Dataset: 100K Cohere Wikipedia embeddings (768-dim, float32, cosine distance)
@@ -134,9 +134,18 @@ jobs:
       - name: Convert results to Markdown
         working-directory: diskann_rust
         run: |
-          csvtomd target/tmp/wikipedia-100K_change.csv > target/tmp/wikipedia-100K_change.md
-          echo "### Benchmark Results: Wikipedia-100K Dataset" >> $GITHUB_STEP_SUMMARY
-          cat target/tmp/wikipedia-100K_change.md >> $GITHUB_STEP_SUMMARY
+          python3 -c "
+          import csv, os
+          rows = list(csv.reader(open('target/tmp/wikipedia-100K_change.csv')))
+          if len(rows) < 2:
+              print('No data'); exit(0)
+          header = rows[0]
+          sep = ['---'] * len(header)
+          md = '\n'.join(' | '.join(r) for r in [header, sep] + rows[1:])
+          open('target/tmp/wikipedia-100K_change.md', 'w').write(md + '\n')
+          "
+          echo '### Benchmark Results: Wikipedia-100K Dataset' >> "$GITHUB_STEP_SUMMARY"
+          cat target/tmp/wikipedia-100K_change.md >> "$GITHUB_STEP_SUMMARY"
 
       - name: Validate benchmark results
         working-directory: diskann_rust
@@ -204,7 +213,7 @@ jobs:
         run: |
           sudo apt-get update
           sudo apt-get install -y openssl libssl-dev pkg-config python3-pip
-          pip install csvtomd numpy scipy
+          pip install numpy scipy
 
       # Download the public OpenAI ArXiv 100K dataset via big-ann-benchmarks
       # Dataset: 100K OpenAI embeddings of ArXiv papers (1536-dim, float32, euclidean distance)
@@ -247,9 +256,18 @@ jobs:
       - name: Convert results to Markdown
         working-directory: diskann_rust
         run: |
-          csvtomd target/tmp/openai-100K_change.csv > target/tmp/openai-100K_change.md
-          echo "### Benchmark Results: OpenAI ArXiv 100K Dataset" >> $GITHUB_STEP_SUMMARY
-          cat target/tmp/openai-100K_change.md >> $GITHUB_STEP_SUMMARY
+          python3 -c "
+          import csv, os
+          rows = list(csv.reader(open('target/tmp/openai-100K_change.csv')))
+          if len(rows) < 2:
+              print('No data'); exit(0)
+          header = rows[0]
+          sep = ['---'] * len(header)
+          md = '\n'.join(' | '.join(r) for r in [header, sep] + rows[1:])
+          open('target/tmp/openai-100K_change.md', 'w').write(md + '\n')
+          "
+          echo '### Benchmark Results: OpenAI ArXiv 100K Dataset' >> "$GITHUB_STEP_SUMMARY"
+          cat target/tmp/openai-100K_change.md >> "$GITHUB_STEP_SUMMARY"
 
       - name: Validate benchmark results
         working-directory: diskann_rust

From 5775198a10473ae82b0864a7a168e3357f4d422d Mon Sep 17 00:00:00 2001
From: "Yuanyuan Tian (from Dev Box)" <tianyuanyuan@microsoft.com>
Date: Fri, 20 Mar 2026 16:51:59 +0800
Subject: [PATCH 06/31] Update benchmark configs: align build/search params to
 fix low recall

---
 .../perf_test_inputs/openai-100K-disk-index.json    | 13 ++++++-------
 .../perf_test_inputs/wikipedia-100K-disk-index.json | 13 ++++++-------
 2 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json b/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json
index 969724cae..9ae7e148b 100644
--- a/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json
+++ b/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json
@@ -12,11 +12,11 @@
                     "data": "OpenAIArXiv/openai_base.bin.crop_nb_100000",
                     "distance": "squared_l2",
                     "dim": 1536,
-                    "max_degree": 32,
-                    "l_build": 50,
+                    "max_degree": 59,
+                    "l_build": 64,
                     "num_threads": 8,
-                    "build_ram_limit_gb": 4.0,
-                    "num_pq_chunks": 192,
+                    "build_ram_limit_gb": 10.0,
+                    "num_pq_chunks": 384,
                     "quantization_type": "FP",
                     "save_path": "openai_100k_benchmark_index"
                 },
@@ -24,11 +24,10 @@
                     "queries": "OpenAIArXiv/openai_query.bin",
                     "groundtruth": "OpenAIArXiv/openai-100K",
                     "search_list": [
-                        100,
-                        200
+                        2000
                     ],
                     "beam_width": 4,
-                    "recall_at": 100,
+                    "recall_at": 1000,
                     "num_threads": 4,
                     "is_flat_search": false,
                     "distance": "squared_l2",
diff --git a/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json b/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json
index 1557a594a..7deaf788e 100644
--- a/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json
+++ b/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json
@@ -12,11 +12,11 @@
                     "data": "wikipedia_cohere/wikipedia_base.bin.crop_nb_100000",
                     "distance": "cosine_normalized",
                     "dim": 768,
-                    "max_degree": 32,
-                    "l_build": 50,
+                    "max_degree": 59,
+                    "l_build": 72,
                     "num_threads": 4,
-                    "build_ram_limit_gb": 4.0,
-                    "num_pq_chunks": 96,
+                    "build_ram_limit_gb": 10.0,
+                    "num_pq_chunks": 192,
                     "quantization_type": "FP",
                     "save_path": "wikipedia_100k_benchmark_index"
                 },
@@ -24,11 +24,10 @@
                     "queries": "wikipedia_cohere/wikipedia_query.bin",
                     "groundtruth": "wikipedia_cohere/wikipedia-100K",
                     "search_list": [
-                        100,
-                        200
+                        2000
                     ],
                     "beam_width": 4,
-                    "recall_at": 100,
+                    "recall_at": 1000,
                     "num_threads": 4,
                     "is_flat_search": false,
                     "distance": "cosine_normalized",

From 7ac841e952a3355f3f40d738af5b46b23372162a Mon Sep 17 00:00:00 2001
From: "Yuanyuan Tian (from Dev Box)" <tianyuanyuan@microsoft.com>
Date: Mon, 23 Mar 2026 13:13:34 +0800
Subject: [PATCH 07/31] Fix recall_at: set to 100 to match groundtruth file
 K=100

---
 diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json  | 2 +-
 .../perf_test_inputs/wikipedia-100K-disk-index.json             | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json b/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json
index 9ae7e148b..93c1358ba 100644
--- a/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json
+++ b/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json
@@ -27,7 +27,7 @@
                         2000
                     ],
                     "beam_width": 4,
-                    "recall_at": 1000,
+                    "recall_at": 100,
                     "num_threads": 4,
                     "is_flat_search": false,
                     "distance": "squared_l2",
diff --git a/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json b/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json
index 7deaf788e..1c0af41b7 100644
--- a/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json
+++ b/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json
@@ -27,7 +27,7 @@
                         2000
                     ],
                     "beam_width": 4,
-                    "recall_at": 1000,
+                    "recall_at": 100,
                     "num_threads": 4,
                     "is_flat_search": false,
                     "distance": "cosine_normalized",

From 7d5a72eddfd94965a082b50672f285971025f908 Mon Sep 17 00:00:00 2001
From: "Yuanyuan Tian (from Dev Box)" <tianyuanyuan@microsoft.com>
Date: Mon, 23 Mar 2026 13:54:31 +0800
Subject: [PATCH 08/31] Remove stale absolute contracts
 (total_time/qps/mean_ios/mean_comps/recall): calibrated for ADO mimir-enron,
 not applicable to public datasets on GitHub runners. Threshold calibration
 tracked in PBI.

---
 .github/scripts/benchmark_result_parse.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/.github/scripts/benchmark_result_parse.py b/.github/scripts/benchmark_result_parse.py
index 0bbede446..ab3549dc1 100644
--- a/.github/scripts/benchmark_result_parse.py
+++ b/.github/scripts/benchmark_result_parse.py
@@ -162,7 +162,8 @@
         "total_duration_seconds": [10, 'LT', ""],
     },
     "index-build statistics": {
-        "total_time": [10, 'LT', 1206],
+        # total_time contract TBD: requires baseline run on target hardware (see PBI: threshold calibration)
+        "total_time": [10, 'LT', ""],
         "total_comparisons": [1, 'LT', ""],
         "search_hops": [1, 'LT', ""]
     },
@@ -171,11 +172,14 @@
         "mean_latency": [10, 'LT', ""],
         "mean_io_time": [10, 'LT', ""],
         "mean_cpus": [10, 'LT', ""],
-        "qps": [10, 'GT', 29],
-        "mean_ios": [1, 'LT', 2026],
-        "mean_comps": [1, 'LT', 50000],
+        # qps/recall/mean_ios/mean_comps contracts TBD: prior values were calibrated for
+        # internal mimir-enron 1M-vector dataset on production hardware, not applicable here.
+        # See PBI: define alert thresholds for public dataset benchmarks.
+        "qps": [10, 'GT', ""],
+        "mean_ios": [1, 'LT', ""],
+        "mean_comps": [1, 'LT', ""],
         "mean_hops": [1, 'LT', ""],
-        "recall": [1, 'GT', 95.1]
+        "recall": [1, 'GT', ""]
     },
     "search-with-L=100-bw=4": {
         "latency_95": [10, 'LT', ""],

From c294c16c59a85c1ab6a234aae12f0c5837dccced Mon Sep 17 00:00:00 2001
From: "Yuanyuan Tian (from Dev Box)" <tianyuanyuan@microsoft.com>
Date: Mon, 23 Mar 2026 13:55:32 +0800
Subject: [PATCH 09/31] Remove TBD contract comments from
 benchmark_result_parse.py

---
 .github/scripts/benchmark_result_parse.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/.github/scripts/benchmark_result_parse.py b/.github/scripts/benchmark_result_parse.py
index ab3549dc1..600a66f35 100644
--- a/.github/scripts/benchmark_result_parse.py
+++ b/.github/scripts/benchmark_result_parse.py
@@ -162,7 +162,6 @@
         "total_duration_seconds": [10, 'LT', ""],
     },
     "index-build statistics": {
-        # total_time contract TBD: requires baseline run on target hardware (see PBI: threshold calibration)
         "total_time": [10, 'LT', ""],
         "total_comparisons": [1, 'LT', ""],
         "search_hops": [1, 'LT', ""]
@@ -172,9 +171,6 @@
         "mean_latency": [10, 'LT', ""],
         "mean_io_time": [10, 'LT', ""],
         "mean_cpus": [10, 'LT', ""],
-        # qps/recall/mean_ios/mean_comps contracts TBD: prior values were calibrated for
-        # internal mimir-enron 1M-vector dataset on production hardware, not applicable here.
-        # See PBI: define alert thresholds for public dataset benchmarks.
         "qps": [10, 'GT', ""],
         "mean_ios": [1, 'LT', ""],
         "mean_comps": [1, 'LT', ""],

From b8e1d6deb484799a4bc14daa4ddf49cde96f2fe8 Mon Sep 17 00:00:00 2001
From: "Yuanyuan Tian (from Dev Box)" <tianyuanyuan@microsoft.com>
Date: Mon, 23 Mar 2026 14:47:43 +0800
Subject: [PATCH 10/31] Fix build_ram_limit_gb: reduce 10->4 to fit GitHub
 runner RAM (7GB limit)

---
 diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json  | 2 +-
 .../perf_test_inputs/wikipedia-100K-disk-index.json             | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json b/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json
index 93c1358ba..940269195 100644
--- a/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json
+++ b/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json
@@ -15,7 +15,7 @@
                     "max_degree": 59,
                     "l_build": 64,
                     "num_threads": 8,
-                    "build_ram_limit_gb": 10.0,
+                    "build_ram_limit_gb": 4.0,
                     "num_pq_chunks": 384,
                     "quantization_type": "FP",
                     "save_path": "openai_100k_benchmark_index"
diff --git a/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json b/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json
index 1c0af41b7..d15026e63 100644
--- a/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json
+++ b/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json
@@ -15,7 +15,7 @@
                     "max_degree": 59,
                     "l_build": 72,
                     "num_threads": 4,
-                    "build_ram_limit_gb": 10.0,
+                    "build_ram_limit_gb": 4.0,
                     "num_pq_chunks": 192,
                     "quantization_type": "FP",
                     "save_path": "wikipedia_100k_benchmark_index"

From 137cae094ef7c3f039de29d7f60bd62f57ec1ae3 Mon Sep 17 00:00:00 2001
From: "Yuanyuan Tian (from Dev Box)" <tianyuanyuan@microsoft.com>
Date: Mon, 23 Mar 2026 15:21:22 +0800
Subject: [PATCH 11/31] Fix wikipedia distance: cosine_normalized->cosine
 (vectors are not L2-normalized, metric is inner product)

---
 .../perf_test_inputs/wikipedia-100K-disk-index.json           | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json b/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json
index d15026e63..c4131e720 100644
--- a/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json
+++ b/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json
@@ -10,7 +10,7 @@
                     "disk-index-source": "Build",
                     "data_type": "float32",
                     "data": "wikipedia_cohere/wikipedia_base.bin.crop_nb_100000",
-                    "distance": "cosine_normalized",
+                    "distance": "cosine",
                     "dim": 768,
                     "max_degree": 59,
                     "l_build": 72,
@@ -30,7 +30,7 @@
                     "recall_at": 100,
                     "num_threads": 4,
                     "is_flat_search": false,
-                    "distance": "cosine_normalized",
+                    "distance": "cosine",
                     "vector_filters_file": null
                 }
             }

From 96b63a8779a3d296af080ce16f86151aff5dae15 Mon Sep 17 00:00:00 2001
From: "Yuanyuan Tian (from Dev Box)" <tianyuanyuan@microsoft.com>
Date: Mon, 23 Mar 2026 15:51:15 +0800
Subject: [PATCH 12/31] Fix wikipedia distance: cosine->inner_product
 (groundtruth uses raw ip, not cosine similarity)

---
 .../perf_test_inputs/wikipedia-100K-disk-index.json           | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json b/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json
index c4131e720..6a52b1e32 100644
--- a/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json
+++ b/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json
@@ -10,7 +10,7 @@
                     "disk-index-source": "Build",
                     "data_type": "float32",
                     "data": "wikipedia_cohere/wikipedia_base.bin.crop_nb_100000",
-                    "distance": "cosine",
+                    "distance": "inner_product",
                     "dim": 768,
                     "max_degree": 59,
                     "l_build": 72,
@@ -30,7 +30,7 @@
                     "recall_at": 100,
                     "num_threads": 4,
                     "is_flat_search": false,
-                    "distance": "cosine",
+                    "distance": "inner_product",
                     "vector_filters_file": null
                 }
             }

From 1b4cfc87dfe19c693a5155b20bee92c351fdd016 Mon Sep 17 00:00:00 2001
From: "Yuanyuan Tian (from Dev Box)" <tianyuanyuan@microsoft.com>
Date: Tue, 24 Mar 2026 15:09:03 +0800
Subject: [PATCH 13/31] Align build threads: set build num_threads to 1 for
 both datasets

---
 diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json  | 2 +-
 .../perf_test_inputs/wikipedia-100K-disk-index.json             | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json b/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json
index 940269195..b9f3e195d 100644
--- a/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json
+++ b/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json
@@ -14,7 +14,7 @@
                     "dim": 1536,
                     "max_degree": 59,
                     "l_build": 64,
-                    "num_threads": 8,
+                    "num_threads": 1,
                     "build_ram_limit_gb": 4.0,
                     "num_pq_chunks": 384,
                     "quantization_type": "FP",
diff --git a/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json b/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json
index 6a52b1e32..5093eaf4d 100644
--- a/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json
+++ b/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json
@@ -14,7 +14,7 @@
                     "dim": 768,
                     "max_degree": 59,
                     "l_build": 72,
-                    "num_threads": 4,
+                    "num_threads": 1,
                     "build_ram_limit_gb": 4.0,
                     "num_pq_chunks": 192,
                     "quantization_type": "FP",

From 5e6a6e0fb5e5897621b0ae3566d09b7569a25381 Mon Sep 17 00:00:00 2001
From: "Yuanyuan Tian (from Dev Box)" <tianyuanyuan@microsoft.com>
Date: Tue, 24 Mar 2026 15:51:14 +0800
Subject: [PATCH 14/31] Speed up benchmarks: build threads 1->4, openai
 pq_chunks 384->192

---
 .../perf_test_inputs/openai-100K-disk-index.json              | 4 ++--
 .../perf_test_inputs/wikipedia-100K-disk-index.json           | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json b/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json
index b9f3e195d..3723d66b6 100644
--- a/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json
+++ b/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json
@@ -14,9 +14,9 @@
                     "dim": 1536,
                     "max_degree": 59,
                     "l_build": 64,
-                    "num_threads": 1,
+                    "num_threads": 4,
                     "build_ram_limit_gb": 4.0,
-                    "num_pq_chunks": 384,
+                    "num_pq_chunks": 192,
                     "quantization_type": "FP",
                     "save_path": "openai_100k_benchmark_index"
                 },
diff --git a/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json b/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json
index 5093eaf4d..6a52b1e32 100644
--- a/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json
+++ b/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json
@@ -14,7 +14,7 @@
                     "dim": 768,
                     "max_degree": 59,
                     "l_build": 72,
-                    "num_threads": 1,
+                    "num_threads": 4,
                     "build_ram_limit_gb": 4.0,
                     "num_pq_chunks": 192,
                     "quantization_type": "FP",

From 59d25b966e465a3fda520ca1d2792fdbf8d323b8 Mon Sep 17 00:00:00 2001
From: "Yuanyuan Tian (from Dev Box)" <tianyuanyuan@microsoft.com>
Date: Tue, 24 Mar 2026 16:24:44 +0800
Subject: [PATCH 15/31] Temp: disable concurrency cancellation for A/A batch
 testing

---
 .github/workflows/benchmarks.yml | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
index 0670af06e..fc0de7c07 100644
--- a/.github/workflows/benchmarks.yml
+++ b/.github/workflows/benchmarks.yml
@@ -27,8 +27,10 @@ on:
 
 # Cancel in-progress runs when a new run is triggered
 concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}
-  cancel-in-progress: true
+  # Use unique group per run for A/A testing (allows parallel runs).
+  # For production, change back to: github.event.pull_request.number || github.sha
+  group: ${{ github.workflow }}-${{ github.run_id }}
+  cancel-in-progress: false
 
 env:
   RUST_BACKTRACE: 1

From 751e7756ec368963dbb3f7edec996e05c39e0f90 Mon Sep 17 00:00:00 2001
From: "Yuanyuan Tian (from Dev Box)" <tianyuanyuan@microsoft.com>
Date: Thu, 26 Mar 2026 13:56:35 +0800
Subject: [PATCH 16/31] Revert A/A test settings (also restores csvtomd
 install), update OpenAI config to SQ_1_2.0

---
 .github/workflows/benchmarks.yml                       | 10 ++++------
 .../perf_test_inputs/openai-100K-disk-index.json       |  6 +++---
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
index fc0de7c07..aa659dd09 100644
--- a/.github/workflows/benchmarks.yml
+++ b/.github/workflows/benchmarks.yml
@@ -27,10 +27,8 @@ on:
 
 # Cancel in-progress runs when a new run is triggered
 concurrency:
-  # Use unique group per run for A/A testing (allows parallel runs).
-  # For production, change back to: github.event.pull_request.number || github.sha
-  group: ${{ github.workflow }}-${{ github.run_id }}
-  cancel-in-progress: false
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}
+  cancel-in-progress: true
 
 env:
   RUST_BACKTRACE: 1
@@ -91,7 +89,7 @@ jobs:
         run: |
           sudo apt-get update
           sudo apt-get install -y openssl libssl-dev pkg-config python3-pip
-          pip install numpy scipy
+          pip install csvtomd numpy scipy
 
       # Download the public Wikipedia-100K dataset via big-ann-benchmarks
       # Dataset: 100K Cohere Wikipedia embeddings (768-dim, float32, cosine distance)
@@ -215,7 +213,7 @@ jobs:
         run: |
           sudo apt-get update
           sudo apt-get install -y openssl libssl-dev pkg-config python3-pip
-          pip install numpy scipy
+          pip install csvtomd numpy scipy
 
       # Download the public OpenAI ArXiv 100K dataset via big-ann-benchmarks
       # Dataset: 100K OpenAI embeddings of ArXiv papers (1536-dim, float32, euclidean distance)
diff --git a/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json b/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json
index 3723d66b6..3a2a1d9e2 100644
--- a/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json
+++ b/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json
@@ -13,11 +13,11 @@
                     "distance": "squared_l2",
                     "dim": 1536,
                     "max_degree": 59,
-                    "l_build": 64,
+                    "l_build": 80,
                     "num_threads": 4,
                     "build_ram_limit_gb": 4.0,
-                    "num_pq_chunks": 192,
-                    "quantization_type": "FP",
+                    "num_pq_chunks": 384,
+                    "quantization_type": "SQ_1_2.0",
                     "save_path": "openai_100k_benchmark_index"
                 },
                 "search_phase": {

From 2c4d2353096b996aef9b92c3abff0dfc845532bb Mon Sep 17 00:00:00 2001
From: "Yuanyuan Tian (from Dev Box)" <tianyuanyuan@microsoft.com>
Date: Fri, 27 Mar 2026 11:09:16 +0800
Subject: [PATCH 17/31] Remove micro-benchmark-iai comments and
 continue-on-error from diff-stats steps

---
 .github/workflows/benchmarks.yml | 53 +-------------------------------
 1 file changed, 1 insertion(+), 52 deletions(-)

diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
index aa659dd09..324bb1cda 100644
--- a/.github/workflows/benchmarks.yml
+++ b/.github/workflows/benchmarks.yml
@@ -124,7 +124,6 @@ jobs:
             --output-file target/tmp/wikipedia-100K_benchmark_crate_target.json
 
       - name: Generate diff stats (baseline vs target)
-        continue-on-error: true
         run: |
           python diskann_rust/.github/scripts/compare_disk_index_json_output.py \
             --baseline baseline/target/tmp/wikipedia-100K_benchmark_crate_baseline.json \
@@ -246,7 +245,6 @@ jobs:
             --output-file target/tmp/openai-100K_benchmark_crate_target.json
 
       - name: Generate diff stats (baseline vs target)
-        continue-on-error: true
         run: |
           python diskann_rust/.github/scripts/compare_disk_index_json_output.py \
             --baseline baseline/target/tmp/openai-100K_benchmark_crate_baseline.json \
@@ -291,53 +289,4 @@ jobs:
             diskann_rust/target/tmp/openai-100K_change.md
             diskann_rust/target/tmp/openai-100K_benchmark_crate_target.json
             baseline/target/tmp/openai-100K_benchmark_crate_baseline.json
-          retention-days: 30
-
-  # NOTE: IAI micro-benchmarks are temporarily disabled
-  # due to callgrind not running with Rust version `ms-1.86.0`.
-  # Uncomment when ready to enable:
-  #
-  # micro-benchmark-iai:
-  #   name: Micro Benchmark - IAI
-  #   runs-on: ubuntu-latest
-  #   timeout-minutes: 120
-  #
-  #   steps:
-  #     - name: Checkout current branch
-  #       uses: actions/checkout@v4
-  #       with:
-  #         path: diskann_rust
-  #
-  #     - name: Checkout baseline (${{ inputs.baseline_ref }})
-  #       uses: actions/checkout@v4
-  #       with:
-  #         ref: ${{ inputs.baseline_ref }}
-  #         path: baseline
-  #
-  #     - name: Install Rust ${{ env.rust_stable }}
-  #       uses: dtolnay/rust-toolchain@master
-  #       with:
-  #         toolchain: ${{ env.rust_stable }}
-  #
-  #     - name: Install valgrind and iai-callgrind-runner
-  #       run: |
-  #         sudo apt-get update
-  #         sudo apt-get install -y valgrind
-  #         cargo install --version 0.14.0 iai-callgrind-runner
-  #
-  #     - name: Run baseline IAI benchmarks
-  #       working-directory: baseline
-  #       run: |
-  #         cargo bench --bench bench_main_iai
-  #         cargo bench --bench bench_main_vector_iai
-  #
-  #     - name: Copy IAI baseline files
-  #       run: |
-  #         mkdir -p diskann_rust/target
-  #         cp -R baseline/target/iai diskann_rust/target/
-  #
-  #     - name: Run current branch IAI benchmarks
-  #       working-directory: diskann_rust
-  #       run: |
-  #         cargo bench --bench bench_main_iai
-  #         cargo bench --bench bench_main_vector_iai
+          retention-days: 30
\ No newline at end of file

From f58e0846b4e9eda6126f6f12afccff0b483102aa Mon Sep 17 00:00:00 2001
From: "Yuanyuan Tian (from Dev Box)" <tianyuanyuan@microsoft.com>
Date: Fri, 27 Mar 2026 11:47:59 +0800
Subject: [PATCH 18/31] use GitHub Release assets for benchmark datasets

---
 .github/workflows/benchmarks.yml | 34 +++++++++++---------------------
 1 file changed, 12 insertions(+), 22 deletions(-)

diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
index 324bb1cda..a2fd6ad34 100644
--- a/.github/workflows/benchmarks.yml
+++ b/.github/workflows/benchmarks.yml
@@ -91,21 +91,16 @@ jobs:
           sudo apt-get install -y openssl libssl-dev pkg-config python3-pip
           pip install csvtomd numpy scipy
 
-      # Download the public Wikipedia-100K dataset via big-ann-benchmarks
+      # Download pre-packaged Wikipedia-100K dataset from GitHub Release
       # Dataset: 100K Cohere Wikipedia embeddings (768-dim, float32, cosine distance)
-      # Source: https://github.com/harsha-simhadri/big-ann-benchmarks
-      - name: Clone big-ann-benchmarks
-        run: git clone --depth 1 https://github.com/harsha-simhadri/big-ann-benchmarks.git
-
       - name: Download wikipedia-100K dataset
-        working-directory: big-ann-benchmarks
-        run: python create_dataset.py --dataset wikipedia-100K
-
-      - name: Copy dataset to benchmark directories
+        env:
+          GH_TOKEN: ${{ github.token }}
         run: |
           mkdir -p diskann_rust/target/tmp baseline/target/tmp
-          cp -r big-ann-benchmarks/data/wikipedia_cohere diskann_rust/target/tmp/
-          cp -r big-ann-benchmarks/data/wikipedia_cohere baseline/target/tmp/
+          gh release download benchmark-data-v1 --repo ${{ github.repository }} --pattern 'wikipedia-100K.tar.gz' --dir .
+          tar xzf wikipedia-100K.tar.gz -C diskann_rust/target/tmp/
+          cp -r diskann_rust/target/tmp/wikipedia_cohere baseline/target/tmp/
 
       - name: Run baseline benchmark
         working-directory: baseline
@@ -214,21 +209,16 @@ jobs:
           sudo apt-get install -y openssl libssl-dev pkg-config python3-pip
           pip install csvtomd numpy scipy
 
-      # Download the public OpenAI ArXiv 100K dataset via big-ann-benchmarks
+      # Download pre-packaged OpenAI ArXiv 100K dataset from GitHub Release
       # Dataset: 100K OpenAI embeddings of ArXiv papers (1536-dim, float32, euclidean distance)
-      # Source: https://github.com/harsha-simhadri/big-ann-benchmarks
-      - name: Clone big-ann-benchmarks
-        run: git clone --depth 1 https://github.com/harsha-simhadri/big-ann-benchmarks.git
-
       - name: Download openai-100K dataset
-        working-directory: big-ann-benchmarks
-        run: python create_dataset.py --dataset openai-100K
-
-      - name: Copy dataset to benchmark directories
+        env:
+          GH_TOKEN: ${{ github.token }}
         run: |
           mkdir -p diskann_rust/target/tmp baseline/target/tmp
-          cp -r big-ann-benchmarks/data/OpenAIArXiv diskann_rust/target/tmp/
-          cp -r big-ann-benchmarks/data/OpenAIArXiv baseline/target/tmp/
+          gh release download benchmark-data-v1 --repo ${{ github.repository }} --pattern 'openai-100K.tar.gz' --dir .
+          tar xzf openai-100K.tar.gz -C diskann_rust/target/tmp/
+          cp -r diskann_rust/target/tmp/OpenAIArXiv baseline/target/tmp/
 
       - name: Run baseline benchmark
         working-directory: baseline

From 0f5c277396413a3c79cd4e1e622d9d9134d3edff Mon Sep 17 00:00:00 2001
From: "Yuanyuan Tian (from Dev Box)" <tianyuanyuan@microsoft.com>
Date: Fri, 27 Mar 2026 14:09:01 +0800
Subject: [PATCH 19/31] extract csv-to-markdown into reusable script

---
 .github/scripts/csv_to_markdown.py | 50 ++++++++++++++++++++++++++++++
 .github/workflows/benchmarks.yml   | 32 +++++--------------
 2 files changed, 58 insertions(+), 24 deletions(-)
 create mode 100644 .github/scripts/csv_to_markdown.py

diff --git a/.github/scripts/csv_to_markdown.py b/.github/scripts/csv_to_markdown.py
new file mode 100644
index 000000000..885a20208
--- /dev/null
+++ b/.github/scripts/csv_to_markdown.py
@@ -0,0 +1,50 @@
+#!/usr/bin/env python3
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT license.
+
+"""Convert a CSV file to a Markdown table and optionally append to GitHub Step Summary."""
+
+import argparse
+import csv
+import os
+import sys
+
+
+def csv_to_markdown(csv_path: str) -> str:
+    """Convert a CSV file to a Markdown table string."""
+    with open(csv_path) as f:
+        rows = list(csv.reader(f))
+    if len(rows) < 2:
+        return ""
+    header = rows[0]
+    sep = ["---"] * len(header)
+    return "\n".join(" | ".join(r) for r in [header, sep] + rows[1:])
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--csv", required=True, help="Input CSV file path")
+    parser.add_argument("--md", required=True, help="Output Markdown file path")
+    parser.add_argument("--title", default="", help="Section title for GitHub Step Summary")
+    args = parser.parse_args()
+
+    md = csv_to_markdown(args.csv)
+    if not md:
+        print("No data")
+        return 0
+
+    with open(args.md, "w") as f:
+        f.write(md + "\n")
+
+    # Append to GitHub Step Summary if available
+    summary_path = os.environ.get("GITHUB_STEP_SUMMARY")
+    if summary_path and args.title:
+        with open(summary_path, "a") as f:
+            f.write(f"### {args.title}\n")
+            f.write(md + "\n")
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
index a2fd6ad34..a5c5c3a03 100644
--- a/.github/workflows/benchmarks.yml
+++ b/.github/workflows/benchmarks.yml
@@ -128,18 +128,10 @@ jobs:
       - name: Convert results to Markdown
         working-directory: diskann_rust
         run: |
-          python3 -c "
-          import csv, os
-          rows = list(csv.reader(open('target/tmp/wikipedia-100K_change.csv')))
-          if len(rows) < 2:
-              print('No data'); exit(0)
-          header = rows[0]
-          sep = ['---'] * len(header)
-          md = '\n'.join(' | '.join(r) for r in [header, sep] + rows[1:])
-          open('target/tmp/wikipedia-100K_change.md', 'w').write(md + '\n')
-          "
-          echo '### Benchmark Results: Wikipedia-100K Dataset' >> "$GITHUB_STEP_SUMMARY"
-          cat target/tmp/wikipedia-100K_change.md >> "$GITHUB_STEP_SUMMARY"
+          python .github/scripts/csv_to_markdown.py \
+            --csv target/tmp/wikipedia-100K_change.csv \
+            --md target/tmp/wikipedia-100K_change.md \
+            --title 'Benchmark Results: Wikipedia-100K Dataset'
 
       - name: Validate benchmark results
         working-directory: diskann_rust
@@ -244,18 +236,10 @@ jobs:
       - name: Convert results to Markdown
         working-directory: diskann_rust
         run: |
-          python3 -c "
-          import csv, os
-          rows = list(csv.reader(open('target/tmp/openai-100K_change.csv')))
-          if len(rows) < 2:
-              print('No data'); exit(0)
-          header = rows[0]
-          sep = ['---'] * len(header)
-          md = '\n'.join(' | '.join(r) for r in [header, sep] + rows[1:])
-          open('target/tmp/openai-100K_change.md', 'w').write(md + '\n')
-          "
-          echo '### Benchmark Results: OpenAI ArXiv 100K Dataset' >> "$GITHUB_STEP_SUMMARY"
-          cat target/tmp/openai-100K_change.md >> "$GITHUB_STEP_SUMMARY"
+          python .github/scripts/csv_to_markdown.py \
+            --csv target/tmp/openai-100K_change.csv \
+            --md target/tmp/openai-100K_change.md \
+            --title 'Benchmark Results: OpenAI ArXiv 100K Dataset'
 
       - name: Validate benchmark results
         working-directory: diskann_rust

From b8cddb5a28e4593180c3cb53ea3d9ab9ca7887c9 Mon Sep 17 00:00:00 2001
From: "Yuanyuan Tian (from Dev Box)" <tianyuanyuan@microsoft.com>
Date: Fri, 27 Mar 2026 15:03:00 +0800
Subject: [PATCH 20/31] calibrate contract thresholds from GitHub runner data

---
 .github/scripts/benchmark_result_parse.py | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/.github/scripts/benchmark_result_parse.py b/.github/scripts/benchmark_result_parse.py
index 600a66f35..c51036056 100644
--- a/.github/scripts/benchmark_result_parse.py
+++ b/.github/scripts/benchmark_result_parse.py
@@ -162,20 +162,28 @@
         "total_duration_seconds": [10, 'LT', ""],
     },
     "index-build statistics": {
-        "total_time": [10, 'LT', ""],
+        # Calibrated from 5 GitHub runner runs (10 observations):
+        #   Wikipedia: 35.9–37.2s, OpenAI: 23.0–76.4s (SQ_1_2.0 variance)
+        # Contract: worst × 1.5 to absorb shared-runner variance
+        "total_time": [10, 'LT', 115],
         "total_comparisons": [1, 'LT', ""],
         "search_hops": [1, 'LT', ""]
     },
     "search-with-L=2000-bw=4": {
+        # Calibrated from 5 GitHub runner runs (10 observations):
+        #   QPS: 9.56–9.75 (both datasets)
+        #   Recall: wiki 99.87%, oai 99.67–99.91%
+        #   mean_ios: ~2007 (deterministic)
+        #   mean_comps: wiki ~27609, oai 21618–24733
         "latency_95": [10, 'LT', ""],
         "mean_latency": [10, 'LT', ""],
         "mean_io_time": [10, 'LT', ""],
         "mean_cpus": [10, 'LT', ""],
-        "qps": [10, 'GT', ""],
-        "mean_ios": [1, 'LT', ""],
-        "mean_comps": [1, 'LT', ""],
+        "qps": [10, 'GT', 6.5],
+        "mean_ios": [1, 'LT', 2410],
+        "mean_comps": [1, 'LT', 33200],
         "mean_hops": [1, 'LT', ""],
-        "recall": [1, 'GT', ""]
+        "recall": [1, 'GT', 98.0]
     },
     "search-with-L=100-bw=4": {
         "latency_95": [10, 'LT', ""],

From accdf2cca9bfe511c0dcef96babfd879aa6d3fba Mon Sep 17 00:00:00 2001
From: "Yuanyuan Tian (from Dev Box)" <tianyuanyuan@microsoft.com>
Date: Fri, 27 Mar 2026 15:10:12 +0800
Subject: [PATCH 21/31] add daily A/A benchmark stability test with failure
 notification

---
 .github/workflows/benchmarks-aa.yml | 276 ++++++++++++++++++++++++++++
 1 file changed, 276 insertions(+)
 create mode 100644 .github/workflows/benchmarks-aa.yml

diff --git a/.github/workflows/benchmarks-aa.yml b/.github/workflows/benchmarks-aa.yml
new file mode 100644
index 000000000..b272f7c69
--- /dev/null
+++ b/.github/workflows/benchmarks-aa.yml
@@ -0,0 +1,276 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT license.
+
+# DiskANN Daily A/A Benchmark Stability Test
+#
+# Runs main vs main at 9 AM UTC every day to detect environment noise.
+# If any threshold is breached, a GitHub issue is created to notify @microsoft/diskann-admin.
+# Can also be triggered manually for debugging.
+
+name: Benchmarks (A/A)
+
+on:
+  schedule:
+    # Daily at 9 AM UTC
+    - cron: '0 9 * * *'
+  workflow_dispatch:  # Allow manual trigger for debugging
+
+concurrency:
+  group: ${{ github.workflow }}
+  cancel-in-progress: true
+
+env:
+  RUST_BACKTRACE: 1
+  rust_stable: "1.92"
+
+defaults:
+  run:
+    shell: bash
+
+permissions:
+  contents: read
+  issues: write  # Required for creating failure notification issues
+
+jobs:
+  # A/A benchmark: Wikipedia-100K dataset (main vs main)
+  aa-wikipedia-100K:
+    name: A/A - Wikipedia 100K
+    runs-on: ubuntu-latest
+    timeout-minutes: 120
+
+    steps:
+      - name: Checkout main (target)
+        uses: actions/checkout@v4
+        with:
+          ref: main
+          path: diskann_rust
+          lfs: true
+
+      - name: Checkout main (baseline)
+        uses: actions/checkout@v4
+        with:
+          ref: main
+          path: baseline
+          lfs: true
+
+      - name: Install Rust ${{ env.rust_stable }}
+        uses: dtolnay/rust-toolchain@master
+        with:
+          toolchain: ${{ env.rust_stable }}
+
+      - name: Cache Rust dependencies (target)
+        uses: Swatinem/rust-cache@v2
+        with:
+          workspaces: diskann_rust -> target
+          key: aa-target
+
+      - name: Cache Rust dependencies (baseline)
+        uses: Swatinem/rust-cache@v2
+        with:
+          workspaces: baseline -> target
+          key: aa-baseline
+
+      - name: Install system dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y openssl libssl-dev pkg-config
+
+      # Download pre-packaged Wikipedia-100K dataset from GitHub Release
+      - name: Download wikipedia-100K dataset
+        env:
+          GH_TOKEN: ${{ github.token }}
+        run: |
+          mkdir -p diskann_rust/target/tmp baseline/target/tmp
+          gh release download benchmark-data-v1 --repo ${{ github.repository }} --pattern 'wikipedia-100K.tar.gz' --dir .
+          tar xzf wikipedia-100K.tar.gz -C diskann_rust/target/tmp/
+          cp -r diskann_rust/target/tmp/wikipedia_cohere baseline/target/tmp/
+
+      - name: Run baseline benchmark
+        working-directory: baseline
+        run: |
+          cargo run -p diskann-benchmark --features disk-index --release -- \
+            run --input-file ../diskann_rust/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json \
+            --output-file target/tmp/wikipedia-100K_benchmark_crate_baseline.json
+
+      - name: Run target benchmark
+        working-directory: diskann_rust
+        run: |
+          cargo run -p diskann-benchmark --features disk-index --release -- \
+            run --input-file diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json \
+            --output-file target/tmp/wikipedia-100K_benchmark_crate_target.json
+
+      - name: Generate diff stats (baseline vs target)
+        run: |
+          python diskann_rust/.github/scripts/compare_disk_index_json_output.py \
+            --baseline baseline/target/tmp/wikipedia-100K_benchmark_crate_baseline.json \
+            --branch diskann_rust/target/tmp/wikipedia-100K_benchmark_crate_target.json \
+            --out diskann_rust/target/tmp/wikipedia-100K_change.csv
+
+      - name: Convert results to Markdown
+        working-directory: diskann_rust
+        run: |
+          python .github/scripts/csv_to_markdown.py \
+            --csv target/tmp/wikipedia-100K_change.csv \
+            --md target/tmp/wikipedia-100K_change.md \
+            --title 'A/A Results: Wikipedia-100K Dataset'
+
+      - name: Validate benchmark results
+        working-directory: diskann_rust
+        run: |
+          python .github/scripts/benchmark_result_parse.py \
+            --mode aa \
+            --file target/tmp/wikipedia-100K_change.csv
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          GITHUB_REPOSITORY: ${{ github.repository }}
+          GITHUB_RUN_ID: ${{ github.run_id }}
+
+      - name: Upload benchmark results
+        uses: actions/upload-artifact@v4
+        if: always()
+        with:
+          name: aa-results-wikipedia-100K
+          path: |
+            diskann_rust/target/tmp/wikipedia-100K_change.csv
+            diskann_rust/target/tmp/wikipedia-100K_change.md
+            diskann_rust/target/tmp/wikipedia-100K_benchmark_crate_target.json
+            baseline/target/tmp/wikipedia-100K_benchmark_crate_baseline.json
+          retention-days: 30
+
+  # A/A benchmark: OpenAI ArXiv 100K dataset (main vs main)
+  aa-openai-100K:
+    name: A/A - OAI ArXiv 100K
+    runs-on: ubuntu-latest
+    timeout-minutes: 120
+
+    steps:
+      - name: Checkout main (target)
+        uses: actions/checkout@v4
+        with:
+          ref: main
+          path: diskann_rust
+          lfs: true
+
+      - name: Checkout main (baseline)
+        uses: actions/checkout@v4
+        with:
+          ref: main
+          path: baseline
+          lfs: true
+
+      - name: Install Rust ${{ env.rust_stable }}
+        uses: dtolnay/rust-toolchain@master
+        with:
+          toolchain: ${{ env.rust_stable }}
+
+      - name: Cache Rust dependencies (target)
+        uses: Swatinem/rust-cache@v2
+        with:
+          workspaces: diskann_rust -> target
+          key: aa-target
+
+      - name: Cache Rust dependencies (baseline)
+        uses: Swatinem/rust-cache@v2
+        with:
+          workspaces: baseline -> target
+          key: aa-baseline
+
+      - name: Install system dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y openssl libssl-dev pkg-config
+
+      # Download pre-packaged OpenAI ArXiv 100K dataset from GitHub Release
+      - name: Download openai-100K dataset
+        env:
+          GH_TOKEN: ${{ github.token }}
+        run: |
+          mkdir -p diskann_rust/target/tmp baseline/target/tmp
+          gh release download benchmark-data-v1 --repo ${{ github.repository }} --pattern 'openai-100K.tar.gz' --dir .
+          tar xzf openai-100K.tar.gz -C diskann_rust/target/tmp/
+          cp -r diskann_rust/target/tmp/OpenAIArXiv baseline/target/tmp/
+
+      - name: Run baseline benchmark
+        working-directory: baseline
+        run: |
+          cargo run -p diskann-benchmark --features disk-index --release -- \
+            run --input-file ../diskann_rust/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json \
+            --output-file target/tmp/openai-100K_benchmark_crate_baseline.json
+
+      - name: Run target benchmark
+        working-directory: diskann_rust
+        run: |
+          cargo run -p diskann-benchmark --features disk-index --release -- \
+            run --input-file diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json \
+            --output-file target/tmp/openai-100K_benchmark_crate_target.json
+
+      - name: Generate diff stats (baseline vs target)
+        run: |
+          python diskann_rust/.github/scripts/compare_disk_index_json_output.py \
+            --baseline baseline/target/tmp/openai-100K_benchmark_crate_baseline.json \
+            --branch diskann_rust/target/tmp/openai-100K_benchmark_crate_target.json \
+            --out diskann_rust/target/tmp/openai-100K_change.csv
+
+      - name: Convert results to Markdown
+        working-directory: diskann_rust
+        run: |
+          python .github/scripts/csv_to_markdown.py \
+            --csv target/tmp/openai-100K_change.csv \
+            --md target/tmp/openai-100K_change.md \
+            --title 'A/A Results: OpenAI ArXiv 100K Dataset'
+
+      - name: Validate benchmark results
+        working-directory: diskann_rust
+        run: |
+          python .github/scripts/benchmark_result_parse.py \
+            --mode aa \
+            --file target/tmp/openai-100K_change.csv
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          GITHUB_REPOSITORY: ${{ github.repository }}
+          GITHUB_RUN_ID: ${{ github.run_id }}
+
+      - name: Upload benchmark results
+        uses: actions/upload-artifact@v4
+        if: always()
+        with:
+          name: aa-results-openai-100K
+          path: |
+            diskann_rust/target/tmp/openai-100K_change.csv
+            diskann_rust/target/tmp/openai-100K_change.md
+            diskann_rust/target/tmp/openai-100K_benchmark_crate_target.json
+            baseline/target/tmp/openai-100K_benchmark_crate_baseline.json
+          retention-days: 30
+
+  # Notify diskann-admin on A/A failure
+  notify-on-failure:
+    name: Notify on A/A Failure
+    needs: [aa-wikipedia-100K, aa-openai-100K]
+    runs-on: ubuntu-latest
+    if: failure()
+    steps:
+      - name: Create GitHub issue for A/A failure
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const date = new Date().toISOString().split('T')[0];
+            const runUrl = `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`;
+            await github.rest.issues.create({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              title: `[Benchmark A/A] Daily stability test failed – ${date}`,
+              body: [
+                `## Daily A/A Benchmark Failure`,
+                ``,
+                `The scheduled A/A benchmark run (main vs main) **failed** on ${date}.`,
+                `This indicates environment noise exceeded the configured thresholds.`,
+                ``,
+                `**Run:** ${runUrl}`,
+                ``,
+                `Please review the benchmark artifacts and determine if thresholds need tuning`,
+                `or if there is a runner environment issue.`,
+                ``,
+                `/cc @microsoft/diskann-admin`,
+              ].join('\n'),
+              labels: ['benchmark', 'A/A-failure'],
+            });

From 5b0e7f72770cb283b32e3c231f2d53fbd679efe9 Mon Sep 17 00:00:00 2001
From: "Yuanyuan Tian (from Dev Box)" <tianyuanyuan@microsoft.com>
Date: Fri, 27 Mar 2026 15:30:03 +0800
Subject: [PATCH 22/31] widen mean_cpus threshold to 15% for shared-runner CPU
 noise

---
 .github/scripts/benchmark_result_parse.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/scripts/benchmark_result_parse.py b/.github/scripts/benchmark_result_parse.py
index c51036056..27385406d 100644
--- a/.github/scripts/benchmark_result_parse.py
+++ b/.github/scripts/benchmark_result_parse.py
@@ -178,7 +178,7 @@
         "latency_95": [10, 'LT', ""],
         "mean_latency": [10, 'LT', ""],
         "mean_io_time": [10, 'LT', ""],
-        "mean_cpus": [10, 'LT', ""],
+        "mean_cpus": [15, 'LT', ""],  # wider threshold — CPU time is noisy on shared runners
         "qps": [10, 'GT', 6.5],
         "mean_ios": [1, 'LT', 2410],
         "mean_comps": [1, 'LT', 33200],
@@ -189,7 +189,7 @@
         "latency_95": [10, 'LT', ""],
         "mean_latency": [10, 'LT', ""],
         "mean_io_time": [10, 'LT', ""],
-        "mean_cpus": [10, 'LT', ""],
+        "mean_cpus": [15, 'LT', ""],  # wider threshold — CPU time is noisy on shared runners
         "qps": [10, 'GT', ""],
         "mean_ios": [10, 'LT', ""],
         "mean_comps": [10, 'LT', ""],
@@ -200,7 +200,7 @@
         "latency_95": [10, 'LT', ""],
         "mean_latency": [10, 'LT', ""],
         "mean_io_time": [10, 'LT', ""],
-        "mean_cpus": [10, 'LT', ""],
+        "mean_cpus": [15, 'LT', ""],  # wider threshold — CPU time is noisy on shared runners
         "qps": [10, 'GT', ""],
         "mean_ios": [10, 'LT', ""],
         "mean_comps": [10, 'LT', ""],

From 639d4bb2c19cb34bc978834546ee0aee2c115fdf Mon Sep 17 00:00:00 2001
From: "Yuanyuan Tian (from Dev Box)" <tianyuanyuan@microsoft.com>
Date: Mon, 30 Mar 2026 15:32:20 +0800
Subject: [PATCH 23/31] move benchmark datasets to separate repo
 (YuanyuanTian-hh/diskann-benchmark-data)

---
 .github/workflows/benchmarks-aa.yml | 6 ++++--
 .github/workflows/benchmarks.yml    | 6 ++++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/benchmarks-aa.yml b/.github/workflows/benchmarks-aa.yml
index b272f7c69..c1fd45ad8 100644
--- a/.github/workflows/benchmarks-aa.yml
+++ b/.github/workflows/benchmarks-aa.yml
@@ -76,12 +76,13 @@ jobs:
           sudo apt-get install -y openssl libssl-dev pkg-config
 
       # Download pre-packaged Wikipedia-100K dataset from GitHub Release
+      # Source: https://github.com/harsha-simhadri/big-ann-benchmarks
       - name: Download wikipedia-100K dataset
         env:
           GH_TOKEN: ${{ github.token }}
         run: |
           mkdir -p diskann_rust/target/tmp baseline/target/tmp
-          gh release download benchmark-data-v1 --repo ${{ github.repository }} --pattern 'wikipedia-100K.tar.gz' --dir .
+          gh release download v1 --repo YuanyuanTian-hh/diskann-benchmark-data --pattern 'wikipedia-100K.tar.gz' --dir .
           tar xzf wikipedia-100K.tar.gz -C diskann_rust/target/tmp/
           cp -r diskann_rust/target/tmp/wikipedia_cohere baseline/target/tmp/
 
@@ -181,12 +182,13 @@ jobs:
           sudo apt-get install -y openssl libssl-dev pkg-config
 
       # Download pre-packaged OpenAI ArXiv 100K dataset from GitHub Release
+      # Source: https://github.com/harsha-simhadri/big-ann-benchmarks
       - name: Download openai-100K dataset
         env:
           GH_TOKEN: ${{ github.token }}
         run: |
           mkdir -p diskann_rust/target/tmp baseline/target/tmp
-          gh release download benchmark-data-v1 --repo ${{ github.repository }} --pattern 'openai-100K.tar.gz' --dir .
+          gh release download v1 --repo YuanyuanTian-hh/diskann-benchmark-data --pattern 'openai-100K.tar.gz' --dir .
           tar xzf openai-100K.tar.gz -C diskann_rust/target/tmp/
           cp -r diskann_rust/target/tmp/OpenAIArXiv baseline/target/tmp/
 
diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
index a5c5c3a03..8cdf767a9 100644
--- a/.github/workflows/benchmarks.yml
+++ b/.github/workflows/benchmarks.yml
@@ -93,12 +93,13 @@ jobs:
 
       # Download pre-packaged Wikipedia-100K dataset from GitHub Release
       # Dataset: 100K Cohere Wikipedia embeddings (768-dim, float32, cosine distance)
+      # Source: https://github.com/harsha-simhadri/big-ann-benchmarks
       - name: Download wikipedia-100K dataset
         env:
           GH_TOKEN: ${{ github.token }}
         run: |
           mkdir -p diskann_rust/target/tmp baseline/target/tmp
-          gh release download benchmark-data-v1 --repo ${{ github.repository }} --pattern 'wikipedia-100K.tar.gz' --dir .
+          gh release download v1 --repo YuanyuanTian-hh/diskann-benchmark-data --pattern 'wikipedia-100K.tar.gz' --dir .
           tar xzf wikipedia-100K.tar.gz -C diskann_rust/target/tmp/
           cp -r diskann_rust/target/tmp/wikipedia_cohere baseline/target/tmp/
 
@@ -203,12 +204,13 @@ jobs:
 
       # Download pre-packaged OpenAI ArXiv 100K dataset from GitHub Release
       # Dataset: 100K OpenAI embeddings of ArXiv papers (1536-dim, float32, euclidean distance)
+      # Source: https://github.com/harsha-simhadri/big-ann-benchmarks
       - name: Download openai-100K dataset
         env:
           GH_TOKEN: ${{ github.token }}
         run: |
           mkdir -p diskann_rust/target/tmp baseline/target/tmp
-          gh release download benchmark-data-v1 --repo ${{ github.repository }} --pattern 'openai-100K.tar.gz' --dir .
+          gh release download v1 --repo YuanyuanTian-hh/diskann-benchmark-data --pattern 'openai-100K.tar.gz' --dir .
           tar xzf openai-100K.tar.gz -C diskann_rust/target/tmp/
           cp -r diskann_rust/target/tmp/OpenAIArXiv baseline/target/tmp/
 

From 1b52de23ded0fdf6b570d9481e13831a67c95f42 Mon Sep 17 00:00:00 2001
From: "Yuanyuan Tian (from Dev Box)" <tianyuanyuan@microsoft.com>
Date: Mon, 30 Mar 2026 15:57:22 +0800
Subject: [PATCH 24/31] consolidate 3 benchmark scripts into single
 benchmark_validate.py

Replaces the previous 3-step pipeline (JSON -> CSV -> Markdown -> validate)
with a single script that reads both JSONs directly, compares metrics,
writes Markdown to step summary, checks thresholds, and posts PR comments.

Removed:
- compare_disk_index_json_output.py (JSON diff -> CSV)
- csv_to_markdown.py (CSV -> Markdown)
- benchmark_result_parse.py (CSV -> threshold check)

Also removes pip install csvtomd/numpy/scipy; all scripts now use stdlib only.
---
 .github/scripts/benchmark_result_parse.py     | 574 ------------------
 .github/scripts/benchmark_validate.py         | 425 +++++++++++++
 .../scripts/compare_disk_index_json_output.py | 256 --------
 .github/scripts/csv_to_markdown.py            |  50 --
 .github/workflows/benchmarks-aa.yml           |  48 +-
 .github/workflows/benchmarks.yml              |  57 +-
 6 files changed, 444 insertions(+), 966 deletions(-)
 delete mode 100644 .github/scripts/benchmark_result_parse.py
 create mode 100644 .github/scripts/benchmark_validate.py
 delete mode 100644 .github/scripts/compare_disk_index_json_output.py
 delete mode 100644 .github/scripts/csv_to_markdown.py

diff --git a/.github/scripts/benchmark_result_parse.py b/.github/scripts/benchmark_result_parse.py
deleted file mode 100644
index 27385406d..000000000
--- a/.github/scripts/benchmark_result_parse.py
+++ /dev/null
@@ -1,574 +0,0 @@
-#!/usr/bin/env python3
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT license.
-
-"""
-Benchmark Result Parser for GitHub Actions
-
-Parses benchmark CSV results and validates against thresholds.
-Posts comments to GitHub PRs when regressions are detected.
-
-Usage:
-    python benchmark_result_parse.py --mode pr --file results.csv
-    python benchmark_result_parse.py --mode aa --file results.csv --data search
-
-Environment Variables (for PR comments):
-    GITHUB_TOKEN: GitHub token for API access
-    GITHUB_REPOSITORY: Owner/repo (e.g., "microsoft/DiskANN")
-    GITHUB_PR_NUMBER: Pull request number
-    GITHUB_RUN_ID: Workflow run ID for linking to logs
-"""
-
-import csv
-import os
-import sys
-import argparse
-import json
-from typing import Any
-
-# Optional: requests for posting PR comments
-try:
-    import requests
-    HAS_REQUESTS = True
-except ImportError:
-    HAS_REQUESTS = False
-
-
-# =============================================================================
-# Data Structures
-# =============================================================================
-
-# Template for full benchmark data (build + search)
-DATA_TEMPLATE_FULL = {
-    "DiskIndexBuild-PqConstruction": {
-        "duration_seconds": [],
-        "peak_memory_usage": []
-    },
-    "DiskIndexBuild-InmemIndexBuild": {
-        "duration_seconds": [],
-        "peak_memory_usage": []
-    },
-    "search_disk_index-search_completed": {
-        "duration_seconds": [],
-        "peak_memory_usage": []
-    },
-    "disk_index_perf_test": {
-        "total_duration_seconds": [],
-    },
-    "index-build statistics": {
-        "total_time": [],
-        "total_comparisons": [],
-        "search_hops": []
-    },
-    "search-with-L=2000-bw=4": {
-        "latency_95": [],
-        "mean_latency": [],
-        "mean_io_time": [],
-        "mean_cpus": [],
-        "qps": [],
-        "mean_ios": [],
-        "mean_comps": [],
-        "mean_hops": [],
-        "recall": []
-    },
-    "search-with-L=100-bw=4": {
-        "latency_95": [],
-        "mean_latency": [],
-        "mean_io_time": [],
-        "mean_cpus": [],
-        "qps": [],
-        "mean_ios": [],
-        "mean_comps": [],
-        "mean_hops": [],
-        "recall": []
-    },
-    "search-with-L=200-bw=4": {
-        "latency_95": [],
-        "mean_latency": [],
-        "mean_io_time": [],
-        "mean_cpus": [],
-        "qps": [],
-        "mean_ios": [],
-        "mean_comps": [],
-        "mean_hops": [],
-        "recall": []
-    }
-}
-DATA_TEMPLATE_SEARCH = {
-    "search_disk_index-search_completed": {
-        "duration_seconds": [],
-        "peak_memory_usage": []
-    },
-    "disk_index_perf_test": {
-        "total_duration_seconds": [],
-    },
-    "search-with-L=2000-bw=4": {
-        "latency_95": [],
-        "mean_latency": [],
-        "mean_io_time": [],
-        "mean_cpus": [],
-        "qps": [],
-        "mean_ios": [],
-        "mean_comps": [],
-        "mean_hops": [],
-        "recall": []
-    },
-    "search-with-L=100-bw=4": {
-        "latency_95": [],
-        "mean_latency": [],
-        "mean_io_time": [],
-        "mean_cpus": [],
-        "qps": [],
-        "mean_ios": [],
-        "mean_comps": [],
-        "mean_hops": [],
-        "recall": []
-    },
-    "search-with-L=200-bw=4": {
-        "latency_95": [],
-        "mean_latency": [],
-        "mean_io_time": [],
-        "mean_cpus": [],
-        "qps": [],
-        "mean_ios": [],
-        "mean_comps": [],
-        "mean_hops": [],
-        "recall": []
-    }
-}
-
-# Thresholds for benchmark values
-# Format: [threshold_percentage, direction, contract_value]
-# - threshold_percentage: Maximum allowed deviation percentage
-# - direction: 'GT' = higher is better, 'LT' = lower is better
-# - contract_value: Promised performance value (empty string if none)
-#
-# For 'GT' metrics (like QPS, recall): regression if value decreases beyond threshold
-# For 'LT' metrics (like latency, memory): regression if value increases beyond threshold
-DATA_THRESHOLDS = {
-    "DiskIndexBuild-PqConstruction": {
-        "duration_seconds": [10, 'LT', ""],
-        "peak_memory_usage": [10, 'LT', ""]
-    },
-    "DiskIndexBuild-InmemIndexBuild": {
-        "duration_seconds": [10, 'LT', ""],
-        "peak_memory_usage": [10, 'LT', ""]
-    },
-    "search_disk_index-search_completed": {
-        "duration_seconds": [10, 'LT', ""],
-        "peak_memory_usage": [10, 'LT', 1.42]
-    },
-    "disk_index_perf_test": {
-        "total_duration_seconds": [10, 'LT', ""],
-    },
-    "index-build statistics": {
-        # Calibrated from 5 GitHub runner runs (10 observations):
-        #   Wikipedia: 35.9–37.2s, OpenAI: 23.0–76.4s (SQ_1_2.0 variance)
-        # Contract: worst × 1.5 to absorb shared-runner variance
-        "total_time": [10, 'LT', 115],
-        "total_comparisons": [1, 'LT', ""],
-        "search_hops": [1, 'LT', ""]
-    },
-    "search-with-L=2000-bw=4": {
-        # Calibrated from 5 GitHub runner runs (10 observations):
-        #   QPS: 9.56–9.75 (both datasets)
-        #   Recall: wiki 99.87%, oai 99.67–99.91%
-        #   mean_ios: ~2007 (deterministic)
-        #   mean_comps: wiki ~27609, oai 21618–24733
-        "latency_95": [10, 'LT', ""],
-        "mean_latency": [10, 'LT', ""],
-        "mean_io_time": [10, 'LT', ""],
-        "mean_cpus": [15, 'LT', ""],  # wider threshold — CPU time is noisy on shared runners
-        "qps": [10, 'GT', 6.5],
-        "mean_ios": [1, 'LT', 2410],
-        "mean_comps": [1, 'LT', 33200],
-        "mean_hops": [1, 'LT', ""],
-        "recall": [1, 'GT', 98.0]
-    },
-    "search-with-L=100-bw=4": {
-        "latency_95": [10, 'LT', ""],
-        "mean_latency": [10, 'LT', ""],
-        "mean_io_time": [10, 'LT', ""],
-        "mean_cpus": [15, 'LT', ""],  # wider threshold — CPU time is noisy on shared runners
-        "qps": [10, 'GT', ""],
-        "mean_ios": [10, 'LT', ""],
-        "mean_comps": [10, 'LT', ""],
-        "mean_hops": [10, 'LT', ""],
-        "recall": [1, 'GT', ""]
-    },
-    "search-with-L=200-bw=4": {
-        "latency_95": [10, 'LT', ""],
-        "mean_latency": [10, 'LT', ""],
-        "mean_io_time": [10, 'LT', ""],
-        "mean_cpus": [15, 'LT', ""],  # wider threshold — CPU time is noisy on shared runners
-        "qps": [10, 'GT', ""],
-        "mean_ios": [10, 'LT', ""],
-        "mean_comps": [10, 'LT', ""],
-        "mean_hops": [10, 'LT', ""],
-        "recall": [1, 'GT', ""]
-    }
-}
-
-
-# =============================================================================
-# CSV Parsing
-# =============================================================================
-
-def parse_csv(file_path: str, data: dict[str, dict[str, list]]) -> dict[str, dict[str, list]]:
-    """
-    Parse benchmark CSV file and populate data structure.
-
-    CSV format produced by compare_disk_index_json_output.py:
-        Column 0: Parent Span Name  (category, e.g. "index-build statistics")
-        Column 1: Span Name         (display name, unused for matching)
-        Column 2: Stat Key          (metric key, e.g. "qps")
-        Column 3: Stat Value (Target)
-        Column 4: Stat Value (Baseline)
-        Column 5: Deviation (%)
-    """
-    with open(file_path, 'r', encoding='utf-8') as f:
-        reader = csv.reader(f)
-        next(reader)  # Skip header row
-
-        for row in reader:
-            if len(row) < 6:
-                continue
-
-            category = row[0].strip()
-            metric_name = row[2].strip()
-
-            if category in data and metric_name in data[category]:
-                # Append: [current_value, baseline_value, change_percentage]
-                data[category][metric_name].append(row[3])  # target (current)
-                data[category][metric_name].append(row[4])  # baseline
-                data[category][metric_name].append(row[5])  # deviation %
-
-    return data
-
-
-def get_data_template(data_type: str) -> dict[str, dict[str, list]]:
-    """Get a fresh copy of the data template."""
-    import copy
-    if data_type == 'search':
-        return copy.deepcopy(DATA_TEMPLATE_SEARCH)
-    return copy.deepcopy(DATA_TEMPLATE_FULL)
-
-
-# =============================================================================
-# Threshold Checking
-# =============================================================================
-
-def get_target_change_range(threshold: float, direction: str, mode: str) -> tuple[float, float]:
-    """
-    Calculate acceptable change range based on threshold and direction.
-    
-    Args:
-        threshold: Maximum allowed deviation percentage
-        direction: 'GT' (higher is better) or 'LT' (lower is better)
-        mode: 'aa' (A/A test, symmetric) or 'pr' (PR test, directional)
-    
-    Returns:
-        Tuple of (min_allowed, max_allowed) change percentages
-    """
-    if mode == 'aa':
-        # A/A test: symmetric threshold
-        return (-threshold, threshold)
-    else:
-        # PR test: directional threshold
-        if direction == 'GT':
-            # Higher is better: allow any improvement, flag regressions
-            return (-threshold, float('inf'))
-        else:
-            # Lower is better: allow any improvement (negative change), flag increases
-            return (float('-inf'), threshold)
-
-
-def format_interval(start: float, end: float) -> str:
-    """Format a numeric interval as a string."""
-    start_str = '-inf' if start == float('-inf') else f"{start}%"
-    end_str = 'inf' if end == float('inf') else f"{end}%"
-    return f"({start_str} - {end_str})"
-
-
-def is_change_threshold_failed(change: float, target_range: tuple[float, float]) -> bool:
-    """Check if the change exceeds the allowed threshold range."""
-    return change < target_range[0] or change > target_range[1]
-
-
-def is_promise_broken(current_value: float, target_value: Any, direction: str) -> tuple[bool, str]:
-    """
-    Check if the current value violates a promised contract value.
-    
-    Returns:
-        Tuple of (is_broken, formatted_target_value)
-    """
-    if target_value == "":
-        return False, "N/A"
-    
-    target_value = float(target_value)
-    
-    if direction == 'GT':
-        # Higher is better: current should be >= target
-        if current_value < target_value:
-            return True, f"> {target_value}"
-    else:
-        # Lower is better: current should be <= target
-        if current_value > target_value:
-            return True, f"< {target_value}"
-    
-    return False, str(target_value)
-
-
-def get_outcome_message(threshold_failed: bool, promise_broken: bool) -> str:
-    """Generate human-readable outcome message."""
-    if threshold_failed and promise_broken:
-        return 'Regression detected, Promise broken'
-    elif promise_broken:
-        return 'Promise broken'
-    elif threshold_failed:
-        return 'Regression detected'
-    return 'OK'
-
-
-def check_thresholds(
-    data: dict[str, dict[str, list]],
-    thresholds: dict[str, dict[str, list]],
-    mode: str,
-    run_id: str | None = None
-) -> tuple[bool, str]:
-    """
-    Check all metrics against their thresholds.
-    
-    Returns:
-        Tuple of (has_failures, failure_report_markdown)
-    """
-    failed_rows = []
-    
-    for category in data:
-        for metric in data[category]:
-            # Skip metrics without thresholds defined
-            if category not in thresholds or metric not in thresholds[category]:
-                print(f"Skipping {category}/{metric} - no threshold defined")
-                continue
-            
-            values = data[category][metric]
-            if not values:
-                # No data for this metric in the CSV — skip silently
-                continue
-            
-            # Parse values: [current, baseline, change%]
-            try:
-                value_current = float(values[0])
-                value_baseline = float(values[1])
-                change = float(values[2]) if values[2] else 0.0
-            except (ValueError, IndexError) as e:
-                print(f"ERROR: Failed to parse {category}/{metric}: {e}")
-                return True, f"Parse error for {category}/{metric}"
-            
-            # Get threshold config
-            threshold_config = thresholds[category][metric]
-            threshold_pct = threshold_config[0]
-            direction = threshold_config[1]
-            contract_value = threshold_config[2]
-            
-            # Check thresholds
-            target_range = get_target_change_range(threshold_pct, direction, mode)
-            threshold_failed = is_change_threshold_failed(change, target_range)
-            promise_broken, target_formatted = is_promise_broken(value_current, contract_value, direction)
-            
-            if threshold_failed:
-                print(f"THRESHOLD FAILED: {category}/{metric} change={change}% allowed={format_interval(*target_range)}")
-            if promise_broken:
-                print(f"PROMISE BROKEN: {category}/{metric} value={value_current} required={target_formatted}")
-            
-            if threshold_failed or promise_broken:
-                outcome = get_outcome_message(threshold_failed, promise_broken)
-                failed_rows.append(
-                    f"| {category}/{metric} | {value_baseline} | {value_current} | "
-                    f"{target_formatted} | {change}% | {format_interval(*target_range)} | {outcome} |"
-                )
-    
-    if failed_rows:
-        # Build failure report
-        logs_link = ""
-        if run_id:
-            repo = os.getenv('GITHUB_REPOSITORY', 'microsoft/DiskANN')
-            logs_link = f"https://github.com/{repo}/actions/runs/{run_id}"
-        
-        report = "### ❌ Benchmark Check Failed\n\n"
-        if logs_link:
-            report += f"Please investigate the [workflow logs]({logs_link}) to determine if the failure is due to your changes.\n\n"
-        
-        report += "| Metric | Baseline | Current | Contract | Change | Allowed | Outcome |\n"
-        report += "|--------|----------|---------|----------|--------|---------|--------|\n"
-        report += "\n".join(failed_rows)
-        
-        return True, report
-    
-    return False, ""
-
-
-# =============================================================================
-# GitHub Integration
-# =============================================================================
-
-def post_github_pr_comment(comment: str) -> bool:
-    """
-    Post a comment to a GitHub pull request.
-    
-    Requires environment variables:
-        GITHUB_TOKEN: Personal access token or GitHub Actions token
-        GITHUB_REPOSITORY: Owner/repo format
-        GITHUB_PR_NUMBER: Pull request number
-    """
-    if not HAS_REQUESTS:
-        print("WARNING: 'requests' module not available, cannot post PR comment")
-        return False
-    
-    token = os.getenv('GITHUB_TOKEN')
-    repo = os.getenv('GITHUB_REPOSITORY')
-    pr_number = os.getenv('GITHUB_PR_NUMBER')
-    
-    if not all([token, repo, pr_number]):
-        print("WARNING: Missing GitHub environment variables for PR comment")
-        print(f"  GITHUB_TOKEN: {'set' if token else 'missing'}")
-        print(f"  GITHUB_REPOSITORY: {repo or 'missing'}")
-        print(f"  GITHUB_PR_NUMBER: {pr_number or 'missing'}")
-        return False
-    
-    url = f"https://api.github.com/repos/{repo}/issues/{pr_number}/comments"
-    headers = {
-        "Accept": "application/vnd.github+json",
-        "Authorization": f"Bearer {token}",
-        "X-GitHub-Api-Version": "2022-11-28"
-    }
-    body = {"body": comment}
-    
-    try:
-        response = requests.post(url, headers=headers, json=body, timeout=30)
-        response.raise_for_status()
-        print(f"Successfully posted comment to PR #{pr_number}")
-        return True
-    except requests.RequestException as e:
-        print(f"ERROR: Failed to post PR comment: {e}")
-        return False
-
-
-def write_github_step_summary(content: str) -> None:
-    """Write content to GitHub Actions step summary."""
-    summary_file = os.getenv('GITHUB_STEP_SUMMARY')
-    if summary_file:
-        with open(summary_file, 'a', encoding='utf-8') as f:
-            f.write(content)
-            f.write("\n")
-
-
-def write_github_output(name: str, value: str) -> None:
-    """Write an output variable for GitHub Actions."""
-    output_file = os.getenv('GITHUB_OUTPUT')
-    if output_file:
-        with open(output_file, 'a', encoding='utf-8') as f:
-            f.write(f"{name}={value}\n")
-
-
-# =============================================================================
-# Main
-# =============================================================================
-
-def parse_args() -> argparse.Namespace:
-    parser = argparse.ArgumentParser(
-        description='Parse benchmark results and validate against thresholds.',
-        formatter_class=argparse.RawDescriptionHelpFormatter,
-        epilog="""
-Examples:
-    # Check PR benchmark results
-    python benchmark_result_parse.py --mode pr --file results_change.csv
-    
-    # Check A/A test results (symmetric thresholds)
-    python benchmark_result_parse.py --mode aa --file results_change.csv
-    
-    # Check search-only benchmarks
-    python benchmark_result_parse.py --mode pr --file results_change.csv --data search
-        """
-    )
-    parser.add_argument(
-        '--mode',
-        type=str,
-        default='aa',
-        choices=['aa', 'pr', 'lkg'],
-        help='Benchmark mode: aa=A/A test (symmetric), pr=PR test (directional), lkg=last known good'
-    )
-    parser.add_argument(
-        '--data',
-        type=str,
-        default='both',
-        choices=['both', 'search'],
-        help='Type of benchmark data: both=full benchmark, search=search-only'
-    )
-    parser.add_argument(
-        '--file',
-        type=str,
-        default=None,
-        help='Path to CSV file (overrides FILE_PATH env var)'
-    )
-    parser.add_argument(
-        '--no-comment',
-        action='store_true',
-        help='Skip posting PR comment even in pr mode'
-    )
-    return parser.parse_args()
-
-
-def main() -> int:
-    args = parse_args()
-    
-    # Get file path
-    file_path = args.file or os.getenv('FILE_PATH')
-    if not file_path:
-        print("ERROR: No input file specified. Use --file or set FILE_PATH env var.")
-        return 1
-    
-    if not os.path.exists(file_path):
-        print(f"ERROR: File not found: {file_path}")
-        return 1
-    
-    print(f"Benchmark mode: {args.mode}")
-    print(f"Data type: {args.data}")
-    print(f"Input file: {file_path}")
-    
-    # Parse CSV
-    data_template = get_data_template(args.data)
-    data = parse_csv(file_path, data_template)
-    
-    # Debug output
-    print("\nParsed data:")
-    print(json.dumps({k: {sk: sv for sk, sv in v.items() if sv} for k, v in data.items() if any(v.values())}, indent=2))
-    
-    # Check thresholds
-    run_id = os.getenv('GITHUB_RUN_ID')
-    has_failures, report = check_thresholds(data, DATA_THRESHOLDS, args.mode, run_id)
-    
-    if has_failures:
-        print("\n" + report)
-        
-        # Write to GitHub step summary
-        write_github_step_summary(report)
-        
-        # Post PR comment if in pr mode
-        if args.mode == 'pr' and not args.no_comment:
-            post_github_pr_comment(report)
-        
-        # Set output for downstream steps
-        write_github_output('benchmark_failed', 'true')
-        
-        return 1
-    
-    print("\n✅ All benchmark values passed!")
-    write_github_step_summary("### ✅ Benchmark Check Passed\n\nAll metrics within acceptable thresholds.")
-    write_github_output('benchmark_failed', 'false')
-    
-    return 0
-
-
-if __name__ == "__main__":
-    sys.exit(main())
diff --git a/.github/scripts/benchmark_validate.py b/.github/scripts/benchmark_validate.py
new file mode 100644
index 000000000..cb69f6054
--- /dev/null
+++ b/.github/scripts/benchmark_validate.py
@@ -0,0 +1,425 @@
+#!/usr/bin/env python3
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT license.
+
+"""
+Benchmark Validator for GitHub Actions
+
+Compares two benchmark JSON outputs (baseline vs target), checks thresholds,
+writes a Markdown summary, and optionally posts a PR comment on failure.
+
+This single script replaces the previous three-step pipeline:
+  compare_disk_index_json_output.py → csv_to_markdown.py → benchmark_result_parse.py
+
+Usage:
+    # PR mode (directional thresholds, posts PR comment on failure)
+    python benchmark_validate.py --mode pr --baseline baseline.json --target target.json
+
+    # A/A mode (symmetric thresholds)
+    python benchmark_validate.py --mode aa --baseline baseline.json --target target.json
+
+Environment Variables (for PR comments):
+    GITHUB_TOKEN: GitHub token for API access
+    GITHUB_REPOSITORY: Owner/repo (e.g., "microsoft/DiskANN")
+    GITHUB_PR_NUMBER: Pull request number
+    GITHUB_RUN_ID: Workflow run ID for linking to logs
+    GITHUB_STEP_SUMMARY: Path to step summary file
+"""
+
+import json
+import os
+import sys
+import argparse
+from typing import Any
+from urllib.request import urlopen, Request
+from urllib.error import URLError
+
+
+# =============================================================================
+# JSON Extraction
+# =============================================================================
+
+def load_json(path: str) -> list[dict[str, Any]]:
+    with open(path, mode="r", encoding="utf-8") as handle:  # always UTF-8, never the locale default
+        return json.load(handle)  # decoded document is returned as-is; its shape is not checked
+
+
+def extract_build_metrics(results: dict) -> dict[str, float]:
+    """Return build timings in seconds from ``results``; ``{}`` if it has no ``build`` data."""
+    build_section = results.get("build", {})
+    if not build_section:
+        return {}
+
+    # (span name in the build output, key used in the returned mapping)
+    span_keys = (
+        ("DiskIndexBuild-PqConstruction", "pq_construction_time"),
+        ("DiskIndexBuild-InmemIndexBuild", "inmem_index_build_time"),
+        ("DiskIndexBuild-DiskLayout", "disk_layout_time"),
+    )
+    collected: dict[str, float] = {}
+    total_us = build_section.get("build_time")
+    if total_us:  # missing, None and 0 all count as "not reported"
+        collected["total_time"] = total_us / 1e6  # μs → s
+    for entry in build_section.get("span_metrics", {}).get("spans", []):
+        label = entry.get("span_name", "")
+        for span_name, key in span_keys:
+            if label == span_name:
+                collected[key] = entry.get("metrics", {}).get("duration_seconds", 0)
+    return collected
+
+
+def extract_search_metrics(results: dict, search_l: int, beam_width: int) -> dict[str, float]:
+    """Gather the search metrics recorded for one ``search_l`` / ``beam_width`` pair.
+
+    Values come from the first ``search_results_per_l`` entry whose ``search_l``
+    matches; the first span named ``search-with-L=<search_l>-bw=<beam_width>`` then
+    overrides every key it reports except ``latency_95``. ``{}`` without ``search`` data.
+    """
+    search_section = results.get("search", {})
+    if not search_section:
+        return {}
+    # (key in the returned mapping, field name inside a search_results_per_l entry)
+    per_l_fields = (
+        ("qps", "qps"), ("recall", "recall"), ("mean_latency", "mean_latency"),
+        ("mean_ios", "mean_ios"), ("mean_comps", "mean_comparisons"),
+        ("mean_hops", "mean_hops"), ("mean_io_time", "mean_io_time"),
+        ("mean_cpus", "mean_cpu_time"),
+        # NOTE(review): "latency_95" is filled from "p999_latency" — confirm the mismatch is intended.
+        ("latency_95", "p999_latency"),
+    )
+    found: dict[str, float] = {}
+    for row in search_section.get("search_results_per_l", []):
+        if row.get("search_l") == search_l:
+            found = {out_key: row.get(field, 0) for out_key, field in per_l_fields}
+            break
+
+    wanted_span = f"search-with-L={search_l}-bw={beam_width}"
+    for entry in search_section.get("span_metrics", {}).get("spans", []):
+        if entry.get("span_name") == wanted_span:
+            span_values = entry.get("metrics", {})
+            # latency_95 (the last pair) is never taken from a span.
+            found.update((k, span_values[k]) for k, _ in per_l_fields[:-1] if k in span_values)
+            break
+    return found
+
+
+def compute_diff(baseline_json: list[dict], target_json: list[dict]) -> list[dict]:
+    """
+    Pair baseline and target entries by position and diff their metrics.
+
+    Only the first value of the target's ``search_list`` (2000 when absent or
+    empty) is compared, using the target's ``beam_width`` (default 4). A metric
+    missing on one side counts as 0, and the deviation is reported as 0 whenever
+    the baseline value is falsy (0 or missing).
+
+    Returns a flat list of metric diffs:
+        [{category, metric, baseline, target, deviation}, ...]
+    """
+    # Keys are listed in the order their rows are emitted.
+    build_keys = ("total_time", "pq_construction_time", "inmem_index_build_time", "disk_layout_time")
+    search_keys = ("qps", "recall", "mean_latency", "latency_95", "mean_ios",
+                   "mean_comps", "mean_hops", "mean_io_time", "mean_cpus")
+
+    diffs: list[dict] = []
+
+    def add_rows(category: str, keys: tuple, base: dict, new: dict) -> None:
+        # One row per key present on at least one side.
+        for name in keys:
+            if name not in new and name not in base:
+                continue
+            old_value, new_value = base.get(name, 0), new.get(name, 0)
+            percent = ((new_value - old_value) / old_value * 100) if old_value else 0
+            diffs.append({
+                "category": category,
+                "metric": name,
+                "baseline": old_value,
+                "target": new_value,
+                "deviation": percent,
+            })
+
+    # zip() stops at the shorter input, so unmatched trailing entries are ignored.
+    for base_entry, target_entry in zip(baseline_json, target_json):
+        base_results = base_entry.get("results", {})
+        target_results = target_entry.get("results", {})
+
+        # Search parameters are read from the target entry only.
+        phase = target_entry.get("input", {}).get("content", {}).get("search_phase", {})
+        l_values = phase.get("search_list", [2000])
+        bw = phase.get("beam_width", 4)
+        first_l = l_values[0] if l_values else 2000
+
+        # Build rows first, then the rows for the selected search configuration.
+        add_rows("index-build statistics", build_keys,
+                 extract_build_metrics(base_results), extract_build_metrics(target_results))
+        add_rows(f"search-with-L={first_l}-bw={bw}", search_keys,
+                 extract_search_metrics(base_results, first_l, bw),
+                 extract_search_metrics(target_results, first_l, bw))
+
+    return diffs
+
+
+# =============================================================================
+# Thresholds
+# =============================================================================
+
+# THRESHOLDS[category][metric] = [max_deviation_%, direction, contract_value]
+#   direction: 'GT' = higher is better, 'LT' = lower is better
+#   contract_value: absolute limit on the target value (empty string = none)
+THRESHOLDS: dict[str, dict[str, list]] = {
+    "DiskIndexBuild-PqConstruction": {  # NOTE(review): compute_diff() emits no rows for this or the next three categories — confirm still needed
+        "duration_seconds": [10, "LT", ""],
+        "peak_memory_usage": [10, "LT", ""],
+    },
+    "DiskIndexBuild-InmemIndexBuild": {
+        "duration_seconds": [10, "LT", ""],
+        "peak_memory_usage": [10, "LT", ""],
+    },
+    "search_disk_index-search_completed": {
+        "duration_seconds": [10, "LT", ""],
+        "peak_memory_usage": [10, "LT", 1.42],
+    },
+    "disk_index_perf_test": {
+        "total_duration_seconds": [10, "LT", ""],
+    },
+    "index-build statistics": {
+        # Calibrated from 5 GitHub runner runs (10 observations):
+        #   Wikipedia: 35.9–37.2s, OpenAI: 23.0–76.4s (SQ_1_2.0 variance)
+        #   Contract: worst × 1.5 to absorb shared-runner variance
+        "total_time": [10, "LT", 115],
+        "total_comparisons": [1, "LT", ""],  # NOTE(review): not a build metric compute_diff() emits — confirm
+        "search_hops": [1, "LT", ""],  # NOTE(review): not a build metric compute_diff() emits — confirm
+    },
+    "search-with-L=2000-bw=4": {
+        # Calibrated from 5 GitHub runner runs (10 observations)
+        "latency_95": [10, "LT", ""],
+        "mean_latency": [10, "LT", ""],
+        "mean_io_time": [10, "LT", ""],
+        "mean_cpus": [15, "LT", ""],   # wider — CPU time is noisy on shared runners
+        "qps": [10, "GT", 6.5],
+        "mean_ios": [1, "LT", 2410],
+        "mean_comps": [1, "LT", 33200],
+        "mean_hops": [1, "LT", ""],
+        "recall": [1, "GT", 98.0],
+    },
+    "search-with-L=100-bw=4": {
+        "latency_95": [10, "LT", ""],
+        "mean_latency": [10, "LT", ""],
+        "mean_io_time": [10, "LT", ""],
+        "mean_cpus": [15, "LT", ""],
+        "qps": [10, "GT", ""],
+        "mean_ios": [10, "LT", ""],
+        "mean_comps": [10, "LT", ""],
+        "mean_hops": [10, "LT", ""],
+        "recall": [1, "GT", ""],
+    },
+    "search-with-L=200-bw=4": {
+        "latency_95": [10, "LT", ""],
+        "mean_latency": [10, "LT", ""],
+        "mean_io_time": [10, "LT", ""],
+        "mean_cpus": [15, "LT", ""],
+        "qps": [10, "GT", ""],
+        "mean_ios": [10, "LT", ""],
+        "mean_comps": [10, "LT", ""],
+        "mean_hops": [10, "LT", ""],
+        "recall": [1, "GT", ""],
+    },
+}
+
+
+def allowed_range(threshold: float, direction: str, mode: str) -> tuple[float, float]:
+    """Return the (low, high) window of acceptable change in percent; "aa" mode is symmetric."""
+    symmetric = mode == "aa"
+    # Outside "aa" mode only the worsening side is bounded: a drop for 'GT', a rise otherwise.
+    low = -threshold if symmetric or direction == "GT" else float("-inf")
+    high = threshold if symmetric or direction != "GT" else float("inf")
+    return (low, high)
+
+
+def fmt_range(lo: float, hi: float) -> str:
+    """Render the range as "(lo% – hi%)"; an infinite bound is printed bare, without "%"."""
+    bounds = ["-inf" if lo == float("-inf") else f"{lo}%", "inf" if hi == float("inf") else f"{hi}%"]
+    return "(" + " – ".join(bounds) + ")"
+
+
+def check_contract(value: float, contract: Any, direction: str) -> tuple[bool, str]:
+    """Compare ``value`` with an absolute limit; ``""`` means no limit is defined.
+
+    Returns (broken, text): text is "N/A" without a limit, the requirement
+    ("> limit" / "< limit") when it is broken, and the plain limit otherwise.
+    """
+    if contract == "":
+        return False, "N/A"
+    limit = float(contract)
+    broken = (direction == "GT" and value < limit) or (direction == "LT" and value > limit)
+    return (True, f"{'>' if direction == 'GT' else '<'} {limit}") if broken else (False, str(limit))
+
+
+# =============================================================================
+# Validation
+# =============================================================================
+
+def validate(diffs: list[dict], mode: str, run_id: str | None) -> tuple[bool, str]:
+    """
+    Evaluate each diff row against ``THRESHOLDS`` and build the failure report.
+
+    Rows whose category/metric pair has no ``THRESHOLDS`` entry are ignored. A row
+    fails when its deviation leaves the allowed range for ``mode`` or when its
+    target value breaks the contract; each failure is also printed.
+
+    Returns (has_failures, markdown_report); the report is "" without failures
+    and links to the workflow run when ``run_id`` is truthy.
+    """
+    table_rows: list[str] = []
+
+    for row in diffs:
+        category, name = row["category"], row["metric"]
+        if category not in THRESHOLDS or name not in THRESHOLDS[category]:
+            continue
+
+        max_pct, direction, contract = THRESHOLDS[category][name]
+        low, high = window = allowed_range(max_pct, direction, mode)
+        deviation = row["deviation"]
+        label = f"{category}/{name}"
+        out_of_range = deviation < low or deviation > high
+        contract_broken, contract_text = check_contract(row["target"], contract, direction)
+
+        if out_of_range:
+            print(f"THRESHOLD FAILED: {label} change={deviation:.2f}% allowed={fmt_range(*window)}")
+        if contract_broken:
+            print(f"CONTRACT BROKEN:  {label} value={row['target']} required={contract_text}")
+        if not (out_of_range or contract_broken):
+            continue
+
+        verdicts = [text for failed, text in ((out_of_range, "Regression detected"),
+                                              (contract_broken, "Contract broken")) if failed]
+        table_rows.append(
+            f"| {label} | {row['baseline']:.4g} | {row['target']:.4g} | "
+            f"{contract_text} | {deviation:.2f}% | {fmt_range(*window)} | {', '.join(verdicts)} |"
+        )
+
+    if not table_rows:
+        return False, ""
+
+    report_parts = ["### ❌ Benchmark Check Failed\n\n"]
+    if run_id:
+        repo = os.getenv("GITHUB_REPOSITORY", "microsoft/DiskANN")
+        logs_link = f"https://github.com/{repo}/actions/runs/{run_id}"
+        report_parts.append(f"Please investigate the [workflow logs]({logs_link}) to determine if the failure is due to your changes.\n\n")
+    report_parts.append("| Metric | Baseline | Current | Contract | Change | Allowed | Outcome |\n")
+    report_parts.append("|--------|----------|---------|----------|--------|---------|--------|\n")
+    report_parts.append("\n".join(table_rows))
+    return True, "".join(report_parts)
+
+
+# =============================================================================
+# Markdown output
+# =============================================================================
+
+def diffs_to_markdown(diffs: list[dict], title: str) -> str:
+    """Build a Markdown section: a ``###`` heading for ``title`` plus one table row per diff.
+
+    Baseline and target use ``.4g`` formatting; the deviation is signed with two decimals.
+    """
+    heading = f"### {title}"
+    table_head = "| Category | Metric | Baseline | Current | Change |"
+    table_rule = "|----------|--------|----------|---------|--------|"
+    body = [
+        f"| {row['category']} | {row['metric']} | {row['baseline']:.4g} | "
+        f"{row['target']:.4g} | {row['deviation']:+.2f}% |"
+        for row in diffs
+    ]
+    return "\n".join([heading, "", table_head, table_rule, *body])
+
+
+# =============================================================================
+# GitHub helpers (stdlib only — no requests dependency)
+# =============================================================================
+
+def post_pr_comment(body: str) -> bool:
+    """Best-effort POST of ``body`` as a PR comment; returns True only when it was accepted."""
+    token = os.getenv("GITHUB_TOKEN")
+    repo = os.getenv("GITHUB_REPOSITORY")
+    pr = os.getenv("GITHUB_PR_NUMBER")
+    if not all([token, repo, pr]):
+        print("WARNING: Missing GitHub env vars for PR comment "
+              f"(TOKEN={'set' if token else 'missing'}, REPO={repo or 'missing'}, PR={pr or 'missing'})")
+        return False
+    url = f"https://api.github.com/repos/{repo}/issues/{pr}/comments"
+    req = Request(url, data=json.dumps({"body": body}).encode(), method="POST", headers={
+        "Accept": "application/vnd.github+json",
+        "Authorization": f"Bearer {token}",
+        "X-GitHub-Api-Version": "2022-11-28",
+        "Content-Type": "application/json",
+    })
+    try:
+        with urlopen(req, timeout=30) as resp:
+            if resp.status < 300:
+                print(f"Posted comment to PR #{pr}")
+                return True
+            print(f"ERROR posting PR comment: unexpected HTTP status {resp.status}")
+    except (URLError, OSError) as e:  # OSError also covers timeouts raised while reading the reply
+        print(f"ERROR posting PR comment: {e}")
+    return False
+
+
+def write_step_summary(content: str) -> None:
+    """Append ``content`` plus a newline to the file named by a non-empty $GITHUB_STEP_SUMMARY."""
+    if summary_path := os.getenv("GITHUB_STEP_SUMMARY"):
+        with open(summary_path, mode="a", encoding="utf-8") as summary:
+            summary.write(content + "\n")
+
+
+# =============================================================================
+# Main
+# =============================================================================
+
+def main() -> int:
+    """CLI entry point; returns the exit code (0 = all checks passed, 1 = bad input or a failure)."""
+    parser = argparse.ArgumentParser(
+        description="Compare two benchmark JSONs, validate thresholds, output Markdown."
+    )
+    parser.add_argument("--mode", choices=["aa", "pr"], default="aa",
+                        help="aa = symmetric thresholds, pr = directional")
+    parser.add_argument("--baseline", required=True, help="Baseline JSON path")
+    parser.add_argument("--target", required=True, help="Target JSON path")
+    parser.add_argument("--title", default="Benchmark Results",
+                        help="Title for the Markdown summary table")
+    parser.add_argument("--no-comment", action="store_true",
+                        help="Skip posting PR comment on failure")
+    args = parser.parse_args()
+
+    print(f"Mode: {args.mode}")
+    print(f"Baseline: {args.baseline}")
+    print(f"Target:   {args.target}")
+
+    try:
+        baseline = load_json(args.baseline)
+        target = load_json(args.target)
+    except (OSError, ValueError) as e:  # unreadable file, bad encoding or malformed JSON
+        print(f"ERROR: Cannot load benchmark JSON: {e}")
+        return 1
+
+    if len(baseline) != len(target):
+        print(f"ERROR: JSON arrays differ in length: {len(baseline)} vs {len(target)}")
+        return 1
+
+    diffs = compute_diff(baseline, target)
+    print(f"\nCompared {len(diffs)} metrics")
+    # The comparison table is written to the step summary before validation runs.
+    write_step_summary(diffs_to_markdown(diffs, args.title))
+
+    # Validate thresholds
+    has_failures, report = validate(diffs, args.mode, os.getenv("GITHUB_RUN_ID"))
+    if has_failures:
+        print("\n" + report)
+        write_step_summary(report)
+        if args.mode == "pr" and not args.no_comment:
+            post_pr_comment(report)  # best effort: its result does not change the exit code
+        return 1
+
+    print("\n✅ All metrics within thresholds")
+    write_step_summary("### ✅ Benchmark Check Passed\n\nAll metrics within acceptable thresholds.")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())  # main()'s integer return value becomes the process exit status
diff --git a/.github/scripts/compare_disk_index_json_output.py b/.github/scripts/compare_disk_index_json_output.py
deleted file mode 100644
index ca9c9d26b..000000000
--- a/.github/scripts/compare_disk_index_json_output.py
+++ /dev/null
@@ -1,256 +0,0 @@
-#!/usr/bin/env python3
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT license.
-
-"""
-Compare two disk-index benchmark JSON files and emit a diff CSV.
-
-This script takes baseline and branch (target) JSON files from the benchmark crate's
-disk-index benchmarks and produces a CSV file comparing the metrics with deviation percentages.
-
-The output format matches the CSV structure expected by benchmark_result_parse.py:
-  Parent Span Name, Span Name, Stat Key, Stat Value (Target), Stat Value (Baseline), Deviation (%)
-
-Usage:
-    python compare_disk_index_json_output.py \\
-        --baseline baseline/target/tmp/<dataset>_benchmark_crate_baseline.json \\
-        --branch diskann_rust/target/tmp/<dataset>_benchmark_crate_target.json \\
-        --out diskann_rust/target/tmp/<dataset>_change.csv
-"""
-
-import json
-import csv
-import argparse
-from typing import List, Dict, Any, Optional
-
-
-def load_json(path: str) -> List[Dict[str, Any]]:
-    """Load JSON file and return the parsed content."""
-    with open(path, "r", encoding="utf-8") as f:
-        return json.load(f)
-
-
-def calc_deviation(baseline: float, target: float) -> str:
-    """Calculate the percentage deviation from baseline to target."""
-    try:
-        if baseline != 0:
-            dev = ((target - baseline) / baseline) * 100
-            return f"{dev:.2f}"
-        return ""
-    except Exception:
-        return ""
-
-
-def extract_build_metrics(results: Dict[str, Any]) -> Dict[str, Any]:
-    """Extract build metrics from the results structure."""
-    if not results:
-        return {}
-
-    build = results.get("build", {})
-    if not build:
-        return {}
-
-    metrics = {}
-
-    # Total build time (in seconds)
-    build_time = build.get("build_time")
-    if build_time:
-        # build_time is in microseconds, convert to seconds
-        metrics["total_time"] = build_time / 1e6
-
-    # Extract span metrics
-    span_metrics = build.get("span_metrics", {})
-    spans = span_metrics.get("spans", [])
-
-    for span in spans:
-        span_name = span.get("span_name", "")
-        span_data = span.get("metrics", {})
-
-        if span_name == "DiskIndexBuild-PqConstruction":
-            metrics["pq_construction_time"] = span_data.get("duration_seconds", 0)
-        elif span_name == "DiskIndexBuild-InmemIndexBuild":
-            metrics["inmem_index_build_time"] = span_data.get("duration_seconds", 0)
-        elif span_name == "DiskIndexBuild-DiskLayout":
-            metrics["disk_layout_time"] = span_data.get("duration_seconds", 0)
-        elif span_name == "disk-index-build":
-            metrics["total_build_duration"] = span_data.get("duration_seconds", 0)
-
-    return metrics
-
-
-def extract_search_metrics(results: Dict[str, Any], search_l: int, beam_width: int) -> Dict[str, Any]:
-    """Extract search metrics for a specific search_l value."""
-    if not results:
-        return {}
-
-    search = results.get("search", {})
-    if not search:
-        return {}
-
-    metrics = {}
-
-    # Find the search result for the specified search_l
-    search_results = search.get("search_results_per_l", [])
-    for sr in search_results:
-        if sr.get("search_l") == search_l:
-            metrics["qps"] = sr.get("qps", 0)
-            metrics["recall"] = sr.get("recall", 0)
-            metrics["mean_latency"] = sr.get("mean_latency", 0)
-            metrics["mean_ios"] = sr.get("mean_ios", 0)
-            metrics["mean_comps"] = sr.get("mean_comparisons", 0)
-            metrics["mean_hops"] = sr.get("mean_hops", 0)
-            metrics["mean_io_time"] = sr.get("mean_io_time", 0)
-            metrics["mean_cpus"] = sr.get("mean_cpu_time", 0)
-            metrics["latency_95"] = sr.get("p999_latency", 0)  # Use p999 as proxy for 95th percentile
-            break
-
-    # Also try span metrics
-    span_metrics = search.get("span_metrics", {})
-    spans = span_metrics.get("spans", [])
-
-    search_span_name = f"search-with-L={search_l}-bw={beam_width}"
-    for span in spans:
-        if span.get("span_name") == search_span_name:
-            span_data = span.get("metrics", {})
-            # Override with span metrics if they exist
-            if "qps" in span_data:
-                metrics["qps"] = span_data["qps"]
-            if "recall" in span_data:
-                metrics["recall"] = span_data["recall"]
-            if "mean_latency" in span_data:
-                metrics["mean_latency"] = span_data["mean_latency"]
-            if "mean_ios" in span_data:
-                metrics["mean_ios"] = span_data["mean_ios"]
-            if "mean_comps" in span_data:
-                metrics["mean_comps"] = span_data["mean_comps"]
-            if "mean_hops" in span_data:
-                metrics["mean_hops"] = span_data["mean_hops"]
-            if "mean_io_time" in span_data:
-                metrics["mean_io_time"] = span_data["mean_io_time"]
-            if "mean_cpus" in span_data:
-                metrics["mean_cpus"] = span_data["mean_cpus"]
-            break
-
-    return metrics
-
-
-def make_rows(baseline_list: List[Dict], target_list: List[Dict]) -> List[List[str]]:
-    """Generate comparison rows for the CSV output."""
-    rows = []
-
-    for baseline, target in zip(baseline_list, target_list):
-        baseline_results = baseline.get("results", {})
-        target_results = target.get("results", {})
-
-        # Get input info for context
-        inp = target.get("input", {})
-        content = inp.get("content", {})
-        search_phase = content.get("search_phase", {})
-
-        # Determine search_l and beam_width for search metrics
-        search_list = search_phase.get("search_list", [2000])
-        beam_width = search_phase.get("beam_width", 4)
-
-        # Use the first (or primary) search_l value
-        primary_search_l = search_list[0] if search_list else 2000
-
-        # Extract build metrics
-        baseline_build = extract_build_metrics(baseline_results)
-        target_build = extract_build_metrics(target_results)
-
-        # Build metrics rows
-        build_metrics = [
-            ("total_time", "total build time (s)"),
-            ("pq_construction_time", "PQ construction (s)"),
-            ("inmem_index_build_time", "in-memory index build (s)"),
-            ("disk_layout_time", "disk layout (s)"),
-        ]
-
-        for key, display_name in build_metrics:
-            if key in target_build or key in baseline_build:
-                target_val = target_build.get(key, 0)
-                baseline_val = baseline_build.get(key, 0)
-                rows.append([
-                    "index-build statistics",
-                    display_name,
-                    key,
-                    str(target_val),
-                    str(baseline_val),
-                    calc_deviation(baseline_val, target_val)
-                ])
-
-        # Extract search metrics for the primary search_l
-        baseline_search = extract_search_metrics(baseline_results, primary_search_l, beam_width)
-        target_search = extract_search_metrics(target_results, primary_search_l, beam_width)
-
-        search_span_name = f"search-with-L={primary_search_l}-bw={beam_width}"
-
-        # Search metrics rows
-        search_metrics = [
-            ("qps", "queries per second"),
-            ("recall", "recall (%)"),
-            ("mean_latency", "mean latency (μs)"),
-            ("latency_95", "p999 latency (μs)"),
-            ("mean_ios", "mean IOs"),
-            ("mean_comps", "mean comparisons"),
-            ("mean_hops", "mean hops"),
-            ("mean_io_time", "mean IO time (μs)"),
-            ("mean_cpus", "mean CPU time (μs)"),
-        ]
-
-        for key, display_name in search_metrics:
-            if key in target_search or key in baseline_search:
-                target_val = target_search.get(key, 0)
-                baseline_val = baseline_search.get(key, 0)
-                rows.append([
-                    search_span_name,
-                    display_name,
-                    key,
-                    str(target_val),
-                    str(baseline_val),
-                    calc_deviation(baseline_val, target_val)
-                ])
-
-    return rows
-
-
-def write_csv(rows: List[List[str]], out_path: str):
-    """Write the comparison rows to a CSV file."""
-    header = [
-        "Parent Span Name",
-        "Span Name",
-        "Stat Key",
-        "Stat Value (Target)",
-        "Stat Value (Baseline)",
-        "Deviation (%)"
-    ]
-    with open(out_path, "w", newline="", encoding="utf-8") as f:
-        writer = csv.writer(f)
-        writer.writerow(header)
-        writer.writerows(rows)
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        description="Compare two disk-index benchmark JSONs and emit a diff CSV."
-    )
-    parser.add_argument("--baseline", "-b", required=True, help="Path to baseline JSON")
-    parser.add_argument("--branch", "-r", required=True, help="Path to branch/target JSON")
-    parser.add_argument("--out", "-o", required=True, help="Where to write output CSV")
-    args = parser.parse_args()
-
-    baseline_list = load_json(args.baseline)
-    target_list = load_json(args.branch)
-
-    if len(baseline_list) != len(target_list):
-        raise ValueError(
-            f"baseline/branch JSON arrays differ in length: {len(baseline_list)} vs {len(target_list)}"
-        )
-
-    rows = make_rows(baseline_list, target_list)
-    write_csv(rows, args.out)
-    print(f"✓ Written diff CSV to {args.out}")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/.github/scripts/csv_to_markdown.py b/.github/scripts/csv_to_markdown.py
deleted file mode 100644
index 885a20208..000000000
--- a/.github/scripts/csv_to_markdown.py
+++ /dev/null
@@ -1,50 +0,0 @@
-#!/usr/bin/env python3
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT license.
-
-"""Convert a CSV file to a Markdown table and optionally append to GitHub Step Summary."""
-
-import argparse
-import csv
-import os
-import sys
-
-
-def csv_to_markdown(csv_path: str) -> str:
-    """Convert a CSV file to a Markdown table string."""
-    with open(csv_path) as f:
-        rows = list(csv.reader(f))
-    if len(rows) < 2:
-        return ""
-    header = rows[0]
-    sep = ["---"] * len(header)
-    return "\n".join(" | ".join(r) for r in [header, sep] + rows[1:])
-
-
-def main() -> int:
-    parser = argparse.ArgumentParser(description=__doc__)
-    parser.add_argument("--csv", required=True, help="Input CSV file path")
-    parser.add_argument("--md", required=True, help="Output Markdown file path")
-    parser.add_argument("--title", default="", help="Section title for GitHub Step Summary")
-    args = parser.parse_args()
-
-    md = csv_to_markdown(args.csv)
-    if not md:
-        print("No data")
-        return 0
-
-    with open(args.md, "w") as f:
-        f.write(md + "\n")
-
-    # Append to GitHub Step Summary if available
-    summary_path = os.environ.get("GITHUB_STEP_SUMMARY")
-    if summary_path and args.title:
-        with open(summary_path, "a") as f:
-            f.write(f"### {args.title}\n")
-            f.write(md + "\n")
-
-    return 0
-
-
-if __name__ == "__main__":
-    sys.exit(main())
diff --git a/.github/workflows/benchmarks-aa.yml b/.github/workflows/benchmarks-aa.yml
index c1fd45ad8..d03f123f8 100644
--- a/.github/workflows/benchmarks-aa.yml
+++ b/.github/workflows/benchmarks-aa.yml
@@ -100,27 +100,13 @@ jobs:
             run --input-file diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json \
             --output-file target/tmp/wikipedia-100K_benchmark_crate_target.json
 
-      - name: Generate diff stats (baseline vs target)
-        run: |
-          python diskann_rust/.github/scripts/compare_disk_index_json_output.py \
-            --baseline baseline/target/tmp/wikipedia-100K_benchmark_crate_baseline.json \
-            --branch diskann_rust/target/tmp/wikipedia-100K_benchmark_crate_target.json \
-            --out diskann_rust/target/tmp/wikipedia-100K_change.csv
-
-      - name: Convert results to Markdown
-        working-directory: diskann_rust
-        run: |
-          python .github/scripts/csv_to_markdown.py \
-            --csv target/tmp/wikipedia-100K_change.csv \
-            --md target/tmp/wikipedia-100K_change.md \
-            --title 'A/A Results: Wikipedia-100K Dataset'
-
       - name: Validate benchmark results
-        working-directory: diskann_rust
         run: |
-          python .github/scripts/benchmark_result_parse.py \
+          python diskann_rust/.github/scripts/benchmark_validate.py \
             --mode aa \
-            --file target/tmp/wikipedia-100K_change.csv
+            --baseline baseline/target/tmp/wikipedia-100K_benchmark_crate_baseline.json \
+            --target diskann_rust/target/tmp/wikipedia-100K_benchmark_crate_target.json \
+            --title 'A/A Results: Wikipedia-100K Dataset'
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
           GITHUB_REPOSITORY: ${{ github.repository }}
@@ -132,8 +118,6 @@ jobs:
         with:
           name: aa-results-wikipedia-100K
           path: |
-            diskann_rust/target/tmp/wikipedia-100K_change.csv
-            diskann_rust/target/tmp/wikipedia-100K_change.md
             diskann_rust/target/tmp/wikipedia-100K_benchmark_crate_target.json
             baseline/target/tmp/wikipedia-100K_benchmark_crate_baseline.json
           retention-days: 30
@@ -206,27 +190,13 @@ jobs:
             run --input-file diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json \
             --output-file target/tmp/openai-100K_benchmark_crate_target.json
 
-      - name: Generate diff stats (baseline vs target)
-        run: |
-          python diskann_rust/.github/scripts/compare_disk_index_json_output.py \
-            --baseline baseline/target/tmp/openai-100K_benchmark_crate_baseline.json \
-            --branch diskann_rust/target/tmp/openai-100K_benchmark_crate_target.json \
-            --out diskann_rust/target/tmp/openai-100K_change.csv
-
-      - name: Convert results to Markdown
-        working-directory: diskann_rust
-        run: |
-          python .github/scripts/csv_to_markdown.py \
-            --csv target/tmp/openai-100K_change.csv \
-            --md target/tmp/openai-100K_change.md \
-            --title 'A/A Results: OpenAI ArXiv 100K Dataset'
-
       - name: Validate benchmark results
-        working-directory: diskann_rust
         run: |
-          python .github/scripts/benchmark_result_parse.py \
+          python diskann_rust/.github/scripts/benchmark_validate.py \
             --mode aa \
-            --file target/tmp/openai-100K_change.csv
+            --baseline baseline/target/tmp/openai-100K_benchmark_crate_baseline.json \
+            --target diskann_rust/target/tmp/openai-100K_benchmark_crate_target.json \
+            --title 'A/A Results: OpenAI ArXiv 100K Dataset'
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
           GITHUB_REPOSITORY: ${{ github.repository }}
@@ -238,8 +208,6 @@ jobs:
         with:
           name: aa-results-openai-100K
           path: |
-            diskann_rust/target/tmp/openai-100K_change.csv
-            diskann_rust/target/tmp/openai-100K_change.md
             diskann_rust/target/tmp/openai-100K_benchmark_crate_target.json
             baseline/target/tmp/openai-100K_benchmark_crate_baseline.json
           retention-days: 30
diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
index 8cdf767a9..a7cc47753 100644
--- a/.github/workflows/benchmarks.yml
+++ b/.github/workflows/benchmarks.yml
@@ -22,8 +22,7 @@ on:
     paths:
       - 'diskann-benchmark/perf_test_inputs/**-disk-index.json'
       - '.github/workflows/benchmarks.yml'
-      - '.github/scripts/compare_disk_index_json_output.py'
-      - '.github/scripts/benchmark_result_parse.py'
+      - '.github/scripts/benchmark_validate.py'
 
 # Cancel in-progress runs when a new run is triggered
 concurrency:
@@ -88,8 +87,7 @@ jobs:
       - name: Install system dependencies
         run: |
           sudo apt-get update
-          sudo apt-get install -y openssl libssl-dev pkg-config python3-pip
-          pip install csvtomd numpy scipy
+          sudo apt-get install -y openssl libssl-dev pkg-config
 
       # Download pre-packaged Wikipedia-100K dataset from GitHub Release
       # Dataset: 100K Cohere Wikipedia embeddings (768-dim, float32, cosine distance)
@@ -119,27 +117,13 @@ jobs:
             run --input-file diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json \
             --output-file target/tmp/wikipedia-100K_benchmark_crate_target.json
 
-      - name: Generate diff stats (baseline vs target)
-        run: |
-          python diskann_rust/.github/scripts/compare_disk_index_json_output.py \
-            --baseline baseline/target/tmp/wikipedia-100K_benchmark_crate_baseline.json \
-            --branch diskann_rust/target/tmp/wikipedia-100K_benchmark_crate_target.json \
-            --out diskann_rust/target/tmp/wikipedia-100K_change.csv
-
-      - name: Convert results to Markdown
-        working-directory: diskann_rust
-        run: |
-          python .github/scripts/csv_to_markdown.py \
-            --csv target/tmp/wikipedia-100K_change.csv \
-            --md target/tmp/wikipedia-100K_change.md \
-            --title 'Benchmark Results: Wikipedia-100K Dataset'
-
       - name: Validate benchmark results
-        working-directory: diskann_rust
         run: |
-          python .github/scripts/benchmark_result_parse.py \
+          python diskann_rust/.github/scripts/benchmark_validate.py \
             --mode pr \
-            --file target/tmp/wikipedia-100K_change.csv
+            --baseline baseline/target/tmp/wikipedia-100K_benchmark_crate_baseline.json \
+            --target diskann_rust/target/tmp/wikipedia-100K_benchmark_crate_target.json \
+            --title 'Benchmark Results: Wikipedia-100K Dataset'
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
           GITHUB_REPOSITORY: ${{ github.repository }}
@@ -152,8 +136,6 @@ jobs:
         with:
           name: benchmark-results-wikipedia-100K
           path: |
-            diskann_rust/target/tmp/wikipedia-100K_change.csv
-            diskann_rust/target/tmp/wikipedia-100K_change.md
             diskann_rust/target/tmp/wikipedia-100K_benchmark_crate_target.json
             baseline/target/tmp/wikipedia-100K_benchmark_crate_baseline.json
           retention-days: 30
@@ -199,8 +181,7 @@ jobs:
       - name: Install system dependencies
         run: |
           sudo apt-get update
-          sudo apt-get install -y openssl libssl-dev pkg-config python3-pip
-          pip install csvtomd numpy scipy
+          sudo apt-get install -y openssl libssl-dev pkg-config
 
       # Download pre-packaged OpenAI ArXiv 100K dataset from GitHub Release
       # Dataset: 100K OpenAI embeddings of ArXiv papers (1536-dim, float32, euclidean distance)
@@ -228,27 +209,13 @@ jobs:
             run --input-file diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json \
             --output-file target/tmp/openai-100K_benchmark_crate_target.json
 
-      - name: Generate diff stats (baseline vs target)
-        run: |
-          python diskann_rust/.github/scripts/compare_disk_index_json_output.py \
-            --baseline baseline/target/tmp/openai-100K_benchmark_crate_baseline.json \
-            --branch diskann_rust/target/tmp/openai-100K_benchmark_crate_target.json \
-            --out diskann_rust/target/tmp/openai-100K_change.csv
-
-      - name: Convert results to Markdown
-        working-directory: diskann_rust
-        run: |
-          python .github/scripts/csv_to_markdown.py \
-            --csv target/tmp/openai-100K_change.csv \
-            --md target/tmp/openai-100K_change.md \
-            --title 'Benchmark Results: OpenAI ArXiv 100K Dataset'
-
       - name: Validate benchmark results
-        working-directory: diskann_rust
         run: |
-          python .github/scripts/benchmark_result_parse.py \
+          python diskann_rust/.github/scripts/benchmark_validate.py \
             --mode pr \
-            --file target/tmp/openai-100K_change.csv
+            --baseline baseline/target/tmp/openai-100K_benchmark_crate_baseline.json \
+            --target diskann_rust/target/tmp/openai-100K_benchmark_crate_target.json \
+            --title 'Benchmark Results: OpenAI ArXiv 100K Dataset'
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
           GITHUB_REPOSITORY: ${{ github.repository }}
@@ -261,8 +228,6 @@ jobs:
         with:
           name: benchmark-results-openai-100K
           path: |
-            diskann_rust/target/tmp/openai-100K_change.csv
-            diskann_rust/target/tmp/openai-100K_change.md
             diskann_rust/target/tmp/openai-100K_benchmark_crate_target.json
             baseline/target/tmp/openai-100K_benchmark_crate_baseline.json
           retention-days: 30
\ No newline at end of file

From 9779e3cfe458d2776b506a1f3403e9825486e294 Mon Sep 17 00:00:00 2001
From: "Yuanyuan Tian (from Dev Box)" <tianyuanyuan@microsoft.com>
Date: Tue, 31 Mar 2026 09:57:01 +0800
Subject: [PATCH 25/31] switch benchmark jobs to self-hosted 1ES runner pool
 (diskann-github)

---
 .github/workflows/benchmarks-aa.yml | 4 ++--
 .github/workflows/benchmarks.yml    | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/benchmarks-aa.yml b/.github/workflows/benchmarks-aa.yml
index d03f123f8..efd299564 100644
--- a/.github/workflows/benchmarks-aa.yml
+++ b/.github/workflows/benchmarks-aa.yml
@@ -35,7 +35,7 @@ jobs:
   # A/A benchmark: Wikipedia-100K dataset (main vs main)
   aa-wikipedia-100K:
     name: A/A - Wikipedia 100K
-    runs-on: ubuntu-latest
+    runs-on: [ self-hosted, 1ES.Pool=diskann-github, ubuntu-latest ]
     timeout-minutes: 120
 
     steps:
@@ -125,7 +125,7 @@ jobs:
   # A/A benchmark: OpenAI ArXiv 100K dataset (main vs main)
   aa-openai-100K:
     name: A/A - OAI ArXiv 100K
-    runs-on: ubuntu-latest
+    runs-on: [ self-hosted, 1ES.Pool=diskann-github, ubuntu-latest ]
     timeout-minutes: 120
 
     steps:
diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
index a7cc47753..9f4cf2fc5 100644
--- a/.github/workflows/benchmarks.yml
+++ b/.github/workflows/benchmarks.yml
@@ -46,7 +46,7 @@ jobs:
   # Macro benchmark: Wikipedia-100K dataset
   macro-benchmark-wikipedia-100K:
     name: Macro Benchmark - Wikipedia 100K
-    runs-on: ubuntu-latest
+    runs-on: [ self-hosted, 1ES.Pool=diskann-github, ubuntu-latest ]
     # TODO: For production benchmarks, consider using a self-hosted runner with:
     # - NVMe storage for consistent I/O performance
     # - CPU pinning (taskset) for reduced variance
@@ -143,7 +143,7 @@ jobs:
   # Macro benchmark: OpenAI ArXiv dataset
   macro-benchmark-oai-large:
     name: Macro Benchmark - OAI ArXiv 100K
-    runs-on: ubuntu-latest
+    runs-on: [ self-hosted, 1ES.Pool=diskann-github, ubuntu-latest ]
     # TODO: For production benchmarks, consider using a self-hosted runner
     timeout-minutes: 120
 

From db7e97ac23e55606f6405b5c65f45549d3da7087 Mon Sep 17 00:00:00 2001
From: "Yuanyuan Tian (from Dev Box)" <tianyuanyuan@microsoft.com>
Date: Tue, 31 Mar 2026 10:05:07 +0800
Subject: [PATCH 26/31] replace gh CLI with curl for dataset downloads (gh not
 available on 1ES runners)

---
 .github/workflows/benchmarks-aa.yml | 8 ++------
 .github/workflows/benchmarks.yml    | 8 ++------
 2 files changed, 4 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/benchmarks-aa.yml b/.github/workflows/benchmarks-aa.yml
index efd299564..ed7783233 100644
--- a/.github/workflows/benchmarks-aa.yml
+++ b/.github/workflows/benchmarks-aa.yml
@@ -78,11 +78,9 @@ jobs:
       # Download pre-packaged Wikipedia-100K dataset from GitHub Release
       # Source: https://github.com/harsha-simhadri/big-ann-benchmarks
       - name: Download wikipedia-100K dataset
-        env:
-          GH_TOKEN: ${{ github.token }}
         run: |
           mkdir -p diskann_rust/target/tmp baseline/target/tmp
-          gh release download v1 --repo YuanyuanTian-hh/diskann-benchmark-data --pattern 'wikipedia-100K.tar.gz' --dir .
+          curl -L -o wikipedia-100K.tar.gz https://github.com/YuanyuanTian-hh/diskann-benchmark-data/releases/download/v1/wikipedia-100K.tar.gz
           tar xzf wikipedia-100K.tar.gz -C diskann_rust/target/tmp/
           cp -r diskann_rust/target/tmp/wikipedia_cohere baseline/target/tmp/
 
@@ -168,11 +166,9 @@ jobs:
       # Download pre-packaged OpenAI ArXiv 100K dataset from GitHub Release
       # Source: https://github.com/harsha-simhadri/big-ann-benchmarks
       - name: Download openai-100K dataset
-        env:
-          GH_TOKEN: ${{ github.token }}
         run: |
           mkdir -p diskann_rust/target/tmp baseline/target/tmp
-          gh release download v1 --repo YuanyuanTian-hh/diskann-benchmark-data --pattern 'openai-100K.tar.gz' --dir .
+          curl -L -o openai-100K.tar.gz https://github.com/YuanyuanTian-hh/diskann-benchmark-data/releases/download/v1/openai-100K.tar.gz
           tar xzf openai-100K.tar.gz -C diskann_rust/target/tmp/
           cp -r diskann_rust/target/tmp/OpenAIArXiv baseline/target/tmp/
 
diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
index 9f4cf2fc5..d5015df88 100644
--- a/.github/workflows/benchmarks.yml
+++ b/.github/workflows/benchmarks.yml
@@ -93,11 +93,9 @@ jobs:
       # Dataset: 100K Cohere Wikipedia embeddings (768-dim, float32, cosine distance)
       # Source: https://github.com/harsha-simhadri/big-ann-benchmarks
       - name: Download wikipedia-100K dataset
-        env:
-          GH_TOKEN: ${{ github.token }}
         run: |
           mkdir -p diskann_rust/target/tmp baseline/target/tmp
-          gh release download v1 --repo YuanyuanTian-hh/diskann-benchmark-data --pattern 'wikipedia-100K.tar.gz' --dir .
+          curl -L -o wikipedia-100K.tar.gz https://github.com/YuanyuanTian-hh/diskann-benchmark-data/releases/download/v1/wikipedia-100K.tar.gz
           tar xzf wikipedia-100K.tar.gz -C diskann_rust/target/tmp/
           cp -r diskann_rust/target/tmp/wikipedia_cohere baseline/target/tmp/
 
@@ -187,11 +185,9 @@ jobs:
       # Dataset: 100K OpenAI embeddings of ArXiv papers (1536-dim, float32, euclidean distance)
       # Source: https://github.com/harsha-simhadri/big-ann-benchmarks
       - name: Download openai-100K dataset
-        env:
-          GH_TOKEN: ${{ github.token }}
         run: |
           mkdir -p diskann_rust/target/tmp baseline/target/tmp
-          gh release download v1 --repo YuanyuanTian-hh/diskann-benchmark-data --pattern 'openai-100K.tar.gz' --dir .
+          curl -L -o openai-100K.tar.gz https://github.com/YuanyuanTian-hh/diskann-benchmark-data/releases/download/v1/openai-100K.tar.gz
           tar xzf openai-100K.tar.gz -C diskann_rust/target/tmp/
           cp -r diskann_rust/target/tmp/OpenAIArXiv baseline/target/tmp/
 

From ea91c62f060ce69e3e2aa1413394c5993a2d78ed Mon Sep 17 00:00:00 2001
From: "Yuanyuan Tian (from Dev Box)" <tianyuanyuan@microsoft.com>
Date: Tue, 31 Mar 2026 10:21:47 +0800
Subject: [PATCH 27/31] fix latency_95: read p95_latency instead of
 p999_latency from benchmark JSON

---
 .github/scripts/benchmark_validate.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/scripts/benchmark_validate.py b/.github/scripts/benchmark_validate.py
index cb69f6054..ebcb95332 100644
--- a/.github/scripts/benchmark_validate.py
+++ b/.github/scripts/benchmark_validate.py
@@ -86,7 +86,7 @@ def extract_search_metrics(results: dict, search_l: int, beam_width: int) -> dic
             metrics["mean_hops"] = sr.get("mean_hops", 0)
             metrics["mean_io_time"] = sr.get("mean_io_time", 0)
             metrics["mean_cpus"] = sr.get("mean_cpu_time", 0)
-            metrics["latency_95"] = sr.get("p999_latency", 0)
+            metrics["latency_95"] = sr.get("p95_latency", 0)
             break
 
     # Override with span metrics if available

From 360cdc70fbbd33d7127ec11093b2ab9b6fcf06ac Mon Sep 17 00:00:00 2001
From: "Yuanyuan Tian (from Dev Box)" <tianyuanyuan@microsoft.com>
Date: Wed, 1 Apr 2026 11:29:20 +0800
Subject: [PATCH 28/31] revert to ubuntu-latest runners, switch dataset source
 to big-ann-benchmarks (BAB) release v0.4.0

---
 .github/workflows/benchmarks-aa.yml | 8 ++++----
 .github/workflows/benchmarks.yml    | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/benchmarks-aa.yml b/.github/workflows/benchmarks-aa.yml
index ed7783233..9161400af 100644
--- a/.github/workflows/benchmarks-aa.yml
+++ b/.github/workflows/benchmarks-aa.yml
@@ -35,7 +35,7 @@ jobs:
   # A/A benchmark: Wikipedia-100K dataset (main vs main)
   aa-wikipedia-100K:
     name: A/A - Wikipedia 100K
-    runs-on: [ self-hosted, 1ES.Pool=diskann-github, ubuntu-latest ]
+    runs-on: ubuntu-latest
     timeout-minutes: 120
 
     steps:
@@ -80,7 +80,7 @@ jobs:
       - name: Download wikipedia-100K dataset
         run: |
           mkdir -p diskann_rust/target/tmp baseline/target/tmp
-          curl -L -o wikipedia-100K.tar.gz https://github.com/YuanyuanTian-hh/diskann-benchmark-data/releases/download/v1/wikipedia-100K.tar.gz
+          curl -L -o wikipedia-100K.tar.gz https://github.com/harsha-simhadri/big-ann-benchmarks/releases/download/v0.4.0/wikipedia-100K.tar.gz
           tar xzf wikipedia-100K.tar.gz -C diskann_rust/target/tmp/
           cp -r diskann_rust/target/tmp/wikipedia_cohere baseline/target/tmp/
 
@@ -123,7 +123,7 @@ jobs:
   # A/A benchmark: OpenAI ArXiv 100K dataset (main vs main)
   aa-openai-100K:
     name: A/A - OAI ArXiv 100K
-    runs-on: [ self-hosted, 1ES.Pool=diskann-github, ubuntu-latest ]
+    runs-on: ubuntu-latest
     timeout-minutes: 120
 
     steps:
@@ -168,7 +168,7 @@ jobs:
       - name: Download openai-100K dataset
         run: |
           mkdir -p diskann_rust/target/tmp baseline/target/tmp
-          curl -L -o openai-100K.tar.gz https://github.com/YuanyuanTian-hh/diskann-benchmark-data/releases/download/v1/openai-100K.tar.gz
+          curl -L -o openai-100K.tar.gz https://github.com/harsha-simhadri/big-ann-benchmarks/releases/download/v0.4.0/openai-100K.tar.gz
           tar xzf openai-100K.tar.gz -C diskann_rust/target/tmp/
           cp -r diskann_rust/target/tmp/OpenAIArXiv baseline/target/tmp/
 
diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
index d5015df88..aa832b094 100644
--- a/.github/workflows/benchmarks.yml
+++ b/.github/workflows/benchmarks.yml
@@ -46,7 +46,7 @@ jobs:
   # Macro benchmark: Wikipedia-100K dataset
   macro-benchmark-wikipedia-100K:
     name: Macro Benchmark - Wikipedia 100K
-    runs-on: [ self-hosted, 1ES.Pool=diskann-github, ubuntu-latest ]
+    runs-on: ubuntu-latest
     # TODO: For production benchmarks, consider using a self-hosted runner with:
     # - NVMe storage for consistent I/O performance
     # - CPU pinning (taskset) for reduced variance
@@ -95,7 +95,7 @@ jobs:
       - name: Download wikipedia-100K dataset
         run: |
           mkdir -p diskann_rust/target/tmp baseline/target/tmp
-          curl -L -o wikipedia-100K.tar.gz https://github.com/YuanyuanTian-hh/diskann-benchmark-data/releases/download/v1/wikipedia-100K.tar.gz
+          curl -L -o wikipedia-100K.tar.gz https://github.com/harsha-simhadri/big-ann-benchmarks/releases/download/v0.4.0/wikipedia-100K.tar.gz
           tar xzf wikipedia-100K.tar.gz -C diskann_rust/target/tmp/
           cp -r diskann_rust/target/tmp/wikipedia_cohere baseline/target/tmp/
 
@@ -141,7 +141,7 @@ jobs:
   # Macro benchmark: OpenAI ArXiv dataset
   macro-benchmark-oai-large:
     name: Macro Benchmark - OAI ArXiv 100K
-    runs-on: [ self-hosted, 1ES.Pool=diskann-github, ubuntu-latest ]
+    runs-on: ubuntu-latest
     # TODO: For production benchmarks, consider using a self-hosted runner
     timeout-minutes: 120
 
@@ -187,7 +187,7 @@ jobs:
       - name: Download openai-100K dataset
         run: |
           mkdir -p diskann_rust/target/tmp baseline/target/tmp
-          curl -L -o openai-100K.tar.gz https://github.com/YuanyuanTian-hh/diskann-benchmark-data/releases/download/v1/openai-100K.tar.gz
+          curl -L -o openai-100K.tar.gz https://github.com/harsha-simhadri/big-ann-benchmarks/releases/download/v0.4.0/openai-100K.tar.gz
           tar xzf openai-100K.tar.gz -C diskann_rust/target/tmp/
           cp -r diskann_rust/target/tmp/OpenAIArXiv baseline/target/tmp/
 

From 9d635be150fa7bd62f07338db8d2db2bf6d91b90 Mon Sep 17 00:00:00 2001
From: "Yuanyuan Tian (from Dev Box)" <tianyuanyuan@microsoft.com>
Date: Thu, 2 Apr 2026 11:17:30 +0800
Subject: [PATCH 29/31] address PR review: reduce search_list to 200, remove
 hardcoded Rust version, fix missing-field handling, clean up orphaned
 thresholds, switch data source to big-ann-benchmarks v0.4.0

---
 .github/scripts/benchmark_validate.py         | 93 +++++++------------
 .github/workflows/benchmarks-aa.yml           |  9 +-
 .github/workflows/benchmarks.yml              | 10 +-
 .../openai-100K-disk-index.json               |  2 +-
 .../wikipedia-100K-disk-index.json            |  2 +-
 5 files changed, 46 insertions(+), 70 deletions(-)

diff --git a/.github/scripts/benchmark_validate.py b/.github/scripts/benchmark_validate.py
index ebcb95332..9081de76c 100644
--- a/.github/scripts/benchmark_validate.py
+++ b/.github/scripts/benchmark_validate.py
@@ -59,11 +59,11 @@ def extract_build_metrics(results: dict) -> dict[str, float]:
         name = span.get("span_name", "")
         data = span.get("metrics", {})
         if name == "DiskIndexBuild-PqConstruction":
-            metrics["pq_construction_time"] = data.get("duration_seconds", 0)
+            metrics["pq_construction_time"] = data.get("duration_seconds")
         elif name == "DiskIndexBuild-InmemIndexBuild":
-            metrics["inmem_index_build_time"] = data.get("duration_seconds", 0)
+            metrics["inmem_index_build_time"] = data.get("duration_seconds")
         elif name == "DiskIndexBuild-DiskLayout":
-            metrics["disk_layout_time"] = data.get("duration_seconds", 0)
+            metrics["disk_layout_time"] = data.get("duration_seconds")
 
     return metrics
 
@@ -78,15 +78,15 @@ def extract_search_metrics(results: dict, search_l: int, beam_width: int) -> dic
     # From search_results_per_l
     for sr in search.get("search_results_per_l", []):
         if sr.get("search_l") == search_l:
-            metrics["qps"] = sr.get("qps", 0)
-            metrics["recall"] = sr.get("recall", 0)
-            metrics["mean_latency"] = sr.get("mean_latency", 0)
-            metrics["mean_ios"] = sr.get("mean_ios", 0)
-            metrics["mean_comps"] = sr.get("mean_comparisons", 0)
-            metrics["mean_hops"] = sr.get("mean_hops", 0)
-            metrics["mean_io_time"] = sr.get("mean_io_time", 0)
-            metrics["mean_cpus"] = sr.get("mean_cpu_time", 0)
-            metrics["latency_95"] = sr.get("p95_latency", 0)
+            metrics["qps"] = sr.get("qps")
+            metrics["recall"] = sr.get("recall")
+            metrics["mean_latency"] = sr.get("mean_latency")
+            metrics["mean_ios"] = sr.get("mean_ios")
+            metrics["mean_comps"] = sr.get("mean_comparisons")
+            metrics["mean_hops"] = sr.get("mean_hops")
+            metrics["mean_io_time"] = sr.get("mean_io_time")
+            metrics["mean_cpus"] = sr.get("mean_cpu_time")
+            metrics["latency_95"] = sr.get("p95_latency")
             break
 
     # Override with span metrics if available
@@ -117,25 +117,26 @@ def compute_diff(baseline_json: list[dict], target_json: list[dict]) -> list[dic
 
         inp = target.get("input", {})
         search_phase = inp.get("content", {}).get("search_phase", {})
-        search_list = search_phase.get("search_list", [2000])
+        search_list = search_phase.get("search_list", [200])
         beam_width = search_phase.get("beam_width", 4)
-        primary_l = search_list[0] if search_list else 2000
+        primary_l = search_list[0] if search_list else 200
 
         # Build metrics
         b_build = extract_build_metrics(b_results)
         t_build = extract_build_metrics(t_results)
 
         for key in ("total_time", "pq_construction_time", "inmem_index_build_time", "disk_layout_time"):
-            if key in t_build or key in b_build:
-                bv = b_build.get(key, 0)
-                tv = t_build.get(key, 0)
-                rows.append({
-                    "category": "index-build statistics",
-                    "metric": key,
-                    "baseline": bv,
-                    "target": tv,
-                    "deviation": ((tv - bv) / bv * 100) if bv else 0,
-                })
+            bv = b_build.get(key)
+            tv = t_build.get(key)
+            if bv is None or tv is None:
+                continue  # skip metrics missing from either side
+            rows.append({
+                "category": "index-build statistics",
+                "metric": key,
+                "baseline": bv,
+                "target": tv,
+                "deviation": ((tv - bv) / bv * 100) if bv else 0,
+            })
 
         # Search metrics
         b_search = extract_search_metrics(b_results, primary_l, beam_width)
@@ -144,16 +145,17 @@ def compute_diff(baseline_json: list[dict], target_json: list[dict]) -> list[dic
 
         for key in ("qps", "recall", "mean_latency", "latency_95", "mean_ios",
                      "mean_comps", "mean_hops", "mean_io_time", "mean_cpus"):
-            if key in t_search or key in b_search:
-                bv = b_search.get(key, 0)
-                tv = t_search.get(key, 0)
-                rows.append({
-                    "category": span_cat,
-                    "metric": key,
-                    "baseline": bv,
-                    "target": tv,
-                    "deviation": ((tv - bv) / bv * 100) if bv else 0,
-                })
+            bv = b_search.get(key)
+            tv = t_search.get(key)
+            if bv is None or tv is None:
+                continue  # skip metrics missing from either side
+            rows.append({
+                "category": span_cat,
+                "metric": key,
+                "baseline": bv,
+                "target": tv,
+                "deviation": ((tv - bv) / bv * 100) if bv else 0,
+            })
 
     return rows
 
@@ -189,29 +191,6 @@ def compute_diff(baseline_json: list[dict], target_json: list[dict]) -> list[dic
         "total_comparisons": [1, "LT", ""],
         "search_hops": [1, "LT", ""],
     },
-    "search-with-L=2000-bw=4": {
-        # Calibrated from 5 GitHub runner runs (10 observations)
-        "latency_95": [10, "LT", ""],
-        "mean_latency": [10, "LT", ""],
-        "mean_io_time": [10, "LT", ""],
-        "mean_cpus": [15, "LT", ""],   # wider — CPU time is noisy on shared runners
-        "qps": [10, "GT", 6.5],
-        "mean_ios": [1, "LT", 2410],
-        "mean_comps": [1, "LT", 33200],
-        "mean_hops": [1, "LT", ""],
-        "recall": [1, "GT", 98.0],
-    },
-    "search-with-L=100-bw=4": {
-        "latency_95": [10, "LT", ""],
-        "mean_latency": [10, "LT", ""],
-        "mean_io_time": [10, "LT", ""],
-        "mean_cpus": [15, "LT", ""],
-        "qps": [10, "GT", ""],
-        "mean_ios": [10, "LT", ""],
-        "mean_comps": [10, "LT", ""],
-        "mean_hops": [10, "LT", ""],
-        "recall": [1, "GT", ""],
-    },
     "search-with-L=200-bw=4": {
         "latency_95": [10, "LT", ""],
         "mean_latency": [10, "LT", ""],
diff --git a/.github/workflows/benchmarks-aa.yml b/.github/workflows/benchmarks-aa.yml
index 9161400af..c8256bd59 100644
--- a/.github/workflows/benchmarks-aa.yml
+++ b/.github/workflows/benchmarks-aa.yml
@@ -21,7 +21,6 @@ concurrency:
 
 env:
   RUST_BACKTRACE: 1
-  rust_stable: "1.92"
 
 defaults:
   run:
@@ -53,10 +52,10 @@ jobs:
           path: baseline
           lfs: true
 
-      - name: Install Rust ${{ env.rust_stable }}
+      - name: Install Rust
         uses: dtolnay/rust-toolchain@master
         with:
-          toolchain: ${{ env.rust_stable }}
+          toolchain: stable
 
       - name: Cache Rust dependencies (target)
         uses: Swatinem/rust-cache@v2
@@ -141,10 +140,10 @@ jobs:
           path: baseline
           lfs: true
 
-      - name: Install Rust ${{ env.rust_stable }}
+      - name: Install Rust
         uses: dtolnay/rust-toolchain@master
         with:
-          toolchain: ${{ env.rust_stable }}
+          toolchain: stable
 
       - name: Cache Rust dependencies (target)
         uses: Swatinem/rust-cache@v2
diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
index aa832b094..194267355 100644
--- a/.github/workflows/benchmarks.yml
+++ b/.github/workflows/benchmarks.yml
@@ -31,8 +31,6 @@ concurrency:
 
 env:
   RUST_BACKTRACE: 1
-  # Use the Rust version specified in rust-toolchain.toml
-  rust_stable: "1.92"
 
 defaults:
   run:
@@ -67,10 +65,10 @@ jobs:
           path: baseline
           lfs: true
 
-      - name: Install Rust ${{ env.rust_stable }}
+      - name: Install Rust
         uses: dtolnay/rust-toolchain@master
         with:
-          toolchain: ${{ env.rust_stable }}
+          toolchain: stable
 
       - name: Cache Rust dependencies (current)
         uses: Swatinem/rust-cache@v2
@@ -159,10 +157,10 @@ jobs:
           path: baseline
           lfs: true
 
-      - name: Install Rust ${{ env.rust_stable }}
+      - name: Install Rust
         uses: dtolnay/rust-toolchain@master
         with:
-          toolchain: ${{ env.rust_stable }}
+          toolchain: stable
 
       - name: Cache Rust dependencies (current)
         uses: Swatinem/rust-cache@v2
diff --git a/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json b/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json
index 3a2a1d9e2..d021640fc 100644
--- a/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json
+++ b/diskann-benchmark/perf_test_inputs/openai-100K-disk-index.json
@@ -24,7 +24,7 @@
                     "queries": "OpenAIArXiv/openai_query.bin",
                     "groundtruth": "OpenAIArXiv/openai-100K",
                     "search_list": [
-                        2000
+                        200
                     ],
                     "beam_width": 4,
                     "recall_at": 100,
diff --git a/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json b/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json
index 6a52b1e32..e5f06aa1b 100644
--- a/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json
+++ b/diskann-benchmark/perf_test_inputs/wikipedia-100K-disk-index.json
@@ -24,7 +24,7 @@
                     "queries": "wikipedia_cohere/wikipedia_query.bin",
                     "groundtruth": "wikipedia_cohere/wikipedia-100K",
                     "search_list": [
-                        2000
+                        200
                     ],
                     "beam_width": 4,
                     "recall_at": 100,

From 4ca80d101b7117afb8046bb4d6a7cc4efe3a1d8d Mon Sep 17 00:00:00 2001
From: "Yuanyuan Tian (from Dev Box)" <tianyuanyuan@microsoft.com>
Date: Thu, 2 Apr 2026 11:30:38 +0800
Subject: [PATCH 30/31] widen latency_95 threshold to 15% for shared-runner
 noise

---
 .github/scripts/benchmark_validate.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/scripts/benchmark_validate.py b/.github/scripts/benchmark_validate.py
index 9081de76c..9dd5de14b 100644
--- a/.github/scripts/benchmark_validate.py
+++ b/.github/scripts/benchmark_validate.py
@@ -192,7 +192,7 @@ def compute_diff(baseline_json: list[dict], target_json: list[dict]) -> list[dic
         "search_hops": [1, "LT", ""],
     },
     "search-with-L=200-bw=4": {
-        "latency_95": [10, "LT", ""],
+        "latency_95": [15, "LT", ""],   # wider — p95 latency is noisy on shared runners
         "mean_latency": [10, "LT", ""],
         "mean_io_time": [10, "LT", ""],
         "mean_cpus": [15, "LT", ""],

From d168932ec0e26ee814f8bbd3d59e4155bff0403c Mon Sep 17 00:00:00 2001
From: "Yuanyuan Tian (from Dev Box)" <tianyuanyuan@microsoft.com>
Date: Thu, 2 Apr 2026 12:12:57 +0800
Subject: [PATCH 31/31] replace push trigger with pull_request trigger
 targeting main

---
 .github/workflows/benchmarks.yml | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
index 194267355..d75e0efe7 100644
--- a/.github/workflows/benchmarks.yml
+++ b/.github/workflows/benchmarks.yml
@@ -16,11 +16,21 @@ on:
         required: true
         default: 'main'
         type: string
-  push:
+  pull_request:
     branches:
-      - 'user/tianyuanyuan/add-benchmark-pipeline'
+      - main
     paths:
-      - 'diskann-benchmark/perf_test_inputs/**-disk-index.json'
+      - 'diskann/**'
+      - 'diskann-disk/**'
+      - 'diskann-linalg/**'
+      - 'diskann-providers/**'
+      - 'diskann-quantization/**'
+      - 'diskann-vector/**'
+      - 'diskann-wide/**'
+      - 'diskann-utils/**'
+      - 'diskann-platform/**'
+      - 'diskann-label-filter/**'
+      - 'diskann-benchmark/**'
       - '.github/workflows/benchmarks.yml'
       - '.github/scripts/benchmark_validate.py'