From 77cc936ed8241b34f3baf75a0842ddc863d367ff Mon Sep 17 00:00:00 2001
From: Suryansh Gupta <suryansh.gupta2000@gmail.com>
Date: Thu, 7 May 2026 01:18:55 +0530
Subject: [PATCH 01/13] Add benchmark crate for multi-vector

---
 Cargo.lock                                    |   17 +
 Cargo.toml                                    |    1 +
 diskann-benchmark-multi-vector/Cargo.toml     |   30 +
 diskann-benchmark-multi-vector/README.md      |  136 ++
 .../examples/multi-vector.json                |   70 +
 .../examples/test.json                        |   47 +
 .../examples/tolerance.json                   |   16 +
 diskann-benchmark-multi-vector/src/bin.rs     |   96 +
 diskann-benchmark-multi-vector/src/lib.rs     |  992 ++++++++
 results.json                                  | 2150 +++++++++++++++++
 10 files changed, 3555 insertions(+)
 create mode 100644 diskann-benchmark-multi-vector/Cargo.toml
 create mode 100644 diskann-benchmark-multi-vector/README.md
 create mode 100644 diskann-benchmark-multi-vector/examples/multi-vector.json
 create mode 100644 diskann-benchmark-multi-vector/examples/test.json
 create mode 100644 diskann-benchmark-multi-vector/examples/tolerance.json
 create mode 100644 diskann-benchmark-multi-vector/src/bin.rs
 create mode 100644 diskann-benchmark-multi-vector/src/lib.rs
 create mode 100644 results.json

diff --git a/Cargo.lock b/Cargo.lock
index beac316c4..fc0a7cc87 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -697,6 +697,23 @@ dependencies = [
  "tokio",
 ]
 
+[[package]]
+name = "diskann-benchmark-multi-vector"
+version = "0.50.1"
+dependencies = [
+ "anyhow",
+ "diskann-benchmark-runner",
+ "diskann-quantization",
+ "diskann-utils",
+ "diskann-vector",
+ "half",
+ "rand 0.9.4",
+ "serde",
+ "serde_json",
+ "tempfile",
+ "thiserror 2.0.17",
+]
+
 [[package]]
 name = "diskann-benchmark-runner"
 version = "0.50.1"
diff --git a/Cargo.toml b/Cargo.toml
index 6f31a1ae2..13fcbdd9c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -21,6 +21,7 @@ members = [
     "diskann-benchmark-runner",
     "diskann-benchmark-core",
     "diskann-benchmark-simd",
+    "diskann-benchmark-multi-vector",
     "diskann-benchmark",
     "diskann-tools",
     "vectorset",
diff --git a/diskann-benchmark-multi-vector/Cargo.toml b/diskann-benchmark-multi-vector/Cargo.toml
new file mode 100644
index 000000000..f8eb937e1
--- /dev/null
+++ b/diskann-benchmark-multi-vector/Cargo.toml
@@ -0,0 +1,30 @@
+[package]
+name = "diskann-benchmark-multi-vector"
+version.workspace = true
+description.workspace = true
+authors.workspace = true
+documentation.workspace = true
+license.workspace = true
+edition.workspace = true
+
+[[bin]]
+name = "benchmark-multi-vector"
+path = "src/bin.rs"
+
+[dependencies]
+anyhow.workspace = true
+diskann-utils = { workspace = true, default-features = false }
+half = { workspace = true, features = ["rand_distr"] }
+diskann-benchmark-runner = { workspace = true }
+diskann-quantization = { workspace = true }
+diskann-vector = { workspace = true }
+rand.workspace = true
+serde = { workspace = true, features = ["derive"] }
+serde_json.workspace = true
+thiserror.workspace = true
+
+[lints]
+workspace = true
+
+[dev-dependencies]
+tempfile.workspace = true
diff --git a/diskann-benchmark-multi-vector/README.md b/diskann-benchmark-multi-vector/README.md
new file mode 100644
index 000000000..014a393a1
--- /dev/null
+++ b/diskann-benchmark-multi-vector/README.md
@@ -0,0 +1,136 @@
+# diskann-benchmark-multi-vector
+
+Benchmarks and regression detection for the **multi-vector distance
+operations** exposed by `diskann-quantization` — `Chamfer` and `MaxSim` —
+across `f32` and `f16` element types.
+
+## Layout
+
+- `src/lib.rs` — benchmark library: input/tolerance schemas, kernel
+  dispatch, regression checker.
+- `src/bin.rs` — `benchmark-multi-vector` CLI entry point.
+- `examples/multi-vector.json` — full benchmark matrix covering both
+  operations across the registered kernels and a representative range of
+  shapes.
+- `examples/test.json` — minimal smoke configuration consumed by the
+  integration tests.
+- `examples/tolerance.json` — default regression thresholds.
+
+## Registered kernels
+
+The crate registers four kernels — one per `(element_type, implementation)`
+pair:
+
+| Tag                              | Element | Implementation       |
+| -------------------------------- | ------- | -------------------- |
+| `multi-vector-op-f32-optimized`  | `f32`   | `QueryComputer`      |
+| `multi-vector-op-f16-optimized`  | `f16`   | `QueryComputer`      |
+| `multi-vector-op-f32-reference`  | `f32`   | `Chamfer` / `MaxSim` |
+| `multi-vector-op-f16-reference`  | `f16`   | `Chamfer` / `MaxSim` |
+
+The **optimized** path constructs a `QueryComputer` once per shape (which
+internally selects the best available SIMD kernel for the host) and calls
+`chamfer` / `max_sim` inside the timed loop. The **reference** path drives
+the `Chamfer` / `MaxSim` fallback used by the `multi_vector` unit tests —
+useful both as a numerical ground truth and as a baseline to measure SIMD
+speedups against.
+
+## Time normalization
+
+Per-measurement latency is normalized to **nanoseconds per inner-product
+call** (`ns/IP`), with `Q` = `num_query_vectors`, `D` = `num_doc_vectors`:
+
+```
+ns/IP = min_latency_µs * 1000 / (Q * D * loops_per_measurement)
+```
+
+Two important properties:
+
+- **Independent of `Q`, `D`, and `loops_per_measurement`.** Reshaping the
+  benchmark or scaling the loop budget leaves the metric unchanged, so
+  cache-residency effects and SIMD utilization show up directly.
+- **Approximately linear in `Dim`.** Each inner-product call is itself an
+  O(`Dim`) operation, so `ns/IP` grows with `Dim` — that is why the table
+  headers read `ns/IP @ Dim`. Compare across rows with the same `Dim`; to
+  compare across different `Dim`s, divide further by `Dim` to recover ns
+  per scalar multiply.
+
+This is the right metric for the two things this crate cares about:
+detecting per-shape regressions (the `Dim` factor cancels) and comparing
+optimized vs. reference at a fixed shape.
+
+## Usage
+
+All examples below assume you are inside the crate directory and use a
+small shell function for brevity:
+
+```bash
+bench() { cargo run --release -p diskann-benchmark-multi-vector --bin benchmark-multi-vector -- "$@"; }
+```
+
+### Run benchmarks
+
+`run` executes every job in the input file and writes per-measurement
+latencies plus percentiles to the output file:
+
+```bash
+bench run --input-file examples/multi-vector.json --output-file before.json
+```
+
+### Regression check workflow
+
+The check workflow is **two-phase**: validate the tolerance file once, then
+compare two recorded result files.
+
+**Phase 1 — preflight.** No benchmarks are executed. The verifier confirms
+that every entry in `tolerance.json` matches at least one job in the input
+file, and that every job is matched by exactly one entry. Run it whenever
+you edit `tolerance.json`:
+
+```bash
+bench check verify \
+  --tolerances examples/tolerance.json \
+  --input-file examples/multi-vector.json
+```
+
+**Phase 2 — comparison.** Record results before and after a code change,
+then compare. The command exits non-zero if any run regresses past its
+tolerance:
+
+```bash
+# On the baseline commit
+bench run --input-file examples/multi-vector.json --output-file before.json
+
+# On the change commit
+bench run --input-file examples/multi-vector.json --output-file after.json
+
+# Compare
+bench check run \
+  --tolerances examples/tolerance.json \
+  --input-file examples/multi-vector.json \
+  --before before.json --after after.json \
+  --output-file checks.json
+```
+
+A run **fails** when its post-change `ns/IP` minimum exceeds the
+baseline minimum by more than `min_time_regression` (`0.05` = 5% in the
+shipped `tolerance.json`). Improvements (negative change) always pass.
+
+### How tolerances are matched to jobs
+
+Each entry in `tolerance.json` has the shape `{ input, tolerance }`. The
+`input` block acts as a **partial template** against the jobs in the input
+file: any field present must match; missing fields are wildcards.
+
+The shipped `tolerance.json` uses an empty `"content": {}`, which matches
+every `multi-vector-op` job — so a single 5% threshold applies to all four
+kernels. To apply different thresholds per implementation, add more
+specific entries, e.g.:
+
+```json
+{ "input":     { "type": "multi-vector-op", "content": { "implementation": "reference" } },
+  "tolerance": { "type": "multi-vector-tolerance", "content": { "min_time_regression": 0.10 } } }
+```
+
+`check verify` will reject the file if entries overlap or leave any job
+unmatched.
diff --git a/diskann-benchmark-multi-vector/examples/multi-vector.json b/diskann-benchmark-multi-vector/examples/multi-vector.json
new file mode 100644
index 000000000..2626e5047
--- /dev/null
+++ b/diskann-benchmark-multi-vector/examples/multi-vector.json
@@ -0,0 +1,70 @@
+{
+  "search_directories": [],
+  "jobs": [
+    {
+      "type": "multi-vector-op",
+      "content": {
+        "element_type": "float32",
+        "implementation": "optimized",
+        "runs": [
+          { "operation": "chamfer", "num_query_vectors": 8,  "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 },
+          { "operation": "chamfer", "num_query_vectors": 16, "num_doc_vectors": 64,   "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 },
+          { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 128,  "dim": 384, "loops_per_measurement": 20,  "num_measurements": 50 },
+          { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 16,   "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 264, "loops_per_measurement": 50,  "num_measurements": 50 },
+          { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 },
+          { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2,   "num_measurements": 20 },
+          { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 32,   "dim": 512, "loops_per_measurement": 50,  "num_measurements": 50 },
+
+          { "operation": "max_sim", "num_query_vectors": 8,  "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 },
+          { "operation": "max_sim", "num_query_vectors": 16, "num_doc_vectors": 64,   "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 },
+          { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 128,  "dim": 384, "loops_per_measurement": 20,  "num_measurements": 50 },
+          { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 16,   "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 264, "loops_per_measurement": 50,  "num_measurements": 50 },
+          { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 },
+          { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2,   "num_measurements": 20 },
+          { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 32,   "dim": 512, "loops_per_measurement": 50,  "num_measurements": 50 }
+        ]
+      }
+    },
+    {
+      "type": "multi-vector-op",
+      "content": {
+        "element_type": "float16",
+        "implementation": "optimized",
+        "runs": [
+          { "operation": "chamfer", "num_query_vectors": 16, "num_doc_vectors": 64,   "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 },
+          { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 },
+          { "operation": "max_sim", "num_query_vectors": 16, "num_doc_vectors": 64,   "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 },
+          { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 }
+        ]
+      }
+    },
+    {
+      "type": "multi-vector-op",
+      "content": {
+        "element_type": "float32",
+        "implementation": "reference",
+        "runs": [
+          { "operation": "chamfer", "num_query_vectors": 8,  "num_doc_vectors": 32,  "dim": 128, "loops_per_measurement": 50, "num_measurements": 50 },
+          { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 2,  "num_measurements": 50 },
+          { "operation": "max_sim", "num_query_vectors": 8,  "num_doc_vectors": 32,  "dim": 128, "loops_per_measurement": 50, "num_measurements": 50 },
+          { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 2,  "num_measurements": 50 }
+        ]
+      }
+    },
+    {
+      "type": "multi-vector-op",
+      "content": {
+        "element_type": "float16",
+        "implementation": "reference",
+        "runs": [
+          { "operation": "chamfer", "num_query_vectors": 8,  "num_doc_vectors": 32,  "dim": 128, "loops_per_measurement": 50, "num_measurements": 50 },
+          { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 2,  "num_measurements": 50 }
+        ]
+      }
+    }
+  ]
+}
diff --git a/diskann-benchmark-multi-vector/examples/test.json b/diskann-benchmark-multi-vector/examples/test.json
new file mode 100644
index 000000000..28e9b9d64
--- /dev/null
+++ b/diskann-benchmark-multi-vector/examples/test.json
@@ -0,0 +1,47 @@
+{
+  "search_directories": [],
+  "jobs": [
+    {
+      "type": "multi-vector-op",
+      "content": {
+        "element_type": "float32",
+        "implementation": "optimized",
+        "runs": [
+          { "operation": "chamfer", "num_query_vectors": 8,  "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 2, "num_measurements": 1 },
+          { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 2, "num_measurements": 1 }
+        ]
+      }
+    },
+    {
+      "type": "multi-vector-op",
+      "content": {
+        "element_type": "float16",
+        "implementation": "optimized",
+        "runs": [
+          { "operation": "chamfer", "num_query_vectors": 8,  "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 2, "num_measurements": 1 }
+        ]
+      }
+    },
+    {
+      "type": "multi-vector-op",
+      "content": {
+        "element_type": "float32",
+        "implementation": "reference",
+        "runs": [
+          { "operation": "chamfer", "num_query_vectors": 8,  "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 2, "num_measurements": 1 },
+          { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 2, "num_measurements": 1 }
+        ]
+      }
+    },
+    {
+      "type": "multi-vector-op",
+      "content": {
+        "element_type": "float16",
+        "implementation": "reference",
+        "runs": [
+          { "operation": "max_sim", "num_query_vectors": 8,  "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 2, "num_measurements": 1 }
+        ]
+      }
+    }
+  ]
+}
diff --git a/diskann-benchmark-multi-vector/examples/tolerance.json b/diskann-benchmark-multi-vector/examples/tolerance.json
new file mode 100644
index 000000000..8d5997199
--- /dev/null
+++ b/diskann-benchmark-multi-vector/examples/tolerance.json
@@ -0,0 +1,16 @@
+{
+  "checks": [
+    {
+      "input": {
+        "type": "multi-vector-op",
+        "content": {}
+      },
+      "tolerance": {
+        "type": "multi-vector-tolerance",
+        "content": {
+          "min_time_regression": 0.05
+        }
+      }
+    }
+  ]
+}
diff --git a/diskann-benchmark-multi-vector/src/bin.rs b/diskann-benchmark-multi-vector/src/bin.rs
new file mode 100644
index 000000000..d595533e7
--- /dev/null
+++ b/diskann-benchmark-multi-vector/src/bin.rs
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) Microsoft Corporation.
+ * Licensed under the MIT license.
+ */
+
+use diskann_benchmark_multi_vector::{register, MultiVectorOp};
+use diskann_benchmark_runner::{output, registry, App, Output};
+
+pub fn main() -> anyhow::Result<()> {
+    // Create the pocket bench application.
+    let app = App::parse();
+    main_inner(&app, &mut output::default())
+}
+
+fn main_inner(app: &App, output: &mut dyn Output) -> anyhow::Result<()> {
+    // Register inputs and benchmarks.
+    let mut inputs = registry::Inputs::new();
+    inputs.register::<MultiVectorOp>()?;
+
+    let mut benchmarks = registry::Benchmarks::new();
+    register(&mut benchmarks);
+
+    // Here we go!
+    app.run(&inputs, &benchmarks, output)
+}
+
+///////////
+// Tests //
+///////////
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use std::path::{Path, PathBuf};
+
+    use diskann_benchmark_runner::app::{Check, Commands};
+
+    fn run_integration_test(input_file: &Path, output_file: &Path) {
+        let commands = Commands::Run {
+            input_file: input_file.to_str().unwrap().into(),
+            output_file: output_file.to_str().unwrap().into(),
+            dry_run: false,
+            allow_debug: true,
+        };
+
+        let app = App::from_commands(commands);
+
+        let mut output = output::Memory::new();
+        main_inner(&app, &mut output).unwrap();
+        println!(
+            "output = {}",
+            String::from_utf8(output.into_inner()).unwrap()
+        );
+
+        assert!(output_file.exists());
+    }
+
+    fn run_check_test(input_file: &Path, tolerances: &Path) -> String {
+        let commands = Commands::Check(Check::Verify {
+            tolerances: tolerances.to_str().unwrap().into(),
+            input_file: input_file.to_str().unwrap().into(),
+        });
+
+        let app = App::from_commands(commands);
+
+        let mut output = output::Memory::new();
+        main_inner(&app, &mut output).unwrap();
+        String::from_utf8(output.into_inner()).unwrap()
+    }
+
+    #[test]
+    fn integration_test() {
+        let input_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
+            .join("examples")
+            .join("test.json");
+
+        let tempdir = tempfile::tempdir().unwrap();
+        let output_path = tempdir.path().join("output.json");
+
+        run_integration_test(&input_path, &output_path);
+    }
+
+    #[test]
+    fn check_verify() {
+        let input_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
+            .join("examples")
+            .join("test.json");
+        let tolerance_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
+            .join("examples")
+            .join("tolerance.json");
+
+        let stdout = run_check_test(&input_path, &tolerance_path);
+        println!("stdout = {}", stdout);
+    }
+}
diff --git a/diskann-benchmark-multi-vector/src/lib.rs b/diskann-benchmark-multi-vector/src/lib.rs
new file mode 100644
index 000000000..7cadf4f29
--- /dev/null
+++ b/diskann-benchmark-multi-vector/src/lib.rs
@@ -0,0 +1,992 @@
+/*
+ * Copyright (c) Microsoft Corporation.
+ * Licensed under the MIT license.
+ */
+
+//! Multi-vector distance benchmarks with regression detection.
+
+use std::{io::Write, num::NonZeroUsize};
+
+use diskann_quantization::multi_vector::{Chamfer, MatRef, MaxSim, QueryComputer, Standard};
+use diskann_vector::distance::InnerProduct;
+use diskann_vector::{DistanceFunctionMut, PureDistanceFunction};
+use half::f16;
+use rand::{
+    distr::{Distribution, StandardUniform},
+    rngs::StdRng,
+    SeedableRng,
+};
+use serde::{Deserialize, Serialize};
+use thiserror::Error;
+
+use diskann_benchmark_runner::{
+    benchmark::{PassFail, Regression},
+    dispatcher::{Description, DispatchRule, FailureScore, MatchScore},
+    utils::{
+        datatype::{self, DataType},
+        num::{relative_change, NonNegativeFinite},
+        percentiles, MicroSeconds,
+    },
+    Any, Benchmark, CheckDeserialization, Checker, Input,
+};
+
+////////////////
+// Public API //
+////////////////
+
+/// Register all multi-vector benchmarks with the runner's dispatcher.
+pub fn register(dispatcher: &mut diskann_benchmark_runner::registry::Benchmarks) {
+    register_benchmarks_impl(dispatcher)
+}
+
+///////////
+// Utils //
+///////////
+
+#[derive(Debug, Clone, Copy)]
+struct DisplayWrapper<'a, T: ?Sized>(&'a T);
+
+impl<T: ?Sized> std::ops::Deref for DisplayWrapper<'_, T> {
+    type Target = T;
+    fn deref(&self) -> &T {
+        self.0
+    }
+}
+
+////////////
+// Inputs //
+////////////
+
+/// The two distance operations exposed by [`QueryComputer`].
+#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
+#[serde(rename_all = "snake_case")]
+pub enum Operation {
+    Chamfer,
+    MaxSim,
+}
+
+impl std::fmt::Display for Operation {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let st = match self {
+            Self::Chamfer => "chamfer",
+            Self::MaxSim => "max_sim",
+        };
+        write!(f, "{}", st)
+    }
+}
+
+/// Which implementation tier to benchmark.
+#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
+#[serde(rename_all = "kebab-case")]
+enum Implementation {
+    Optimized,
+    Reference,
+}
+
+impl std::fmt::Display for Implementation {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let st = match self {
+            Self::Optimized => "optimized",
+            Self::Reference => "reference",
+        };
+        write!(f, "{}", st)
+    }
+}
+
+/// One benchmark configuration: a single (operation, shape) measurement.
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+struct Run {
+    operation: Operation,
+    num_query_vectors: NonZeroUsize,
+    num_doc_vectors: NonZeroUsize,
+    dim: NonZeroUsize,
+    loops_per_measurement: NonZeroUsize,
+    num_measurements: NonZeroUsize,
+}
+
+/// A complete multi-vector benchmark job.
+#[derive(Debug, Serialize, Deserialize)]
+pub struct MultiVectorOp {
+    element_type: DataType,
+    implementation: Implementation,
+    runs: Vec<Run>,
+}
+
+impl CheckDeserialization for MultiVectorOp {
+    fn check_deserialization(&mut self, _checker: &mut Checker) -> Result<(), anyhow::Error> {
+        Ok(())
+    }
+}
+
+macro_rules! write_field {
+    ($f:ident, $field:tt, $($expr:tt)*) => {
+        writeln!($f, "{:>18}: {}", $field, $($expr)*)
+    }
+}
+
+impl MultiVectorOp {
+    fn summarize_fields(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write_field!(f, "element type", self.element_type)?;
+        write_field!(f, "implementation", self.implementation)?;
+        write_field!(f, "number of runs", self.runs.len())?;
+        Ok(())
+    }
+}
+
+impl std::fmt::Display for MultiVectorOp {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        writeln!(f, "Multi-Vector Operation\n")?;
+        write_field!(f, "tag", Self::tag())?;
+        self.summarize_fields(f)
+    }
+}
+
+impl Input for MultiVectorOp {
+    fn tag() -> &'static str {
+        "multi-vector-op"
+    }
+
+    fn try_deserialize(
+        serialized: &serde_json::Value,
+        checker: &mut Checker,
+    ) -> anyhow::Result<Any> {
+        checker.any(Self::deserialize(serialized)?)
+    }
+
+    fn example() -> anyhow::Result<serde_json::Value> {
+        const NUM_QUERY_VECTORS: NonZeroUsize = NonZeroUsize::new(32).unwrap();
+        const NUM_DOC_VECTORS: NonZeroUsize = NonZeroUsize::new(64).unwrap();
+        const DIM: NonZeroUsize = NonZeroUsize::new(128).unwrap();
+        const LOOPS_PER_MEASUREMENT: NonZeroUsize = NonZeroUsize::new(200).unwrap();
+        const NUM_MEASUREMENTS: NonZeroUsize = NonZeroUsize::new(100).unwrap();
+
+        let runs = vec![
+            Run {
+                operation: Operation::Chamfer,
+                num_query_vectors: NUM_QUERY_VECTORS,
+                num_doc_vectors: NUM_DOC_VECTORS,
+                dim: DIM,
+                loops_per_measurement: LOOPS_PER_MEASUREMENT,
+                num_measurements: NUM_MEASUREMENTS,
+            },
+            Run {
+                operation: Operation::MaxSim,
+                num_query_vectors: NUM_QUERY_VECTORS,
+                num_doc_vectors: NUM_DOC_VECTORS,
+                dim: DIM,
+                loops_per_measurement: LOOPS_PER_MEASUREMENT,
+                num_measurements: NUM_MEASUREMENTS,
+            },
+        ];
+
+        Ok(serde_json::to_value(&Self {
+            element_type: DataType::Float32,
+            implementation: Implementation::Optimized,
+            runs,
+        })?)
+    }
+}
+
+//////////////////////
+// Regression Check //
+//////////////////////
+
+/// Tolerance thresholds for multi-vector benchmark regression detection.
+///
+/// Each field specifies the maximum allowed relative increase in the corresponding metric.
+/// For example, a value of `0.05` means a 5% increase is tolerated.
+#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
+struct MultiVectorTolerance {
+    min_time_regression: NonNegativeFinite,
+}
+
+impl CheckDeserialization for MultiVectorTolerance {
+    fn check_deserialization(&mut self, _checker: &mut Checker) -> Result<(), anyhow::Error> {
+        Ok(())
+    }
+}
+
+impl Input for MultiVectorTolerance {
+    fn tag() -> &'static str {
+        "multi-vector-tolerance"
+    }
+
+    fn try_deserialize(
+        serialized: &serde_json::Value,
+        checker: &mut Checker,
+    ) -> anyhow::Result<Any> {
+        checker.any(Self::deserialize(serialized)?)
+    }
+
+    fn example() -> anyhow::Result<serde_json::Value> {
+        const EXAMPLE: NonNegativeFinite = match NonNegativeFinite::new(0.05) {
+            Ok(v) => v,
+            Err(_) => panic!("use a non-negative finite please"),
+        };
+
+        Ok(serde_json::to_value(MultiVectorTolerance {
+            min_time_regression: EXAMPLE,
+        })?)
+    }
+}
+
+/// Per-run comparison result pairing the before/after minimum with its tolerance.
+#[derive(Debug, Serialize)]
+struct Comparison {
+    run: Run,
+    tolerance: MultiVectorTolerance,
+    before_min: f64,
+    after_min: f64,
+}
+
+/// Aggregated result of the regression check across all runs.
+#[derive(Debug, Serialize)]
+struct CheckResult {
+    checks: Vec<Comparison>,
+}
+
+impl std::fmt::Display for CheckResult {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let header = [
+            "Operation",
+            "Q",
+            "D",
+            "Dim",
+            "Min Before (ns/IP @ Dim)",
+            "Min After (ns/IP @ Dim)",
+            "Change (%)",
+            "Remark",
+        ];
+
+        let mut table = diskann_benchmark_runner::utils::fmt::Table::new(header, self.checks.len());
+
+        for (i, c) in self.checks.iter().enumerate() {
+            let mut row = table.row(i);
+            let change = relative_change(c.before_min, c.after_min);
+
+            row.insert(c.run.operation, 0);
+            row.insert(c.run.num_query_vectors, 1);
+            row.insert(c.run.num_doc_vectors, 2);
+            row.insert(c.run.dim, 3);
+            row.insert(format!("{:.3}", c.before_min), 4);
+            row.insert(format!("{:.3}", c.after_min), 5);
+            match change {
+                Ok(change) => {
+                    row.insert(format!("{:.3} %", change * 100.0), 6);
+                    if change > c.tolerance.min_time_regression.get() {
+                        row.insert("FAIL", 7);
+                    }
+                }
+                Err(err) => {
+                    row.insert("invalid", 6);
+                    row.insert(err, 7);
+                }
+            }
+        }
+
+        table.fmt(f)
+    }
+}
+
+////////////////////////////
+// Benchmark Registration //
+////////////////////////////
+
+fn register_benchmarks_impl(dispatcher: &mut diskann_benchmark_runner::registry::Benchmarks) {
+    // Optimized (architecture-dispatched QueryComputer).
+    dispatcher.register_regression(
+        "multi-vector-op-f32-optimized",
+        Kernel::<Optimized, f32>::new(),
+    );
+    dispatcher.register_regression(
+        "multi-vector-op-f16-optimized",
+        Kernel::<Optimized, f16>::new(),
+    );
+
+    // Reference (Chamfer / MaxSim fallback path).
+    dispatcher.register_regression(
+        "multi-vector-op-f32-reference",
+        Kernel::<Reference, f32>::new(),
+    );
+    dispatcher.register_regression(
+        "multi-vector-op-f16-reference",
+        Kernel::<Reference, f16>::new(),
+    );
+}
+
+//////////////
+// Dispatch //
+//////////////
+
+/// Dispatch marker for the [`QueryComputer`] implementation.
+#[derive(Debug)]
+struct Optimized;
+
+/// Dispatch marker for the [`Chamfer`] / [`MaxSim`] fallback.
+#[derive(Debug)]
+struct Reference;
+
+/// A multi-vector benchmark parameterized by implementation marker `I` and element type `T`.
+struct Kernel<I, T> {
+    _type: std::marker::PhantomData<(I, T)>,
+}
+
+impl<I, T> Kernel<I, T> {
+    fn new() -> Self {
+        Self {
+            _type: std::marker::PhantomData,
+        }
+    }
+}
+
+#[derive(Debug, Error)]
+#[error("implementation {0} is not registered for this benchmark")]
+pub(crate) struct ImplementationMismatch(Implementation);
+
+impl DispatchRule<Implementation> for Optimized {
+    type Error = ImplementationMismatch;
+
+    fn try_match(from: &Implementation) -> Result<MatchScore, FailureScore> {
+        if *from == Implementation::Optimized {
+            Ok(MatchScore(0))
+        } else {
+            Err(FailureScore(1))
+        }
+    }
+
+    fn convert(from: Implementation) -> Result<Self, Self::Error> {
+        if from == Implementation::Optimized {
+            Ok(Optimized)
+        } else {
+            Err(ImplementationMismatch(from))
+        }
+    }
+
+    fn description(
+        f: &mut std::fmt::Formatter<'_>,
+        from: Option<&Implementation>,
+    ) -> std::fmt::Result {
+        match from {
+            None => write!(f, "QueryComputer (architecture-dispatched)"),
+            Some(impl_) => {
+                if Self::try_match(impl_).is_ok() {
+                    write!(f, "matched {}", impl_)
+                } else {
+                    write!(f, "expected {}, got {}", Implementation::Optimized, impl_)
+                }
+            }
+        }
+    }
+}
+
+/// Dispatch rule for the [`Reference`] marker type.
+///
+/// Only `Implementation::Reference` is accepted; every other variant is rejected.
+impl DispatchRule<Implementation> for Reference {
+    type Error = ImplementationMismatch;
+
+    /// `MatchScore(0)` on the `Reference` variant, `FailureScore(1)` otherwise.
+    fn try_match(from: &Implementation) -> Result<MatchScore, FailureScore> {
+        match from {
+            Implementation::Reference => Ok(MatchScore(0)),
+            _ => Err(FailureScore(1)),
+        }
+    }
+
+    /// Yields the marker value, or hands the rejected variant back in the error.
+    fn convert(from: Implementation) -> Result<Self, Self::Error> {
+        match from {
+            Implementation::Reference => Ok(Reference),
+            other => Err(ImplementationMismatch(other)),
+        }
+    }
+
+    /// With no input, names the backend; otherwise reports match or mismatch.
+    fn description(
+        f: &mut std::fmt::Formatter<'_>,
+        from: Option<&Implementation>,
+    ) -> std::fmt::Result {
+        let wanted = Implementation::Reference;
+        match from {
+            None => write!(f, "Chamfer / MaxSim fallback"),
+            Some(got) if Self::try_match(got).is_ok() => write!(f, "matched {}", got),
+            Some(got) => write!(f, "expected {}, got {}", wanted, got),
+        }
+    }
+}
+
+/// Runner integration for [`Kernel`]: matches a [`MultiVectorOp`] on element
+/// type and implementation, then times every run it lists.
+impl<I, T> Benchmark for Kernel<I, T>
+where
+    datatype::Type<T>: DispatchRule<datatype::DataType>,
+    I: DispatchRule<Implementation, Error = ImplementationMismatch> + 'static,
+    Kernel<I, T>: RunBenchmark<I>,
+    T: 'static,
+{
+    type Input = MultiVectorOp;
+    type Output = Vec<RunResult>;
+
+    /// `MatchScore(0)` when both fields match; otherwise the summed penalties:
+    /// 10 for an element-type mismatch plus `2 + score` for an implementation mismatch.
+    fn try_match(&self, from: &MultiVectorOp) -> Result<MatchScore, FailureScore> {
+        let type_penalty = datatype::Type::<T>::try_match(&from.element_type)
+            .err()
+            .map(|_| 10u32);
+        let impl_penalty = I::try_match(&from.implementation)
+            .err()
+            .map(|FailureScore(score)| 2 + score);
+
+        match (type_penalty, impl_penalty) {
+            (None, None) => Ok(MatchScore(0)),
+            (a, b) => Err(FailureScore(a.unwrap_or(0) + b.unwrap_or(0))),
+        }
+    }
+
+    /// Validate the implementation, echo `input`, run the benchmark, and print the result table.
+    fn run(
+        &self,
+        input: &MultiVectorOp,
+        _: diskann_benchmark_runner::Checkpoint<'_>,
+        mut output: &mut dyn diskann_benchmark_runner::Output,
+    ) -> anyhow::Result<Self::Output> {
+        let _marker: I = I::convert(input.implementation)?;
+        writeln!(output, "{}", input)?;
+        let results = self.run_benchmark(input)?;
+        writeln!(output, "\n\n{}", DisplayWrapper(results.as_slice()))?;
+        Ok(results)
+    }
+
+    /// Without an input, list what this kernel handles; with one, list only
+    /// the fields that fail to match.
+    fn description(
+        &self,
+        f: &mut std::fmt::Formatter<'_>,
+        input: Option<&MultiVectorOp>,
+    ) -> std::fmt::Result {
+        if let Some(input) = input {
+            if let Err(err) = datatype::Type::<T>::try_match_verbose(&input.element_type) {
+                writeln!(f, "\n    - Mismatched element type: {}", err)?;
+            }
+            if let Err(err) = I::try_match_verbose(&input.implementation) {
+                writeln!(f, "\n    - Mismatched implementation: {}", err)?;
+            }
+            return Ok(());
+        }
+
+        let element_type = Description::<datatype::DataType, datatype::Type<T>>::new();
+        writeln!(f, "- Element Type: {}", element_type)?;
+        writeln!(
+            f,
+            "- Implementation: {}",
+            Description::<Implementation, I>::new()
+        )
+    }
+}
+
+/// Regression gate: compares the normalized minimum latency of each run
+/// before and after a change against the configured tolerance.
+impl<I, T> Regression for Kernel<I, T>
+where
+    datatype::Type<T>: DispatchRule<datatype::DataType>,
+    I: DispatchRule<Implementation, Error = ImplementationMismatch> + 'static,
+    Kernel<I, T>: RunBenchmark<I>,
+    T: 'static,
+{
+    type Tolerances = MultiVectorTolerance;
+    type Pass = CheckResult;
+    type Fail = CheckResult;
+
+    /// Pair up `before` and `after` run-by-run and fail when any run's relative
+    /// change in minimum time exceeds `tolerance.min_time_regression`, or when
+    /// that change cannot be computed.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the result sets differ in length or paired entries differ in `run`.
+    fn check(
+        &self,
+        tolerance: &MultiVectorTolerance,
+        _input: &MultiVectorOp,
+        before: &Vec<RunResult>,
+        after: &Vec<RunResult>,
+    ) -> anyhow::Result<PassFail<CheckResult, CheckResult>> {
+        anyhow::ensure!(
+            before.len() == after.len(),
+            "before has {} runs but after has {}",
+            before.len(),
+            after.len(),
+        );
+
+        let limit = tolerance.min_time_regression.get();
+        let mut regressed = false;
+        let mut checks = Vec::with_capacity(before.len());
+        for (i, (b, a)) in before.iter().zip(after).enumerate() {
+            anyhow::ensure!(b.run == a.run, "run {i} mismatched");
+
+            // Both entries describe the same run, so one divisor serves both.
+            let per_call = b.computations_per_latency() as f64;
+            let before_min = b.percentiles.minimum.as_f64() * 1000.0 / per_call;
+            let after_min = a.percentiles.minimum.as_f64() * 1000.0 / per_call;
+
+            regressed |= match relative_change(before_min, after_min) {
+                Ok(change) => change > limit,
+                Err(_) => true,
+            };
+            checks.push(Comparison {
+                run: b.run.clone(),
+                tolerance: *tolerance,
+                before_min,
+                after_min,
+            });
+        }
+
+        let check = CheckResult { checks };
+        if regressed {
+            Ok(PassFail::Fail(check))
+        } else {
+            Ok(PassFail::Pass(check))
+        }
+    }
+}
+
+///////////////
+// Benchmark //
+///////////////
+
+trait RunBenchmark<I> {
+    fn run_benchmark(&self, input: &MultiVectorOp) -> anyhow::Result<Vec<RunResult>>;
+}
+
+#[derive(Debug, Serialize, Deserialize)]
+struct RunResult {
+    /// The [`Run`] configuration these measurements were taken with.
+    run: Run,
+    /// One latency per measurement, each spanning `loops_per_measurement` calls.
+    latencies: Vec<MicroSeconds>,
+    /// Summary statistics (e.g. minimum and mean) computed from `latencies`.
+    percentiles: percentiles::Percentiles<MicroSeconds>,
+}
+
+impl RunResult {
+    /// Inner products per recorded latency: `Q * D * loops_per_measurement`.
+    fn computations_per_latency(&self) -> usize {
+        let run = &self.run;
+        run.num_query_vectors.get() * run.num_doc_vectors.get() * run.loops_per_measurement.get()
+    }
+}
+
+/// Renders run results as a table of per-inner-product timings.
+impl std::fmt::Display for DisplayWrapper<'_, [RunResult]> {
+    /// Writes nothing for an empty slice; otherwise a legend line and one row per run.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        if self.is_empty() {
+            return Ok(());
+        }
+
+        // ns/IP is normalized as `min_latency_us * 1000 / (Q * D * loops)` and is
+        // approximately linear in `dim`. Compare across rows with the same `Dim`;
+        // divide further by `Dim` to recover ns per scalar multiply.
+        writeln!(
+            f,
+            "ns/IP = time per (query, doc) inner-product call (~ linear in Dim)"
+        )?;
+
+        let header = [
+            "Operation",
+            "Q",
+            "D",
+            "Dim",
+            "Min Time (ns/IP @ Dim)",
+            "Mean Time (ns/IP @ Dim)",
+            "Loops",
+            "Measurements",
+        ];
+
+        let mut table = diskann_benchmark_runner::utils::fmt::Table::new(header, self.len());
+
+        self.iter().enumerate().for_each(|(row, r)| {
+            let mut row = table.row(row);
+
+            // Use the precomputed minimum rather than rescanning `latencies`:
+            // it is the value the regression check compares, and it needs no
+            // `u64::MAX` placeholder for an empty latency list.
+            let min_latency = r.percentiles.minimum;
+            let mean_latency = r.percentiles.mean;
+
+            let computations_per_latency = r.computations_per_latency() as f64;
+
+            // Convert time from micro-seconds to nano-seconds per inner-product call
+            // (one (query, doc) pair, ~ linear in dim).
+            let min_time = min_latency.as_f64() / computations_per_latency * 1000.0;
+            let mean_time = mean_latency / computations_per_latency * 1000.0;
+
+            row.insert(r.run.operation, 0);
+            row.insert(r.run.num_query_vectors, 1);
+            row.insert(r.run.num_doc_vectors, 2);
+            row.insert(r.run.dim, 3);
+            row.insert(format!("{:.3}", min_time), 4);
+            row.insert(format!("{:.3}", mean_time), 5);
+            row.insert(r.run.loops_per_measurement, 6);
+            row.insert(r.run.num_measurements, 7);
+        });
+
+        table.fmt(f)
+    }
+}
+
+/// Collect one latency per measurement, each timing `loops_per_measurement` calls of `body`.
+fn run_loops<F>(run: &Run, mut body: F) -> RunResult
+where
+    F: FnMut(),
+{
+    let (samples, loops) = (run.num_measurements.get(), run.loops_per_measurement.get());
+    let mut latencies = Vec::with_capacity(samples);
+    for _ in 0..samples {
+        let start = std::time::Instant::now();
+        for _ in 0..loops {
+            body();
+        }
+        latencies.push(start.elapsed().into());
+    }
+    let percentiles = percentiles::compute_percentiles(&mut latencies).unwrap();
+    RunResult {
+        run: run.clone(),
+        latencies,
+        percentiles,
+    }
+}
+
+///////////////////
+// Data fixtures //
+///////////////////
+
+const RNG_SEED: u64 = 0x12345;
+
+/// Randomly generated query and document element buffers for one [`Run`].
+///
+/// The buffers are flat; [`Data::query`] and [`Data::doc`] wrap them as matrices.
+struct Data<T> {
+    query_data: Box<[T]>,
+    doc_data: Box<[T]>,
+}
+
+impl<T: Copy> Data<T>
+where
+    StandardUniform: Distribution<T>,
+{
+    /// Fill both buffers from a fixed-seed RNG (queries first, then documents).
+    fn new(run: &Run) -> Self {
+        let mut rng = StdRng::seed_from_u64(RNG_SEED);
+        let dim = run.dim.get();
+        let mut sample = |len: usize| -> Box<[T]> {
+            (0..len).map(|_| StandardUniform.sample(&mut rng)).collect()
+        };
+        let query_data = sample(run.num_query_vectors.get() * dim);
+        let doc_data = sample(run.num_doc_vectors.get() * dim);
+
+        Self {
+            query_data,
+            doc_data,
+        }
+    }
+
+    /// View the query buffer through `Standard::new(num_query_vectors, dim)`.
+    fn query(&self, run: &Run) -> MatRef<'_, Standard<T>> {
+        let layout = Standard::new(run.num_query_vectors.get(), run.dim.get()).unwrap();
+        MatRef::new(layout, &self.query_data).unwrap()
+    }
+
+    /// View the document buffer through `Standard::new(num_doc_vectors, dim)`.
+    fn doc(&self, run: &Run) -> MatRef<'_, Standard<T>> {
+        let layout = Standard::new(run.num_doc_vectors.get(), run.dim.get()).unwrap();
+        MatRef::new(layout, &self.doc_data).unwrap()
+    }
+}
+
+/////////////////////
+// Implementations //
+/////////////////////
+
+/// Drive the architecture-dispatched [`QueryComputer`] path for every run in `input`.
+/// Input data is regenerated per run from the fixed RNG seed.
+fn run_optimized<T>(input: &MultiVectorOp) -> anyhow::Result<Vec<RunResult>>
+where
+    T: Copy,
+    StandardUniform: Distribution<T>,
+    QueryComputer<T>: NewFromMatRef<T>,
+{
+    let timed = input.runs.iter().map(|run| {
+        let data = Data::<T>::new(run);
+        let computer = <QueryComputer<T> as NewFromMatRef<T>>::new_from(data.query(run));
+        let doc = data.doc(run);
+
+        match run.operation {
+            Operation::Chamfer => run_loops(run, || {
+                let v = computer.chamfer(doc);
+                std::hint::black_box(v);
+            }),
+            Operation::MaxSim => {
+                let mut scores = vec![0.0f32; run.num_query_vectors.get()];
+                run_loops(run, || {
+                    computer.max_sim(doc, &mut scores);
+                    std::hint::black_box(&mut scores);
+                })
+            }
+        }
+    });
+    Ok(timed.collect())
+}
+
+/// Drive the [`Chamfer`] / [`MaxSim`] fallback path for every run in `input`.
+///
+/// Input data is regenerated per run from the fixed RNG seed.
+fn run_reference<T>(input: &MultiVectorOp) -> anyhow::Result<Vec<RunResult>>
+where
+    T: Copy,
+    StandardUniform: Distribution<T>,
+    InnerProduct: for<'a, 'b> PureDistanceFunction<&'a [T], &'b [T], f32>,
+{
+    let timed = input.runs.iter().map(|run| {
+        let data = Data::<T>::new(run);
+        let query = data.query(run);
+        let doc = data.doc(run);
+
+        match run.operation {
+            Operation::Chamfer => run_loops(run, || {
+                let v = Chamfer::evaluate(query.into(), doc);
+                std::hint::black_box(v);
+            }),
+            Operation::MaxSim => {
+                let mut scores = vec![0.0f32; run.num_query_vectors.get()];
+                run_loops(run, || {
+                    let mut max_sim = MaxSim::new(&mut scores).unwrap();
+                    let _ = max_sim.evaluate(query.into(), doc);
+                    std::hint::black_box(&mut scores);
+                })
+            }
+        }
+    });
+    Ok(timed.collect())
+}
+
+impl RunBenchmark<Optimized> for Kernel<Optimized, f32> {
+    fn run_benchmark(&self, input: &MultiVectorOp) -> anyhow::Result<Vec<RunResult>> {
+        run_optimized::<f32>(input)
+    }
+}
+
+impl RunBenchmark<Optimized> for Kernel<Optimized, f16> {
+    fn run_benchmark(&self, input: &MultiVectorOp) -> anyhow::Result<Vec<RunResult>> {
+        run_optimized::<f16>(input)
+    }
+}
+
+impl RunBenchmark<Reference> for Kernel<Reference, f32> {
+    fn run_benchmark(&self, input: &MultiVectorOp) -> anyhow::Result<Vec<RunResult>> {
+        run_reference::<f32>(input)
+    }
+}
+
+impl RunBenchmark<Reference> for Kernel<Reference, f16> {
+    fn run_benchmark(&self, input: &MultiVectorOp) -> anyhow::Result<Vec<RunResult>> {
+        run_reference::<f16>(input)
+    }
+}
+
+/// Uniform constructor letting generic code build a [`QueryComputer`] per element type.
+trait NewFromMatRef<T: Copy> {
+    fn new_from(query: MatRef<'_, Standard<T>>) -> QueryComputer<T>;
+}
+
+impl NewFromMatRef<f32> for QueryComputer<f32> {
+    fn new_from(query: MatRef<'_, Standard<f32>>) -> Self {
+        Self::new(query)
+    }
+}
+
+impl NewFromMatRef<f16> for QueryComputer<f16> {
+    fn new_from(query: MatRef<'_, Standard<f16>>) -> Self {
+        Self::new(query)
+    }
+}
+
+///////////
+// Tests //
+///////////
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use diskann_benchmark_runner::{
+        benchmark::{PassFail, Regression},
+        utils::percentiles::compute_percentiles,
+    };
+
+    fn tiny_run(operation: Operation) -> Run {
+        Run {
+            operation,
+            num_query_vectors: NonZeroUsize::new(2).unwrap(),
+            num_doc_vectors: NonZeroUsize::new(2).unwrap(),
+            dim: NonZeroUsize::new(4).unwrap(),
+            loops_per_measurement: NonZeroUsize::new(1).unwrap(),
+            num_measurements: NonZeroUsize::new(1).unwrap(),
+        }
+    }
+
+    fn tiny_op() -> MultiVectorOp {
+        MultiVectorOp {
+            element_type: DataType::Float32,
+            implementation: Implementation::Optimized,
+            runs: vec![tiny_run(Operation::Chamfer)],
+        }
+    }
+
+    fn tiny_result(operation: Operation, minimum: u64) -> RunResult {
+        let run = tiny_run(operation);
+        let minimum = MicroSeconds::new(minimum);
+        let mut latencies = vec![minimum];
+        let percentiles = compute_percentiles(&mut latencies).unwrap();
+        RunResult {
+            run,
+            latencies,
+            percentiles,
+        }
+    }
+
+    fn tolerance(limit: f64) -> MultiVectorTolerance {
+        MultiVectorTolerance {
+            min_time_regression: NonNegativeFinite::new(limit).unwrap(),
+        }
+    }
+
+    #[test]
+    fn check_rejects_mismatched_runs() {
+        let kernel = Kernel::<Optimized, f32>::new();
+
+        let err = kernel
+            .check(
+                &tolerance(0.0),
+                &tiny_op(),
+                &vec![tiny_result(Operation::Chamfer, 100)],
+                &vec![tiny_result(Operation::MaxSim, 100)],
+            )
+            .unwrap_err();
+
+        assert_eq!(err.to_string(), "run 0 mismatched");
+    }
+
+    #[test]
+    fn check_allows_negative_relative_change() {
+        let kernel = Kernel::<Optimized, f32>::new();
+
+        let result = kernel
+            .check(
+                &tolerance(0.0),
+                &tiny_op(),
+                &vec![tiny_result(Operation::Chamfer, 100)],
+                &vec![tiny_result(Operation::Chamfer, 95)],
+            )
+            .unwrap();
+
+        assert!(matches!(result, PassFail::Pass(_)));
+    }
+
+    #[test]
+    fn check_passes_on_tolerance_boundary() {
+        let kernel = Kernel::<Optimized, f32>::new();
+
+        let result = kernel
+            .check(
+                &tolerance(0.05),
+                &tiny_op(),
+                &vec![tiny_result(Operation::Chamfer, 100)],
+                &vec![tiny_result(Operation::Chamfer, 105)],
+            )
+            .unwrap();
+
+        assert!(matches!(result, PassFail::Pass(_)));
+    }
+
+    #[test]
+    fn check_fails_above_tolerance_boundary() {
+        let kernel = Kernel::<Optimized, f32>::new();
+
+        let result = kernel
+            .check(
+                &tolerance(0.05),
+                &tiny_op(),
+                &vec![tiny_result(Operation::Chamfer, 100)],
+                &vec![tiny_result(Operation::Chamfer, 106)],
+            )
+            .unwrap();
+
+        assert!(matches!(result, PassFail::Fail(_)));
+    }
+
+    #[test]
+    fn check_result_display_includes_failure_details() {
+        let check = CheckResult {
+            checks: vec![Comparison {
+                run: tiny_run(Operation::Chamfer),
+                tolerance: tolerance(0.05),
+                before_min: 100.0,
+                after_min: 106.0,
+            }],
+        };
+
+        let rendered = check.to_string();
+        assert!(rendered.contains("Operation"), "rendered = {rendered}");
+        assert!(rendered.contains("chamfer"), "rendered = {rendered}");
+        assert!(rendered.contains("100.000"), "rendered = {rendered}");
+        assert!(rendered.contains("106.000"), "rendered = {rendered}");
+        assert!(rendered.contains("6.000 %"), "rendered = {rendered}");
+        assert!(rendered.contains("FAIL"), "rendered = {rendered}");
+    }
+
+    /// A minimum latency of 0 means the measurement was too fast to yield a
+    /// reliable signal, so a regression *could* slip through unnoticed. With
+    /// both "before" and "after" at 0, `check` must report `Fail`, not `Pass`.
+    #[test]
+    fn zero_values_rejected() {
+        let kernel = Kernel::<Optimized, f32>::new();
+
+        let result = kernel
+            .check(
+                &tolerance(0.05),
+                &tiny_op(),
+                &vec![tiny_result(Operation::Chamfer, 0)],
+                &vec![tiny_result(Operation::Chamfer, 0)],
+            )
+            .unwrap();
+
+        assert!(matches!(result, PassFail::Fail(_)));
+    }
+
+    /// Sanity-check that `QueryComputer::chamfer` and `Chamfer::evaluate` agree
+    /// to within 1e-4 (absolute) on a 5-query x 7-doc x 16-dim f32 fixture.
+    #[test]
+    fn optimized_chamfer_matches_reference_f32() {
+        let run = Run {
+            operation: Operation::Chamfer,
+            num_query_vectors: NonZeroUsize::new(5).unwrap(),
+            num_doc_vectors: NonZeroUsize::new(7).unwrap(),
+            dim: NonZeroUsize::new(16).unwrap(),
+            loops_per_measurement: NonZeroUsize::new(1).unwrap(),
+            num_measurements: NonZeroUsize::new(1).unwrap(),
+        };
+
+        let data = Data::<f32>::new(&run);
+        let query = data.query(&run);
+        let doc = data.doc(&run);
+
+        let optimized = QueryComputer::<f32>::new(query).chamfer(doc);
+        let reference = Chamfer::evaluate(query.into(), doc);
+
+        assert!(
+            (optimized - reference).abs() < 1e-4,
+            "optimized={optimized}, reference={reference}",
+        );
+    }
+}
diff --git a/results.json b/results.json
new file mode 100644
index 000000000..f061f6750
--- /dev/null
+++ b/results.json
@@ -0,0 +1,2150 @@
+[
+  {
+    "input": {
+      "content": {
+        "element_type": "float32",
+        "implementation": "optimized",
+        "runs": [
+          {
+            "dim": 128,
+            "loops_per_measurement": 500,
+            "num_doc_vectors": 32,
+            "num_measurements": 50,
+            "num_query_vectors": 8,
+            "operation": "chamfer"
+          },
+          {
+            "dim": 256,
+            "loops_per_measurement": 100,
+            "num_doc_vectors": 64,
+            "num_measurements": 50,
+            "num_query_vectors": 16,
+            "operation": "chamfer"
+          },
+          {
+            "dim": 384,
+            "loops_per_measurement": 20,
+            "num_doc_vectors": 128,
+            "num_measurements": 50,
+            "num_query_vectors": 32,
+            "operation": "chamfer"
+          },
+          {
+            "dim": 256,
+            "loops_per_measurement": 200,
+            "num_doc_vectors": 16,
+            "num_measurements": 50,
+            "num_query_vectors": 32,
+            "operation": "chamfer"
+          },
+          {
+            "dim": 264,
+            "loops_per_measurement": 50,
+            "num_doc_vectors": 32,
+            "num_measurements": 50,
+            "num_query_vectors": 64,
+            "operation": "chamfer"
+          },
+          {
+            "dim": 128,
+            "loops_per_measurement": 10,
+            "num_doc_vectors": 1250,
+            "num_measurements": 50,
+            "num_query_vectors": 32,
+            "operation": "chamfer"
+          },
+          {
+            "dim": 512,
+            "loops_per_measurement": 2,
+            "num_doc_vectors": 1250,
+            "num_measurements": 20,
+            "num_query_vectors": 64,
+            "operation": "chamfer"
+          },
+          {
+            "dim": 128,
+            "loops_per_measurement": 200,
+            "num_doc_vectors": 32,
+            "num_measurements": 50,
+            "num_query_vectors": 64,
+            "operation": "chamfer"
+          },
+          {
+            "dim": 512,
+            "loops_per_measurement": 50,
+            "num_doc_vectors": 32,
+            "num_measurements": 50,
+            "num_query_vectors": 32,
+            "operation": "chamfer"
+          },
+          {
+            "dim": 128,
+            "loops_per_measurement": 500,
+            "num_doc_vectors": 32,
+            "num_measurements": 50,
+            "num_query_vectors": 8,
+            "operation": "max_sim"
+          },
+          {
+            "dim": 256,
+            "loops_per_measurement": 100,
+            "num_doc_vectors": 64,
+            "num_measurements": 50,
+            "num_query_vectors": 16,
+            "operation": "max_sim"
+          },
+          {
+            "dim": 384,
+            "loops_per_measurement": 20,
+            "num_doc_vectors": 128,
+            "num_measurements": 50,
+            "num_query_vectors": 32,
+            "operation": "max_sim"
+          },
+          {
+            "dim": 256,
+            "loops_per_measurement": 200,
+            "num_doc_vectors": 16,
+            "num_measurements": 50,
+            "num_query_vectors": 32,
+            "operation": "max_sim"
+          },
+          {
+            "dim": 264,
+            "loops_per_measurement": 50,
+            "num_doc_vectors": 32,
+            "num_measurements": 50,
+            "num_query_vectors": 64,
+            "operation": "max_sim"
+          },
+          {
+            "dim": 128,
+            "loops_per_measurement": 10,
+            "num_doc_vectors": 1250,
+            "num_measurements": 50,
+            "num_query_vectors": 32,
+            "operation": "max_sim"
+          },
+          {
+            "dim": 512,
+            "loops_per_measurement": 2,
+            "num_doc_vectors": 1250,
+            "num_measurements": 20,
+            "num_query_vectors": 64,
+            "operation": "max_sim"
+          },
+          {
+            "dim": 128,
+            "loops_per_measurement": 200,
+            "num_doc_vectors": 32,
+            "num_measurements": 50,
+            "num_query_vectors": 64,
+            "operation": "max_sim"
+          },
+          {
+            "dim": 512,
+            "loops_per_measurement": 50,
+            "num_doc_vectors": 32,
+            "num_measurements": 50,
+            "num_query_vectors": 32,
+            "operation": "max_sim"
+          }
+        ]
+      },
+      "type": "multi-vector-op"
+    },
+    "results": [
+      {
+        "latencies": [
+          777,
+          777,
+          778,
+          780,
+          780,
+          781,
+          804,
+          838,
+          838,
+          838,
+          838,
+          839,
+          839,
+          839,
+          840,
+          842,
+          845,
+          850,
+          899,
+          926,
+          927,
+          931,
+          932,
+          937,
+          939,
+          956,
+          978,
+          1034,
+          1035,
+          1036,
+          1053,
+          1064,
+          1065,
+          1147,
+          1164,
+          1165,
+          1165,
+          1166,
+          1173,
+          1221,
+          1323,
+          1333,
+          1350,
+          1352,
+          1353,
+          1353,
+          1357,
+          1393,
+          1529,
+          1537
+        ],
+        "percentiles": {
+          "mean": 1030.32,
+          "median": 947.5,
+          "minimum": 777,
+          "p90": 1353,
+          "p99": 1537
+        },
+        "run": {
+          "dim": 128,
+          "loops_per_measurement": 500,
+          "num_doc_vectors": 32,
+          "num_measurements": 50,
+          "num_query_vectors": 8,
+          "operation": "chamfer"
+        }
+      },
+      {
+        "latencies": [
+          1029,
+          1029,
+          1030,
+          1030,
+          1030,
+          1030,
+          1030,
+          1031,
+          1032,
+          1034,
+          1035,
+          1038,
+          1050,
+          1058,
+          1070,
+          1112,
+          1112,
+          1112,
+          1112,
+          1112,
+          1112,
+          1112,
+          1113,
+          1117,
+          1119,
+          1120,
+          1123,
+          1145,
+          1146,
+          1146,
+          1146,
+          1148,
+          1152,
+          1167,
+          1192,
+          1192,
+          1192,
+          1192,
+          1193,
+          1207,
+          1235,
+          1251,
+          1254,
+          1256,
+          1257,
+          1261,
+          1293,
+          1330,
+          1330,
+          1344
+        ],
+        "percentiles": {
+          "mean": 1139.22,
+          "median": 1119.5,
+          "minimum": 1029,
+          "p90": 1261,
+          "p99": 1344
+        },
+        "run": {
+          "dim": 256,
+          "loops_per_measurement": 100,
+          "num_doc_vectors": 64,
+          "num_measurements": 50,
+          "num_query_vectors": 16,
+          "operation": "chamfer"
+        }
+      },
+      {
+        "latencies": [
+          1210,
+          1210,
+          1210,
+          1210,
+          1210,
+          1210,
+          1210,
+          1210,
+          1210,
+          1211,
+          1212,
+          1212,
+          1212,
+          1212,
+          1213,
+          1213,
+          1213,
+          1213,
+          1213,
+          1214,
+          1217,
+          1217,
+          1220,
+          1223,
+          1225,
+          1226,
+          1227,
+          1229,
+          1231,
+          1235,
+          1235,
+          1239,
+          1239,
+          1240,
+          1244,
+          1249,
+          1252,
+          1259,
+          1264,
+          1270,
+          1281,
+          1294,
+          1299,
+          1306,
+          1312,
+          1315,
+          1332,
+          1341,
+          1383,
+          1484
+        ],
+        "percentiles": {
+          "mean": 1246.32,
+          "median": 1225.5,
+          "minimum": 1210,
+          "p90": 1315,
+          "p99": 1484
+        },
+        "run": {
+          "dim": 384,
+          "loops_per_measurement": 20,
+          "num_doc_vectors": 128,
+          "num_measurements": 50,
+          "num_query_vectors": 32,
+          "operation": "chamfer"
+        }
+      },
+      {
+        "latencies": [
+          958,
+          958,
+          958,
+          958,
+          958,
+          960,
+          960,
+          960,
+          961,
+          961,
+          961,
+          961,
+          961,
+          961,
+          961,
+          961,
+          961,
+          961,
+          961,
+          961,
+          961,
+          961,
+          961,
+          961,
+          961,
+          961,
+          961,
+          962,
+          962,
+          963,
+          964,
+          964,
+          965,
+          965,
+          965,
+          966,
+          966,
+          973,
+          974,
+          974,
+          981,
+          981,
+          983,
+          985,
+          987,
+          987,
+          987,
+          990,
+          999,
+          999
+        ],
+        "percentiles": {
+          "mean": 967.42,
+          "median": 961.0,
+          "minimum": 958,
+          "p90": 987,
+          "p99": 999
+        },
+        "run": {
+          "dim": 256,
+          "loops_per_measurement": 200,
+          "num_doc_vectors": 16,
+          "num_measurements": 50,
+          "num_query_vectors": 32,
+          "operation": "chamfer"
+        }
+      },
+      {
+        "latencies": [
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1018,
+          1018,
+          1018,
+          1018,
+          1018,
+          1018,
+          1018,
+          1018,
+          1018,
+          1019,
+          1019,
+          1019,
+          1019,
+          1020,
+          1020,
+          1020,
+          1020,
+          1020,
+          1020,
+          1021,
+          1022,
+          1023,
+          1023,
+          1026,
+          1029,
+          1031,
+          1032,
+          1033,
+          1034,
+          1035,
+          1036,
+          1037,
+          1041,
+          1044,
+          1044,
+          1045,
+          1046,
+          1065
+        ],
+        "percentiles": {
+          "mean": 1024.58,
+          "median": 1019.5,
+          "minimum": 1017,
+          "p90": 1044,
+          "p99": 1065
+        },
+        "run": {
+          "dim": 264,
+          "loops_per_measurement": 50,
+          "num_doc_vectors": 32,
+          "num_measurements": 50,
+          "num_query_vectors": 64,
+          "operation": "chamfer"
+        }
+      },
+      {
+        "latencies": [
+          1854,
+          1855,
+          1855,
+          1855,
+          1855,
+          1855,
+          1856,
+          1856,
+          1856,
+          1857,
+          1857,
+          1857,
+          1857,
+          1857,
+          1857,
+          1858,
+          1858,
+          1858,
+          1858,
+          1858,
+          1858,
+          1858,
+          1859,
+          1860,
+          1861,
+          1861,
+          1863,
+          1866,
+          1869,
+          1870,
+          1871,
+          1871,
+          1871,
+          1872,
+          1874,
+          1875,
+          1881,
+          1883,
+          1885,
+          1885,
+          1890,
+          1892,
+          1892,
+          1892,
+          1892,
+          1899,
+          1906,
+          1909,
+          1909,
+          1916
+        ],
+        "percentiles": {
+          "mean": 1870.38,
+          "median": 1861.0,
+          "minimum": 1854,
+          "p90": 1899,
+          "p99": 1916
+        },
+        "run": {
+          "dim": 128,
+          "loops_per_measurement": 10,
+          "num_doc_vectors": 1250,
+          "num_measurements": 50,
+          "num_query_vectors": 32,
+          "operation": "chamfer"
+        }
+      },
+      {
+        "latencies": [
+          3180,
+          3180,
+          3180,
+          3180,
+          3180,
+          3181,
+          3181,
+          3181,
+          3181,
+          3183,
+          3185,
+          3187,
+          3205,
+          3206,
+          3207,
+          3208,
+          3211,
+          3218,
+          3220,
+          3268
+        ],
+        "percentiles": {
+          "mean": 3196.1,
+          "median": 3184.0,
+          "minimum": 3180,
+          "p90": 3220,
+          "p99": 3268
+        },
+        "run": {
+          "dim": 512,
+          "loops_per_measurement": 2,
+          "num_doc_vectors": 1250,
+          "num_measurements": 20,
+          "num_query_vectors": 64,
+          "operation": "chamfer"
+        }
+      },
+      {
+        "latencies": [
+          1784,
+          1784,
+          1784,
+          1784,
+          1784,
+          1784,
+          1785,
+          1785,
+          1790,
+          1791,
+          1791,
+          1792,
+          1792,
+          1792,
+          1792,
+          1792,
+          1792,
+          1792,
+          1795,
+          1795,
+          1796,
+          1796,
+          1796,
+          1796,
+          1798,
+          1800,
+          1803,
+          1805,
+          1814,
+          1815,
+          1817,
+          1818,
+          1821,
+          1826,
+          1840,
+          1845,
+          1856,
+          1858,
+          1878,
+          1879,
+          1879,
+          1884,
+          1888,
+          1890,
+          1893,
+          1905,
+          1907,
+          1912,
+          1918,
+          1950
+        ],
+        "percentiles": {
+          "mean": 1825.26,
+          "median": 1799.0,
+          "minimum": 1784,
+          "p90": 1905,
+          "p99": 1950
+        },
+        "run": {
+          "dim": 128,
+          "loops_per_measurement": 200,
+          "num_doc_vectors": 32,
+          "num_measurements": 50,
+          "num_query_vectors": 64,
+          "operation": "chamfer"
+        }
+      },
+      {
+        "latencies": [
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1018,
+          1018,
+          1018,
+          1018,
+          1018,
+          1019,
+          1020,
+          1020,
+          1021,
+          1021,
+          1022,
+          1022,
+          1022,
+          1023,
+          1027,
+          1030,
+          1030,
+          1035,
+          1043,
+          1043,
+          1044,
+          1045,
+          1049,
+          1049,
+          1060
+        ],
+        "percentiles": {
+          "mean": 1023.2,
+          "median": 1017.5,
+          "minimum": 1017,
+          "p90": 1044,
+          "p99": 1060
+        },
+        "run": {
+          "dim": 512,
+          "loops_per_measurement": 50,
+          "num_doc_vectors": 32,
+          "num_measurements": 50,
+          "num_query_vectors": 32,
+          "operation": "chamfer"
+        }
+      },
+      {
+        "latencies": [
+          567,
+          567,
+          567,
+          567,
+          567,
+          567,
+          567,
+          567,
+          567,
+          567,
+          567,
+          567,
+          567,
+          567,
+          567,
+          569,
+          569,
+          569,
+          569,
+          569,
+          569,
+          570,
+          570,
+          570,
+          570,
+          570,
+          570,
+          570,
+          570,
+          570,
+          571,
+          571,
+          571,
+          571,
+          571,
+          571,
+          571,
+          571,
+          571,
+          571,
+          571,
+          571,
+          571,
+          571,
+          574,
+          578,
+          578,
+          594,
+          595,
+          598
+        ],
+        "percentiles": {
+          "mean": 571.2,
+          "median": 570.0,
+          "minimum": 567,
+          "p90": 578,
+          "p99": 598
+        },
+        "run": {
+          "dim": 128,
+          "loops_per_measurement": 500,
+          "num_doc_vectors": 32,
+          "num_measurements": 50,
+          "num_query_vectors": 8,
+          "operation": "max_sim"
+        }
+      },
+      {
+        "latencies": [
+          988,
+          988,
+          988,
+          988,
+          988,
+          988,
+          988,
+          988,
+          988,
+          989,
+          989,
+          989,
+          989,
+          989,
+          989,
+          989,
+          989,
+          989,
+          991,
+          991,
+          991,
+          991,
+          991,
+          991,
+          991,
+          991,
+          991,
+          991,
+          991,
+          991,
+          991,
+          991,
+          992,
+          992,
+          992,
+          992,
+          992,
+          992,
+          992,
+          992,
+          992,
+          996,
+          996,
+          1004,
+          1009,
+          1013,
+          1018,
+          1020,
+          1047,
+          1057
+        ],
+        "percentiles": {
+          "mean": 995.1,
+          "median": 991.0,
+          "minimum": 988,
+          "p90": 1013,
+          "p99": 1057
+        },
+        "run": {
+          "dim": 256,
+          "loops_per_measurement": 100,
+          "num_doc_vectors": 64,
+          "num_measurements": 50,
+          "num_query_vectors": 16,
+          "operation": "max_sim"
+        }
+      },
+      {
+        "latencies": [
+          1210,
+          1210,
+          1210,
+          1210,
+          1210,
+          1210,
+          1210,
+          1210,
+          1211,
+          1211,
+          1211,
+          1212,
+          1213,
+          1213,
+          1213,
+          1213,
+          1213,
+          1213,
+          1213,
+          1213,
+          1213,
+          1213,
+          1214,
+          1214,
+          1214,
+          1214,
+          1214,
+          1214,
+          1214,
+          1214,
+          1214,
+          1214,
+          1216,
+          1217,
+          1217,
+          1217,
+          1218,
+          1220,
+          1222,
+          1223,
+          1224,
+          1224,
+          1225,
+          1227,
+          1238,
+          1239,
+          1239,
+          1241,
+          1242,
+          1243
+        ],
+        "percentiles": {
+          "mean": 1217.74,
+          "median": 1214.0,
+          "minimum": 1210,
+          "p90": 1239,
+          "p99": 1243
+        },
+        "run": {
+          "dim": 384,
+          "loops_per_measurement": 20,
+          "num_doc_vectors": 128,
+          "num_measurements": 50,
+          "num_query_vectors": 32,
+          "operation": "max_sim"
+        }
+      },
+      {
+        "latencies": [
+          953,
+          953,
+          953,
+          953,
+          953,
+          953,
+          954,
+          954,
+          956,
+          957,
+          957,
+          957,
+          957,
+          957,
+          957,
+          957,
+          957,
+          957,
+          957,
+          957,
+          957,
+          957,
+          957,
+          957,
+          957,
+          957,
+          957,
+          957,
+          958,
+          958,
+          958,
+          958,
+          958,
+          958,
+          960,
+          961,
+          961,
+          961,
+          961,
+          961,
+          961,
+          961,
+          962,
+          963,
+          971,
+          976,
+          978,
+          984,
+          984,
+          987
+        ],
+        "percentiles": {
+          "mean": 960.1,
+          "median": 957.0,
+          "minimum": 953,
+          "p90": 976,
+          "p99": 987
+        },
+        "run": {
+          "dim": 256,
+          "loops_per_measurement": 200,
+          "num_doc_vectors": 16,
+          "num_measurements": 50,
+          "num_query_vectors": 32,
+          "operation": "max_sim"
+        }
+      },
+      {
+        "latencies": [
+          1016,
+          1016,
+          1016,
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1018,
+          1018,
+          1018,
+          1018,
+          1018,
+          1018,
+          1018,
+          1018,
+          1019,
+          1019,
+          1019,
+          1019,
+          1019,
+          1019,
+          1019,
+          1021,
+          1021,
+          1023,
+          1023,
+          1025,
+          1032,
+          1044,
+          1045,
+          1045,
+          1045,
+          1047,
+          1052,
+          1058,
+          1061
+        ],
+        "percentiles": {
+          "mean": 1023.46,
+          "median": 1018.0,
+          "minimum": 1016,
+          "p90": 1045,
+          "p99": 1061
+        },
+        "run": {
+          "dim": 264,
+          "loops_per_measurement": 50,
+          "num_doc_vectors": 32,
+          "num_measurements": 50,
+          "num_query_vectors": 64,
+          "operation": "max_sim"
+        }
+      },
+      {
+        "latencies": [
+          1858,
+          1858,
+          1860,
+          1860,
+          1860,
+          1860,
+          1860,
+          1860,
+          1860,
+          1860,
+          1860,
+          1861,
+          1861,
+          1861,
+          1861,
+          1861,
+          1861,
+          1861,
+          1862,
+          1863,
+          1863,
+          1864,
+          1865,
+          1867,
+          1868,
+          1872,
+          1873,
+          1876,
+          1878,
+          1881,
+          1882,
+          1883,
+          1888,
+          1889,
+          1889,
+          1890,
+          1890,
+          1890,
+          1891,
+          1892,
+          1905,
+          1906,
+          1908,
+          1934,
+          1962,
+          1967,
+          1974,
+          1988,
+          2004,
+          2014
+        ],
+        "percentiles": {
+          "mean": 1887.22,
+          "median": 1870.0,
+          "minimum": 1858,
+          "p90": 1967,
+          "p99": 2014
+        },
+        "run": {
+          "dim": 128,
+          "loops_per_measurement": 10,
+          "num_doc_vectors": 1250,
+          "num_measurements": 50,
+          "num_query_vectors": 32,
+          "operation": "max_sim"
+        }
+      },
+      {
+        "latencies": [
+          3177,
+          3177,
+          3177,
+          3179,
+          3192,
+          3201,
+          3212,
+          3222,
+          3251,
+          3251,
+          3255,
+          3256,
+          3256,
+          3321,
+          3381,
+          3399,
+          3400,
+          3419,
+          3422,
+          3445
+        ],
+        "percentiles": {
+          "mean": 3279.65,
+          "median": 3253.0,
+          "minimum": 3177,
+          "p90": 3422,
+          "p99": 3445
+        },
+        "run": {
+          "dim": 512,
+          "loops_per_measurement": 2,
+          "num_doc_vectors": 1250,
+          "num_measurements": 20,
+          "num_query_vectors": 64,
+          "operation": "max_sim"
+        }
+      },
+      {
+        "latencies": [
+          1783,
+          1784,
+          1787,
+          1791,
+          1791,
+          1791,
+          1813,
+          1838,
+          1853,
+          1868,
+          1871,
+          1882,
+          1882,
+          1884,
+          1890,
+          1899,
+          1899,
+          1899,
+          1900,
+          1901,
+          1905,
+          1906,
+          1908,
+          1909,
+          1911,
+          1911,
+          1911,
+          1911,
+          1914,
+          1915,
+          1915,
+          1916,
+          1916,
+          1917,
+          1919,
+          1922,
+          1922,
+          1923,
+          1923,
+          1925,
+          1927,
+          1927,
+          1928,
+          1929,
+          1929,
+          1933,
+          1937,
+          1938,
+          1940,
+          1983
+        ],
+        "percentiles": {
+          "mean": 1893.52,
+          "median": 1911.0,
+          "minimum": 1783,
+          "p90": 1933,
+          "p99": 1983
+        },
+        "run": {
+          "dim": 128,
+          "loops_per_measurement": 200,
+          "num_doc_vectors": 32,
+          "num_measurements": 50,
+          "num_query_vectors": 64,
+          "operation": "max_sim"
+        }
+      },
+      {
+        "latencies": [
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1017,
+          1020,
+          1023,
+          1023,
+          1025,
+          1028,
+          1033,
+          1033,
+          1034,
+          1037,
+          1038,
+          1040,
+          1043,
+          1044,
+          1052,
+          1052,
+          1057,
+          1060,
+          1063,
+          1078,
+          1088,
+          1088,
+          1088,
+          1088,
+          1088,
+          1088,
+          1088,
+          1088,
+          1090,
+          1090,
+          1090,
+          1092,
+          1093,
+          1093,
+          1094,
+          1094
+        ],
+        "percentiles": {
+          "mean": 1049.56,
+          "median": 1039.0,
+          "minimum": 1017,
+          "p90": 1092,
+          "p99": 1094
+        },
+        "run": {
+          "dim": 512,
+          "loops_per_measurement": 50,
+          "num_doc_vectors": 32,
+          "num_measurements": 50,
+          "num_query_vectors": 32,
+          "operation": "max_sim"
+        }
+      }
+    ]
+  },
+  {
+    "input": {
+      "content": {
+        "element_type": "float16",
+        "implementation": "optimized",
+        "runs": [
+          {
+            "dim": 256,
+            "loops_per_measurement": 100,
+            "num_doc_vectors": 64,
+            "num_measurements": 50,
+            "num_query_vectors": 16,
+            "operation": "chamfer"
+          },
+          {
+            "dim": 128,
+            "loops_per_measurement": 10,
+            "num_doc_vectors": 1250,
+            "num_measurements": 50,
+            "num_query_vectors": 32,
+            "operation": "chamfer"
+          },
+          {
+            "dim": 256,
+            "loops_per_measurement": 100,
+            "num_doc_vectors": 64,
+            "num_measurements": 50,
+            "num_query_vectors": 16,
+            "operation": "max_sim"
+          },
+          {
+            "dim": 128,
+            "loops_per_measurement": 10,
+            "num_doc_vectors": 1250,
+            "num_measurements": 50,
+            "num_query_vectors": 32,
+            "operation": "max_sim"
+          }
+        ]
+      },
+      "type": "multi-vector-op"
+    },
+    "results": [
+      {
+        "latencies": [
+          1734,
+          1734,
+          1736,
+          1736,
+          1737,
+          1737,
+          1737,
+          1738,
+          1738,
+          1738,
+          1738,
+          1739,
+          1740,
+          1740,
+          1741,
+          1744,
+          1744,
+          1751,
+          1751,
+          1753,
+          1754,
+          1754,
+          1756,
+          1759,
+          1761,
+          1764,
+          1767,
+          1767,
+          1767,
+          1768,
+          1768,
+          1769,
+          1769,
+          1773,
+          1774,
+          1775,
+          1779,
+          1787,
+          1794,
+          1808,
+          1822,
+          1825,
+          1829,
+          1829,
+          1844,
+          1846,
+          1852,
+          1859,
+          1903,
+          2194
+        ],
+        "percentiles": {
+          "mean": 1780.44,
+          "median": 1762.5,
+          "minimum": 1734,
+          "p90": 1846,
+          "p99": 2194
+        },
+        "run": {
+          "dim": 256,
+          "loops_per_measurement": 100,
+          "num_doc_vectors": 64,
+          "num_measurements": 50,
+          "num_query_vectors": 16,
+          "operation": "chamfer"
+        }
+      },
+      {
+        "latencies": [
+          2130,
+          2130,
+          2130,
+          2131,
+          2133,
+          2133,
+          2140,
+          2142,
+          2149,
+          2151,
+          2158,
+          2160,
+          2163,
+          2164,
+          2166,
+          2167,
+          2167,
+          2168,
+          2171,
+          2173,
+          2174,
+          2176,
+          2177,
+          2178,
+          2178,
+          2181,
+          2184,
+          2189,
+          2195,
+          2195,
+          2197,
+          2198,
+          2198,
+          2201,
+          2203,
+          2207,
+          2215,
+          2217,
+          2220,
+          2229,
+          2240,
+          2242,
+          2243,
+          2249,
+          2250,
+          2291,
+          2305,
+          2438,
+          2613,
+          2643
+        ],
+        "percentiles": {
+          "mean": 2209.04,
+          "median": 2179.5,
+          "minimum": 2130,
+          "p90": 2291,
+          "p99": 2643
+        },
+        "run": {
+          "dim": 128,
+          "loops_per_measurement": 10,
+          "num_doc_vectors": 1250,
+          "num_measurements": 50,
+          "num_query_vectors": 32,
+          "operation": "chamfer"
+        }
+      },
+      {
+        "latencies": [
+          1731,
+          1733,
+          1737,
+          1737,
+          1737,
+          1741,
+          1741,
+          1745,
+          1745,
+          1750,
+          1750,
+          1750,
+          1750,
+          1751,
+          1754,
+          1754,
+          1755,
+          1758,
+          1758,
+          1759,
+          1761,
+          1761,
+          1766,
+          1768,
+          1770,
+          1771,
+          1771,
+          1772,
+          1773,
+          1773,
+          1775,
+          1776,
+          1776,
+          1778,
+          1785,
+          1788,
+          1789,
+          1791,
+          1795,
+          1800,
+          1804,
+          1808,
+          1814,
+          1822,
+          1832,
+          1833,
+          1834,
+          1864,
+          1867,
+          1869
+        ],
+        "percentiles": {
+          "mean": 1776.44,
+          "median": 1770.5,
+          "minimum": 1731,
+          "p90": 1833,
+          "p99": 1869
+        },
+        "run": {
+          "dim": 256,
+          "loops_per_measurement": 100,
+          "num_doc_vectors": 64,
+          "num_measurements": 50,
+          "num_query_vectors": 16,
+          "operation": "max_sim"
+        }
+      },
+      {
+        "latencies": [
+          2127,
+          2127,
+          2129,
+          2130,
+          2132,
+          2141,
+          2142,
+          2142,
+          2147,
+          2148,
+          2149,
+          2150,
+          2154,
+          2154,
+          2159,
+          2162,
+          2166,
+          2168,
+          2170,
+          2173,
+          2177,
+          2180,
+          2180,
+          2181,
+          2181,
+          2182,
+          2183,
+          2187,
+          2196,
+          2196,
+          2199,
+          2200,
+          2204,
+          2211,
+          2213,
+          2216,
+          2224,
+          2255,
+          2256,
+          2271,
+          2354,
+          2488,
+          2493,
+          2495,
+          2498,
+          2505,
+          2525,
+          2653,
+          2657,
+          3515
+        ],
+        "percentiles": {
+          "mean": 2264.9,
+          "median": 2181.5,
+          "minimum": 2127,
+          "p90": 2505,
+          "p99": 3515
+        },
+        "run": {
+          "dim": 128,
+          "loops_per_measurement": 10,
+          "num_doc_vectors": 1250,
+          "num_measurements": 50,
+          "num_query_vectors": 32,
+          "operation": "max_sim"
+        }
+      }
+    ]
+  },
+  {
+    "input": {
+      "content": {
+        "element_type": "float32",
+        "implementation": "reference",
+        "runs": [
+          {
+            "dim": 128,
+            "loops_per_measurement": 50,
+            "num_doc_vectors": 32,
+            "num_measurements": 50,
+            "num_query_vectors": 8,
+            "operation": "chamfer"
+          },
+          {
+            "dim": 384,
+            "loops_per_measurement": 2,
+            "num_doc_vectors": 128,
+            "num_measurements": 50,
+            "num_query_vectors": 32,
+            "operation": "chamfer"
+          },
+          {
+            "dim": 128,
+            "loops_per_measurement": 50,
+            "num_doc_vectors": 32,
+            "num_measurements": 50,
+            "num_query_vectors": 8,
+            "operation": "max_sim"
+          },
+          {
+            "dim": 384,
+            "loops_per_measurement": 2,
+            "num_doc_vectors": 128,
+            "num_measurements": 50,
+            "num_query_vectors": 32,
+            "operation": "max_sim"
+          }
+        ]
+      },
+      "type": "multi-vector-op"
+    },
+    "results": [
+      {
+        "latencies": [
+          64,
+          64,
+          64,
+          66,
+          66,
+          66,
+          66,
+          66,
+          66,
+          66,
+          66,
+          66,
+          66,
+          66,
+          66,
+          66,
+          66,
+          66,
+          66,
+          66,
+          66,
+          66,
+          66,
+          66,
+          66,
+          66,
+          66,
+          66,
+          66,
+          66,
+          66,
+          66,
+          66,
+          66,
+          66,
+          66,
+          67,
+          67,
+          67,
+          67,
+          67,
+          67,
+          67,
+          67,
+          67,
+          68,
+          68,
+          69,
+          71,
+          127
+        ],
+        "percentiles": {
+          "mean": 67.52,
+          "median": 66.0,
+          "minimum": 64,
+          "p90": 68,
+          "p99": 127
+        },
+        "run": {
+          "dim": 128,
+          "loops_per_measurement": 50,
+          "num_doc_vectors": 32,
+          "num_measurements": 50,
+          "num_query_vectors": 8,
+          "operation": "chamfer"
+        }
+      },
+      {
+        "latencies": [
+          130,
+          130,
+          130,
+          130,
+          130,
+          130,
+          130,
+          130,
+          130,
+          130,
+          131,
+          131,
+          131,
+          131,
+          132,
+          132,
+          133,
+          133,
+          135,
+          136,
+          136,
+          137,
+          138,
+          138,
+          138,
+          138,
+          138,
+          138,
+          138,
+          138,
+          138,
+          138,
+          138,
+          138,
+          138,
+          138,
+          138,
+          138,
+          139,
+          139,
+          139,
+          139,
+          139,
+          140,
+          140,
+          140,
+          141,
+          143,
+          147,
+          161
+        ],
+        "percentiles": {
+          "mean": 136.26,
+          "median": 138.0,
+          "minimum": 130,
+          "p90": 140,
+          "p99": 161
+        },
+        "run": {
+          "dim": 384,
+          "loops_per_measurement": 2,
+          "num_doc_vectors": 128,
+          "num_measurements": 50,
+          "num_query_vectors": 32,
+          "operation": "chamfer"
+        }
+      },
+      {
+        "latencies": [
+          62,
+          62,
+          62,
+          62,
+          62,
+          63,
+          63,
+          63,
+          63,
+          63,
+          63,
+          63,
+          63,
+          63,
+          63,
+          64,
+          64,
+          65,
+          66,
+          66,
+          66,
+          66,
+          66,
+          66,
+          66,
+          66,
+          66,
+          66,
+          66,
+          66,
+          66,
+          66,
+          66,
+          66,
+          66,
+          66,
+          67,
+          67,
+          67,
+          67,
+          67,
+          67,
+          67,
+          68,
+          68,
+          69,
+          71,
+          72,
+          78,
+          106
+        ],
+        "percentiles": {
+          "mean": 66.44,
+          "median": 66.0,
+          "minimum": 62,
+          "p90": 69,
+          "p99": 106
+        },
+        "run": {
+          "dim": 128,
+          "loops_per_measurement": 50,
+          "num_doc_vectors": 32,
+          "num_measurements": 50,
+          "num_query_vectors": 8,
+          "operation": "max_sim"
+        }
+      },
+      {
+        "latencies": [
+          130,
+          131,
+          131,
+          131,
+          131,
+          131,
+          131,
+          131,
+          131,
+          131,
+          131,
+          131,
+          131,
+          131,
+          131,
+          131,
+          131,
+          131,
+          131,
+          131,
+          131,
+          131,
+          131,
+          132,
+          132,
+          132,
+          132,
+          132,
+          133,
+          133,
+          134,
+          134,
+          135,
+          135,
+          135,
+          136,
+          136,
+          137,
+          139,
+          139,
+          140,
+          142,
+          142,
+          143,
+          144,
+          145,
+          145,
+          147,
+          155,
+          158
+        ],
+        "percentiles": {
+          "mean": 135.18,
+          "median": 132.0,
+          "minimum": 130,
+          "p90": 145,
+          "p99": 158
+        },
+        "run": {
+          "dim": 384,
+          "loops_per_measurement": 2,
+          "num_doc_vectors": 128,
+          "num_measurements": 50,
+          "num_query_vectors": 32,
+          "operation": "max_sim"
+        }
+      }
+    ]
+  },
+  {
+    "input": {
+      "content": {
+        "element_type": "float16",
+        "implementation": "reference",
+        "runs": [
+          {
+            "dim": 128,
+            "loops_per_measurement": 50,
+            "num_doc_vectors": 32,
+            "num_measurements": 50,
+            "num_query_vectors": 8,
+            "operation": "chamfer"
+          },
+          {
+            "dim": 384,
+            "loops_per_measurement": 2,
+            "num_doc_vectors": 128,
+            "num_measurements": 50,
+            "num_query_vectors": 32,
+            "operation": "max_sim"
+          }
+        ]
+      },
+      "type": "multi-vector-op"
+    },
+    "results": [
+      {
+        "latencies": [
+          73,
+          73,
+          73,
+          73,
+          73,
+          73,
+          73,
+          73,
+          73,
+          73,
+          73,
+          73,
+          73,
+          73,
+          73,
+          73,
+          73,
+          74,
+          74,
+          74,
+          74,
+          74,
+          74,
+          75,
+          75,
+          76,
+          76,
+          76,
+          76,
+          76,
+          76,
+          77,
+          77,
+          77,
+          77,
+          77,
+          77,
+          77,
+          77,
+          77,
+          78,
+          78,
+          78,
+          79,
+          80,
+          80,
+          80,
+          84,
+          87,
+          92
+        ],
+        "percentiles": {
+          "mean": 76.0,
+          "median": 75.5,
+          "minimum": 73,
+          "p90": 80,
+          "p99": 92
+        },
+        "run": {
+          "dim": 128,
+          "loops_per_measurement": 50,
+          "num_doc_vectors": 32,
+          "num_measurements": 50,
+          "num_query_vectors": 8,
+          "operation": "chamfer"
+        }
+      },
+      {
+        "latencies": [
+          135,
+          135,
+          135,
+          135,
+          135,
+          135,
+          135,
+          135,
+          136,
+          136,
+          137,
+          138,
+          140,
+          141,
+          141,
+          141,
+          141,
+          141,
+          142,
+          142,
+          142,
+          142,
+          142,
+          142,
+          142,
+          142,
+          142,
+          142,
+          142,
+          142,
+          142,
+          142,
+          142,
+          142,
+          143,
+          143,
+          143,
+          144,
+          144,
+          145,
+          145,
+          145,
+          147,
+          150,
+          151,
+          151,
+          153,
+          154,
+          158,
+          158
+        ],
+        "percentiles": {
+          "mean": 142.36,
+          "median": 142.0,
+          "minimum": 135,
+          "p90": 151,
+          "p99": 158
+        },
+        "run": {
+          "dim": 384,
+          "loops_per_measurement": 2,
+          "num_doc_vectors": 128,
+          "num_measurements": 50,
+          "num_query_vectors": 32,
+          "operation": "max_sim"
+        }
+      }
+    ]
+  }
+]
\ No newline at end of file

From 54a21ec8f274006c433fcddf111cd2580aa184e1 Mon Sep 17 00:00:00 2001
From: Suryansh Gupta <suryangupta@microsoft.com>
Date: Thu, 7 May 2026 02:38:23 +0530
Subject: [PATCH 02/13] Move some repetitive code to macros and add more
 benchmark cases

---
 .../examples/multi-vector.json                |  65 ++++-
 diskann-benchmark-multi-vector/src/lib.rs     | 228 +++++++-----------
 2 files changed, 141 insertions(+), 152 deletions(-)

diff --git a/diskann-benchmark-multi-vector/examples/multi-vector.json b/diskann-benchmark-multi-vector/examples/multi-vector.json
index 2626e5047..553a6a9d8 100644
--- a/diskann-benchmark-multi-vector/examples/multi-vector.json
+++ b/diskann-benchmark-multi-vector/examples/multi-vector.json
@@ -13,7 +13,7 @@
           { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 16,   "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 },
           { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 264, "loops_per_measurement": 50,  "num_measurements": 50 },
           { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 },
-          { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2,   "num_measurements": 20 },
+          { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2,   "num_measurements": 50 },
           { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 },
           { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 32,   "dim": 512, "loops_per_measurement": 50,  "num_measurements": 50 },
 
@@ -23,7 +23,7 @@
           { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 16,   "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 },
           { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 264, "loops_per_measurement": 50,  "num_measurements": 50 },
           { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 },
-          { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2,   "num_measurements": 20 },
+          { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2,   "num_measurements": 50 },
           { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 },
           { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 32,   "dim": 512, "loops_per_measurement": 50,  "num_measurements": 50 }
         ]
@@ -35,10 +35,25 @@
         "element_type": "float16",
         "implementation": "optimized",
         "runs": [
+          { "operation": "chamfer", "num_query_vectors": 8,  "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 },
           { "operation": "chamfer", "num_query_vectors": 16, "num_doc_vectors": 64,   "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 },
+          { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 128,  "dim": 384, "loops_per_measurement": 20,  "num_measurements": 50 },
+          { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 16,   "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 264, "loops_per_measurement": 50,  "num_measurements": 50 },
           { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 },
+          { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2,   "num_measurements": 50 },
+          { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 32,   "dim": 512, "loops_per_measurement": 50,  "num_measurements": 50 },
+
+          { "operation": "max_sim", "num_query_vectors": 8,  "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 },
           { "operation": "max_sim", "num_query_vectors": 16, "num_doc_vectors": 64,   "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 },
-          { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 }
+          { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 128,  "dim": 384, "loops_per_measurement": 20,  "num_measurements": 50 },
+          { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 16,   "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 264, "loops_per_measurement": 50,  "num_measurements": 50 },
+          { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 },
+          { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2,   "num_measurements": 50 },
+          { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 32,   "dim": 512, "loops_per_measurement": 50,  "num_measurements": 50 }
         ]
       }
     },
@@ -48,10 +63,25 @@
         "element_type": "float32",
         "implementation": "reference",
         "runs": [
-          { "operation": "chamfer", "num_query_vectors": 8,  "num_doc_vectors": 32,  "dim": 128, "loops_per_measurement": 50, "num_measurements": 50 },
-          { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 2,  "num_measurements": 50 },
-          { "operation": "max_sim", "num_query_vectors": 8,  "num_doc_vectors": 32,  "dim": 128, "loops_per_measurement": 50, "num_measurements": 50 },
-          { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 2,  "num_measurements": 50 }
+          { "operation": "chamfer", "num_query_vectors": 8,  "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 },
+          { "operation": "chamfer", "num_query_vectors": 16, "num_doc_vectors": 64,   "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 },
+          { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 128,  "dim": 384, "loops_per_measurement": 20,  "num_measurements": 50 },
+          { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 16,   "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 264, "loops_per_measurement": 50,  "num_measurements": 50 },
+          { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 },
+          { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2,   "num_measurements": 50 },
+          { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 32,   "dim": 512, "loops_per_measurement": 50,  "num_measurements": 50 },
+
+          { "operation": "max_sim", "num_query_vectors": 8,  "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 },
+          { "operation": "max_sim", "num_query_vectors": 16, "num_doc_vectors": 64,   "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 },
+          { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 128,  "dim": 384, "loops_per_measurement": 20,  "num_measurements": 50 },
+          { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 16,   "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 264, "loops_per_measurement": 50,  "num_measurements": 50 },
+          { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 },
+          { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2,   "num_measurements": 50 },
+          { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 32,   "dim": 512, "loops_per_measurement": 50,  "num_measurements": 50 }
         ]
       }
     },
@@ -61,8 +91,25 @@
         "element_type": "float16",
         "implementation": "reference",
         "runs": [
-          { "operation": "chamfer", "num_query_vectors": 8,  "num_doc_vectors": 32,  "dim": 128, "loops_per_measurement": 50, "num_measurements": 50 },
-          { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 2,  "num_measurements": 50 }
+          { "operation": "chamfer", "num_query_vectors": 8,  "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 },
+          { "operation": "chamfer", "num_query_vectors": 16, "num_doc_vectors": 64,   "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 },
+          { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 128,  "dim": 384, "loops_per_measurement": 20,  "num_measurements": 50 },
+          { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 16,   "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 264, "loops_per_measurement": 50,  "num_measurements": 50 },
+          { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 },
+          { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2,   "num_measurements": 50 },
+          { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 32,   "dim": 512, "loops_per_measurement": 50,  "num_measurements": 50 },
+
+          { "operation": "max_sim", "num_query_vectors": 8,  "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 },
+          { "operation": "max_sim", "num_query_vectors": 16, "num_doc_vectors": 64,   "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 },
+          { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 128,  "dim": 384, "loops_per_measurement": 20,  "num_measurements": 50 },
+          { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 16,   "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 264, "loops_per_measurement": 50,  "num_measurements": 50 },
+          { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 },
+          { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2,   "num_measurements": 50 },
+          { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 32,   "dim": 512, "loops_per_measurement": 50,  "num_measurements": 50 }
         ]
       }
     }
diff --git a/diskann-benchmark-multi-vector/src/lib.rs b/diskann-benchmark-multi-vector/src/lib.rs
index 7cadf4f29..ea6a09715 100644
--- a/diskann-benchmark-multi-vector/src/lib.rs
+++ b/diskann-benchmark-multi-vector/src/lib.rs
@@ -293,25 +293,19 @@ impl std::fmt::Display for CheckResult {
 ////////////////////////////
 
 fn register_benchmarks_impl(dispatcher: &mut diskann_benchmark_runner::registry::Benchmarks) {
+    macro_rules! register {
+        ($impl:ident, $t:ty, $tag:literal) => {
+            dispatcher.register_regression($tag, Kernel::<$impl, $t>::new());
+        };
+    }
+
     // Optimized (architecture-dispatched QueryComputer).
-    dispatcher.register_regression(
-        "multi-vector-op-f32-optimized",
-        Kernel::<Optimized, f32>::new(),
-    );
-    dispatcher.register_regression(
-        "multi-vector-op-f16-optimized",
-        Kernel::<Optimized, f16>::new(),
-    );
+    register!(Optimized, f32, "multi-vector-op-f32-optimized");
+    register!(Optimized, f16, "multi-vector-op-f16-optimized");
 
     // Reference (Chamfer / MaxSim fallback path).
-    dispatcher.register_regression(
-        "multi-vector-op-f32-reference",
-        Kernel::<Reference, f32>::new(),
-    );
-    dispatcher.register_regression(
-        "multi-vector-op-f16-reference",
-        Kernel::<Reference, f16>::new(),
-    );
+    register!(Reference, f32, "multi-vector-op-f32-reference");
+    register!(Reference, f16, "multi-vector-op-f16-reference");
 }
 
 //////////////
@@ -340,81 +334,52 @@ impl<I, T> Kernel<I, T> {
 }
 
 #[derive(Debug, Error)]
-#[error("implementation {0} is not registered for this benchmark")]
+#[error("this kernel handles a different implementation than {0}")]
 pub(crate) struct ImplementationMismatch(Implementation);
 
-impl DispatchRule<Implementation> for Optimized {
-    type Error = ImplementationMismatch;
-
-    fn try_match(from: &Implementation) -> Result<MatchScore, FailureScore> {
-        if *from == Implementation::Optimized {
-            Ok(MatchScore(0))
-        } else {
-            Err(FailureScore(1))
-        }
-    }
-
-    fn convert(from: Implementation) -> Result<Self, Self::Error> {
-        if from == Implementation::Optimized {
-            Ok(Optimized)
-        } else {
-            Err(ImplementationMismatch(from))
-        }
-    }
+macro_rules! impl_dispatch_rule {
+    ($marker:ident, $variant:ident, $description:literal) => {
+        impl DispatchRule<Implementation> for $marker {
+            type Error = ImplementationMismatch;
 
-    fn description(
-        f: &mut std::fmt::Formatter<'_>,
-        from: Option<&Implementation>,
-    ) -> std::fmt::Result {
-        match from {
-            None => write!(f, "QueryComputer (architecture-dispatched)"),
-            Some(impl_) => {
-                if Self::try_match(impl_).is_ok() {
-                    write!(f, "matched {}", impl_)
+            fn try_match(from: &Implementation) -> Result<MatchScore, FailureScore> {
+                if *from == Implementation::$variant {
+                    Ok(MatchScore(0))
                 } else {
-                    write!(f, "expected {}, got {}", Implementation::Optimized, impl_)
+                    Err(FailureScore(1))
                 }
             }
-        }
-    }
-}
-
-impl DispatchRule<Implementation> for Reference {
-    type Error = ImplementationMismatch;
 
-    fn try_match(from: &Implementation) -> Result<MatchScore, FailureScore> {
-        if *from == Implementation::Reference {
-            Ok(MatchScore(0))
-        } else {
-            Err(FailureScore(1))
-        }
-    }
-
-    fn convert(from: Implementation) -> Result<Self, Self::Error> {
-        if from == Implementation::Reference {
-            Ok(Reference)
-        } else {
-            Err(ImplementationMismatch(from))
-        }
-    }
-
-    fn description(
-        f: &mut std::fmt::Formatter<'_>,
-        from: Option<&Implementation>,
-    ) -> std::fmt::Result {
-        match from {
-            None => write!(f, "Chamfer / MaxSim fallback"),
-            Some(impl_) => {
-                if Self::try_match(impl_).is_ok() {
-                    write!(f, "matched {}", impl_)
+            fn convert(from: Implementation) -> Result<Self, Self::Error> {
+                if from == Implementation::$variant {
+                    Ok($marker)
                 } else {
-                    write!(f, "expected {}, got {}", Implementation::Reference, impl_)
+                    Err(ImplementationMismatch(from))
+                }
+            }
+
+            fn description(
+                f: &mut std::fmt::Formatter<'_>,
+                from: Option<&Implementation>,
+            ) -> std::fmt::Result {
+                match from {
+                    None => write!(f, $description),
+                    Some(impl_) => {
+                        if Self::try_match(impl_).is_ok() {
+                            write!(f, "matched {}", impl_)
+                        } else {
+                            write!(f, "expected {}, got {}", Implementation::$variant, impl_)
+                        }
+                    }
                 }
             }
         }
-    }
+    };
 }
 
+impl_dispatch_rule!(Optimized, Optimized, "QueryComputer (architecture-dispatched)");
+impl_dispatch_rule!(Reference, Reference, "Chamfer / MaxSim fallback");
+
 impl<I, T> Benchmark for Kernel<I, T>
 where
     datatype::Type<T>: DispatchRule<datatype::DataType>,
@@ -446,7 +411,9 @@ where
         _: diskann_benchmark_runner::Checkpoint<'_>,
         mut output: &mut dyn diskann_benchmark_runner::Output,
     ) -> anyhow::Result<Self::Output> {
-        let _ = I::convert(input.implementation)?;
+        // The dispatcher only invokes `run` after `try_match` has already accepted
+        // the input, so a failure here would indicate a dispatcher bug.
+        I::convert(input.implementation).expect("try_match accepted the input");
         writeln!(output, "{}", input)?;
         let results = self.run_benchmark(input)?;
         writeln!(output, "\n\n{}", DisplayWrapper(&*results))?;
@@ -717,6 +684,9 @@ where
     let mut results = Vec::with_capacity(input.runs.len());
     for run in input.runs.iter() {
         let data = Data::<T>::new(run);
+        // `QueryComputer` performs query-side precomputation that is intentionally
+        // amortized across many `chamfer` / `max_sim` calls; construct it once per
+        // shape, outside the timed loop.
         let computer = <QueryComputer<T> as NewFromMatRef<T>>::new_from(data.query(run));
         let doc = data.doc(run);
 
@@ -748,20 +718,23 @@ where
     let mut results = Vec::with_capacity(input.runs.len());
     for run in input.runs.iter() {
         let data = Data::<T>::new(run);
-        let query = data.query(run);
         let doc = data.doc(run);
+        // Hoist out of the timed loop to mirror the optimized path's
+        // per-shape precomputation.
+        let query: diskann_quantization::multi_vector::distance::QueryMatRef<'_, _> =
+            data.query(run).into();
 
         let result = match run.operation {
             Operation::Chamfer => run_loops(run, || {
-                let v = Chamfer::evaluate(query.into(), doc);
+                let v = Chamfer::evaluate(query, doc);
                 std::hint::black_box(v);
             }),
             Operation::MaxSim => {
                 let mut scores = vec![0.0f32; run.num_query_vectors.get()];
+                let mut max_sim = MaxSim::new(&mut scores).unwrap();
                 run_loops(run, || {
-                    let mut max_sim = MaxSim::new(&mut scores).unwrap();
-                    let _ = max_sim.evaluate(query.into(), doc);
-                    std::hint::black_box(&mut scores);
+                    let _ = max_sim.evaluate(query, doc);
+                    std::hint::black_box(max_sim.scores_mut());
                 })
             }
         };
@@ -770,47 +743,42 @@ where
     Ok(results)
 }
 
-impl RunBenchmark<Optimized> for Kernel<Optimized, f32> {
-    fn run_benchmark(&self, input: &MultiVectorOp) -> Result<Vec<RunResult>, anyhow::Error> {
-        run_optimized::<f32>(input)
-    }
-}
-
-impl RunBenchmark<Optimized> for Kernel<Optimized, f16> {
-    fn run_benchmark(&self, input: &MultiVectorOp) -> Result<Vec<RunResult>, anyhow::Error> {
-        run_optimized::<f16>(input)
-    }
-}
-
-impl RunBenchmark<Reference> for Kernel<Reference, f32> {
-    fn run_benchmark(&self, input: &MultiVectorOp) -> Result<Vec<RunResult>, anyhow::Error> {
-        run_reference::<f32>(input)
-    }
-}
-
-impl RunBenchmark<Reference> for Kernel<Reference, f16> {
-    fn run_benchmark(&self, input: &MultiVectorOp) -> Result<Vec<RunResult>, anyhow::Error> {
-        run_reference::<f16>(input)
-    }
-}
-
 /// Element-type-erasing constructor for [`QueryComputer`].
 trait NewFromMatRef<T: Copy> {
     fn new_from(query: MatRef<'_, Standard<T>>) -> QueryComputer<T>;
 }
 
-impl NewFromMatRef<f32> for QueryComputer<f32> {
-    fn new_from(query: MatRef<'_, Standard<f32>>) -> QueryComputer<f32> {
-        QueryComputer::<f32>::new(query)
-    }
-}
+macro_rules! impl_kernel_for {
+    ($t:ty) => {
+        impl NewFromMatRef<$t> for QueryComputer<$t> {
+            fn new_from(query: MatRef<'_, Standard<$t>>) -> QueryComputer<$t> {
+                QueryComputer::<$t>::new(query)
+            }
+        }
 
-impl NewFromMatRef<f16> for QueryComputer<f16> {
-    fn new_from(query: MatRef<'_, Standard<f16>>) -> QueryComputer<f16> {
-        QueryComputer::<f16>::new(query)
-    }
+        impl RunBenchmark<Optimized> for Kernel<Optimized, $t> {
+            fn run_benchmark(
+                &self,
+                input: &MultiVectorOp,
+            ) -> Result<Vec<RunResult>, anyhow::Error> {
+                run_optimized::<$t>(input)
+            }
+        }
+
+        impl RunBenchmark<Reference> for Kernel<Reference, $t> {
+            fn run_benchmark(
+                &self,
+                input: &MultiVectorOp,
+            ) -> Result<Vec<RunResult>, anyhow::Error> {
+                run_reference::<$t>(input)
+            }
+        }
+    };
 }
 
+impl_kernel_for!(f32);
+impl_kernel_for!(f16);
+
 ///////////
 // Tests //
 ///////////
@@ -963,30 +931,4 @@ mod tests {
 
         assert!(matches!(result, PassFail::Fail(_)));
     }
-
-    /// Sanity-check that the optimized kernel and the reference path produce
-    /// numerically equivalent Chamfer scores on a small fixture.
-    #[test]
-    fn optimized_chamfer_matches_reference_f32() {
-        let run = Run {
-            operation: Operation::Chamfer,
-            num_query_vectors: NonZeroUsize::new(5).unwrap(),
-            num_doc_vectors: NonZeroUsize::new(7).unwrap(),
-            dim: NonZeroUsize::new(16).unwrap(),
-            loops_per_measurement: NonZeroUsize::new(1).unwrap(),
-            num_measurements: NonZeroUsize::new(1).unwrap(),
-        };
-
-        let data = Data::<f32>::new(&run);
-        let query = data.query(&run);
-        let doc = data.doc(&run);
-
-        let optimized = QueryComputer::<f32>::new(query).chamfer(doc);
-        let reference = Chamfer::evaluate(query.into(), doc);
-
-        assert!(
-            (optimized - reference).abs() < 1e-4,
-            "optimized={optimized}, reference={reference}",
-        );
-    }
 }

From f3a5d9fb33cc2dbb0864c88f8bd90bbb65e26dca Mon Sep 17 00:00:00 2001
From: Suryansh Gupta <suryangupta@microsoft.com>
Date: Thu, 7 May 2026 02:38:43 +0530
Subject: [PATCH 03/13] Move some repetitive code to macros and add more
 benchmark cases

---
 diskann-benchmark-multi-vector/src/lib.rs | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/diskann-benchmark-multi-vector/src/lib.rs b/diskann-benchmark-multi-vector/src/lib.rs
index ea6a09715..df08d93dd 100644
--- a/diskann-benchmark-multi-vector/src/lib.rs
+++ b/diskann-benchmark-multi-vector/src/lib.rs
@@ -377,7 +377,11 @@ macro_rules! impl_dispatch_rule {
     };
 }
 
-impl_dispatch_rule!(Optimized, Optimized, "QueryComputer (architecture-dispatched)");
+impl_dispatch_rule!(
+    Optimized,
+    Optimized,
+    "QueryComputer (architecture-dispatched)"
+);
 impl_dispatch_rule!(Reference, Reference, "Chamfer / MaxSim fallback");
 
 impl<I, T> Benchmark for Kernel<I, T>

From 8efdbcd5e79bf48068dfc8e1b4d6a6cdfadd35f0 Mon Sep 17 00:00:00 2001
From: Suryansh Gupta <suryangupta@microsoft.com>
Date: Thu, 7 May 2026 02:39:33 +0530
Subject: [PATCH 04/13] Move some repetitive code to macros and add more
 benchmark cases

---
 results.json | 2150 --------------------------------------------------
 1 file changed, 2150 deletions(-)
 delete mode 100644 results.json

diff --git a/results.json b/results.json
deleted file mode 100644
index f061f6750..000000000
--- a/results.json
+++ /dev/null
@@ -1,2150 +0,0 @@
-[
-  {
-    "input": {
-      "content": {
-        "element_type": "float32",
-        "implementation": "optimized",
-        "runs": [
-          {
-            "dim": 128,
-            "loops_per_measurement": 500,
-            "num_doc_vectors": 32,
-            "num_measurements": 50,
-            "num_query_vectors": 8,
-            "operation": "chamfer"
-          },
-          {
-            "dim": 256,
-            "loops_per_measurement": 100,
-            "num_doc_vectors": 64,
-            "num_measurements": 50,
-            "num_query_vectors": 16,
-            "operation": "chamfer"
-          },
-          {
-            "dim": 384,
-            "loops_per_measurement": 20,
-            "num_doc_vectors": 128,
-            "num_measurements": 50,
-            "num_query_vectors": 32,
-            "operation": "chamfer"
-          },
-          {
-            "dim": 256,
-            "loops_per_measurement": 200,
-            "num_doc_vectors": 16,
-            "num_measurements": 50,
-            "num_query_vectors": 32,
-            "operation": "chamfer"
-          },
-          {
-            "dim": 264,
-            "loops_per_measurement": 50,
-            "num_doc_vectors": 32,
-            "num_measurements": 50,
-            "num_query_vectors": 64,
-            "operation": "chamfer"
-          },
-          {
-            "dim": 128,
-            "loops_per_measurement": 10,
-            "num_doc_vectors": 1250,
-            "num_measurements": 50,
-            "num_query_vectors": 32,
-            "operation": "chamfer"
-          },
-          {
-            "dim": 512,
-            "loops_per_measurement": 2,
-            "num_doc_vectors": 1250,
-            "num_measurements": 20,
-            "num_query_vectors": 64,
-            "operation": "chamfer"
-          },
-          {
-            "dim": 128,
-            "loops_per_measurement": 200,
-            "num_doc_vectors": 32,
-            "num_measurements": 50,
-            "num_query_vectors": 64,
-            "operation": "chamfer"
-          },
-          {
-            "dim": 512,
-            "loops_per_measurement": 50,
-            "num_doc_vectors": 32,
-            "num_measurements": 50,
-            "num_query_vectors": 32,
-            "operation": "chamfer"
-          },
-          {
-            "dim": 128,
-            "loops_per_measurement": 500,
-            "num_doc_vectors": 32,
-            "num_measurements": 50,
-            "num_query_vectors": 8,
-            "operation": "max_sim"
-          },
-          {
-            "dim": 256,
-            "loops_per_measurement": 100,
-            "num_doc_vectors": 64,
-            "num_measurements": 50,
-            "num_query_vectors": 16,
-            "operation": "max_sim"
-          },
-          {
-            "dim": 384,
-            "loops_per_measurement": 20,
-            "num_doc_vectors": 128,
-            "num_measurements": 50,
-            "num_query_vectors": 32,
-            "operation": "max_sim"
-          },
-          {
-            "dim": 256,
-            "loops_per_measurement": 200,
-            "num_doc_vectors": 16,
-            "num_measurements": 50,
-            "num_query_vectors": 32,
-            "operation": "max_sim"
-          },
-          {
-            "dim": 264,
-            "loops_per_measurement": 50,
-            "num_doc_vectors": 32,
-            "num_measurements": 50,
-            "num_query_vectors": 64,
-            "operation": "max_sim"
-          },
-          {
-            "dim": 128,
-            "loops_per_measurement": 10,
-            "num_doc_vectors": 1250,
-            "num_measurements": 50,
-            "num_query_vectors": 32,
-            "operation": "max_sim"
-          },
-          {
-            "dim": 512,
-            "loops_per_measurement": 2,
-            "num_doc_vectors": 1250,
-            "num_measurements": 20,
-            "num_query_vectors": 64,
-            "operation": "max_sim"
-          },
-          {
-            "dim": 128,
-            "loops_per_measurement": 200,
-            "num_doc_vectors": 32,
-            "num_measurements": 50,
-            "num_query_vectors": 64,
-            "operation": "max_sim"
-          },
-          {
-            "dim": 512,
-            "loops_per_measurement": 50,
-            "num_doc_vectors": 32,
-            "num_measurements": 50,
-            "num_query_vectors": 32,
-            "operation": "max_sim"
-          }
-        ]
-      },
-      "type": "multi-vector-op"
-    },
-    "results": [
-      {
-        "latencies": [
-          777,
-          777,
-          778,
-          780,
-          780,
-          781,
-          804,
-          838,
-          838,
-          838,
-          838,
-          839,
-          839,
-          839,
-          840,
-          842,
-          845,
-          850,
-          899,
-          926,
-          927,
-          931,
-          932,
-          937,
-          939,
-          956,
-          978,
-          1034,
-          1035,
-          1036,
-          1053,
-          1064,
-          1065,
-          1147,
-          1164,
-          1165,
-          1165,
-          1166,
-          1173,
-          1221,
-          1323,
-          1333,
-          1350,
-          1352,
-          1353,
-          1353,
-          1357,
-          1393,
-          1529,
-          1537
-        ],
-        "percentiles": {
-          "mean": 1030.32,
-          "median": 947.5,
-          "minimum": 777,
-          "p90": 1353,
-          "p99": 1537
-        },
-        "run": {
-          "dim": 128,
-          "loops_per_measurement": 500,
-          "num_doc_vectors": 32,
-          "num_measurements": 50,
-          "num_query_vectors": 8,
-          "operation": "chamfer"
-        }
-      },
-      {
-        "latencies": [
-          1029,
-          1029,
-          1030,
-          1030,
-          1030,
-          1030,
-          1030,
-          1031,
-          1032,
-          1034,
-          1035,
-          1038,
-          1050,
-          1058,
-          1070,
-          1112,
-          1112,
-          1112,
-          1112,
-          1112,
-          1112,
-          1112,
-          1113,
-          1117,
-          1119,
-          1120,
-          1123,
-          1145,
-          1146,
-          1146,
-          1146,
-          1148,
-          1152,
-          1167,
-          1192,
-          1192,
-          1192,
-          1192,
-          1193,
-          1207,
-          1235,
-          1251,
-          1254,
-          1256,
-          1257,
-          1261,
-          1293,
-          1330,
-          1330,
-          1344
-        ],
-        "percentiles": {
-          "mean": 1139.22,
-          "median": 1119.5,
-          "minimum": 1029,
-          "p90": 1261,
-          "p99": 1344
-        },
-        "run": {
-          "dim": 256,
-          "loops_per_measurement": 100,
-          "num_doc_vectors": 64,
-          "num_measurements": 50,
-          "num_query_vectors": 16,
-          "operation": "chamfer"
-        }
-      },
-      {
-        "latencies": [
-          1210,
-          1210,
-          1210,
-          1210,
-          1210,
-          1210,
-          1210,
-          1210,
-          1210,
-          1211,
-          1212,
-          1212,
-          1212,
-          1212,
-          1213,
-          1213,
-          1213,
-          1213,
-          1213,
-          1214,
-          1217,
-          1217,
-          1220,
-          1223,
-          1225,
-          1226,
-          1227,
-          1229,
-          1231,
-          1235,
-          1235,
-          1239,
-          1239,
-          1240,
-          1244,
-          1249,
-          1252,
-          1259,
-          1264,
-          1270,
-          1281,
-          1294,
-          1299,
-          1306,
-          1312,
-          1315,
-          1332,
-          1341,
-          1383,
-          1484
-        ],
-        "percentiles": {
-          "mean": 1246.32,
-          "median": 1225.5,
-          "minimum": 1210,
-          "p90": 1315,
-          "p99": 1484
-        },
-        "run": {
-          "dim": 384,
-          "loops_per_measurement": 20,
-          "num_doc_vectors": 128,
-          "num_measurements": 50,
-          "num_query_vectors": 32,
-          "operation": "chamfer"
-        }
-      },
-      {
-        "latencies": [
-          958,
-          958,
-          958,
-          958,
-          958,
-          960,
-          960,
-          960,
-          961,
-          961,
-          961,
-          961,
-          961,
-          961,
-          961,
-          961,
-          961,
-          961,
-          961,
-          961,
-          961,
-          961,
-          961,
-          961,
-          961,
-          961,
-          961,
-          962,
-          962,
-          963,
-          964,
-          964,
-          965,
-          965,
-          965,
-          966,
-          966,
-          973,
-          974,
-          974,
-          981,
-          981,
-          983,
-          985,
-          987,
-          987,
-          987,
-          990,
-          999,
-          999
-        ],
-        "percentiles": {
-          "mean": 967.42,
-          "median": 961.0,
-          "minimum": 958,
-          "p90": 987,
-          "p99": 999
-        },
-        "run": {
-          "dim": 256,
-          "loops_per_measurement": 200,
-          "num_doc_vectors": 16,
-          "num_measurements": 50,
-          "num_query_vectors": 32,
-          "operation": "chamfer"
-        }
-      },
-      {
-        "latencies": [
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1018,
-          1018,
-          1018,
-          1018,
-          1018,
-          1018,
-          1018,
-          1018,
-          1018,
-          1019,
-          1019,
-          1019,
-          1019,
-          1020,
-          1020,
-          1020,
-          1020,
-          1020,
-          1020,
-          1021,
-          1022,
-          1023,
-          1023,
-          1026,
-          1029,
-          1031,
-          1032,
-          1033,
-          1034,
-          1035,
-          1036,
-          1037,
-          1041,
-          1044,
-          1044,
-          1045,
-          1046,
-          1065
-        ],
-        "percentiles": {
-          "mean": 1024.58,
-          "median": 1019.5,
-          "minimum": 1017,
-          "p90": 1044,
-          "p99": 1065
-        },
-        "run": {
-          "dim": 264,
-          "loops_per_measurement": 50,
-          "num_doc_vectors": 32,
-          "num_measurements": 50,
-          "num_query_vectors": 64,
-          "operation": "chamfer"
-        }
-      },
-      {
-        "latencies": [
-          1854,
-          1855,
-          1855,
-          1855,
-          1855,
-          1855,
-          1856,
-          1856,
-          1856,
-          1857,
-          1857,
-          1857,
-          1857,
-          1857,
-          1857,
-          1858,
-          1858,
-          1858,
-          1858,
-          1858,
-          1858,
-          1858,
-          1859,
-          1860,
-          1861,
-          1861,
-          1863,
-          1866,
-          1869,
-          1870,
-          1871,
-          1871,
-          1871,
-          1872,
-          1874,
-          1875,
-          1881,
-          1883,
-          1885,
-          1885,
-          1890,
-          1892,
-          1892,
-          1892,
-          1892,
-          1899,
-          1906,
-          1909,
-          1909,
-          1916
-        ],
-        "percentiles": {
-          "mean": 1870.38,
-          "median": 1861.0,
-          "minimum": 1854,
-          "p90": 1899,
-          "p99": 1916
-        },
-        "run": {
-          "dim": 128,
-          "loops_per_measurement": 10,
-          "num_doc_vectors": 1250,
-          "num_measurements": 50,
-          "num_query_vectors": 32,
-          "operation": "chamfer"
-        }
-      },
-      {
-        "latencies": [
-          3180,
-          3180,
-          3180,
-          3180,
-          3180,
-          3181,
-          3181,
-          3181,
-          3181,
-          3183,
-          3185,
-          3187,
-          3205,
-          3206,
-          3207,
-          3208,
-          3211,
-          3218,
-          3220,
-          3268
-        ],
-        "percentiles": {
-          "mean": 3196.1,
-          "median": 3184.0,
-          "minimum": 3180,
-          "p90": 3220,
-          "p99": 3268
-        },
-        "run": {
-          "dim": 512,
-          "loops_per_measurement": 2,
-          "num_doc_vectors": 1250,
-          "num_measurements": 20,
-          "num_query_vectors": 64,
-          "operation": "chamfer"
-        }
-      },
-      {
-        "latencies": [
-          1784,
-          1784,
-          1784,
-          1784,
-          1784,
-          1784,
-          1785,
-          1785,
-          1790,
-          1791,
-          1791,
-          1792,
-          1792,
-          1792,
-          1792,
-          1792,
-          1792,
-          1792,
-          1795,
-          1795,
-          1796,
-          1796,
-          1796,
-          1796,
-          1798,
-          1800,
-          1803,
-          1805,
-          1814,
-          1815,
-          1817,
-          1818,
-          1821,
-          1826,
-          1840,
-          1845,
-          1856,
-          1858,
-          1878,
-          1879,
-          1879,
-          1884,
-          1888,
-          1890,
-          1893,
-          1905,
-          1907,
-          1912,
-          1918,
-          1950
-        ],
-        "percentiles": {
-          "mean": 1825.26,
-          "median": 1799.0,
-          "minimum": 1784,
-          "p90": 1905,
-          "p99": 1950
-        },
-        "run": {
-          "dim": 128,
-          "loops_per_measurement": 200,
-          "num_doc_vectors": 32,
-          "num_measurements": 50,
-          "num_query_vectors": 64,
-          "operation": "chamfer"
-        }
-      },
-      {
-        "latencies": [
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1018,
-          1018,
-          1018,
-          1018,
-          1018,
-          1019,
-          1020,
-          1020,
-          1021,
-          1021,
-          1022,
-          1022,
-          1022,
-          1023,
-          1027,
-          1030,
-          1030,
-          1035,
-          1043,
-          1043,
-          1044,
-          1045,
-          1049,
-          1049,
-          1060
-        ],
-        "percentiles": {
-          "mean": 1023.2,
-          "median": 1017.5,
-          "minimum": 1017,
-          "p90": 1044,
-          "p99": 1060
-        },
-        "run": {
-          "dim": 512,
-          "loops_per_measurement": 50,
-          "num_doc_vectors": 32,
-          "num_measurements": 50,
-          "num_query_vectors": 32,
-          "operation": "chamfer"
-        }
-      },
-      {
-        "latencies": [
-          567,
-          567,
-          567,
-          567,
-          567,
-          567,
-          567,
-          567,
-          567,
-          567,
-          567,
-          567,
-          567,
-          567,
-          567,
-          569,
-          569,
-          569,
-          569,
-          569,
-          569,
-          570,
-          570,
-          570,
-          570,
-          570,
-          570,
-          570,
-          570,
-          570,
-          571,
-          571,
-          571,
-          571,
-          571,
-          571,
-          571,
-          571,
-          571,
-          571,
-          571,
-          571,
-          571,
-          571,
-          574,
-          578,
-          578,
-          594,
-          595,
-          598
-        ],
-        "percentiles": {
-          "mean": 571.2,
-          "median": 570.0,
-          "minimum": 567,
-          "p90": 578,
-          "p99": 598
-        },
-        "run": {
-          "dim": 128,
-          "loops_per_measurement": 500,
-          "num_doc_vectors": 32,
-          "num_measurements": 50,
-          "num_query_vectors": 8,
-          "operation": "max_sim"
-        }
-      },
-      {
-        "latencies": [
-          988,
-          988,
-          988,
-          988,
-          988,
-          988,
-          988,
-          988,
-          988,
-          989,
-          989,
-          989,
-          989,
-          989,
-          989,
-          989,
-          989,
-          989,
-          991,
-          991,
-          991,
-          991,
-          991,
-          991,
-          991,
-          991,
-          991,
-          991,
-          991,
-          991,
-          991,
-          991,
-          992,
-          992,
-          992,
-          992,
-          992,
-          992,
-          992,
-          992,
-          992,
-          996,
-          996,
-          1004,
-          1009,
-          1013,
-          1018,
-          1020,
-          1047,
-          1057
-        ],
-        "percentiles": {
-          "mean": 995.1,
-          "median": 991.0,
-          "minimum": 988,
-          "p90": 1013,
-          "p99": 1057
-        },
-        "run": {
-          "dim": 256,
-          "loops_per_measurement": 100,
-          "num_doc_vectors": 64,
-          "num_measurements": 50,
-          "num_query_vectors": 16,
-          "operation": "max_sim"
-        }
-      },
-      {
-        "latencies": [
-          1210,
-          1210,
-          1210,
-          1210,
-          1210,
-          1210,
-          1210,
-          1210,
-          1211,
-          1211,
-          1211,
-          1212,
-          1213,
-          1213,
-          1213,
-          1213,
-          1213,
-          1213,
-          1213,
-          1213,
-          1213,
-          1213,
-          1214,
-          1214,
-          1214,
-          1214,
-          1214,
-          1214,
-          1214,
-          1214,
-          1214,
-          1214,
-          1216,
-          1217,
-          1217,
-          1217,
-          1218,
-          1220,
-          1222,
-          1223,
-          1224,
-          1224,
-          1225,
-          1227,
-          1238,
-          1239,
-          1239,
-          1241,
-          1242,
-          1243
-        ],
-        "percentiles": {
-          "mean": 1217.74,
-          "median": 1214.0,
-          "minimum": 1210,
-          "p90": 1239,
-          "p99": 1243
-        },
-        "run": {
-          "dim": 384,
-          "loops_per_measurement": 20,
-          "num_doc_vectors": 128,
-          "num_measurements": 50,
-          "num_query_vectors": 32,
-          "operation": "max_sim"
-        }
-      },
-      {
-        "latencies": [
-          953,
-          953,
-          953,
-          953,
-          953,
-          953,
-          954,
-          954,
-          956,
-          957,
-          957,
-          957,
-          957,
-          957,
-          957,
-          957,
-          957,
-          957,
-          957,
-          957,
-          957,
-          957,
-          957,
-          957,
-          957,
-          957,
-          957,
-          957,
-          958,
-          958,
-          958,
-          958,
-          958,
-          958,
-          960,
-          961,
-          961,
-          961,
-          961,
-          961,
-          961,
-          961,
-          962,
-          963,
-          971,
-          976,
-          978,
-          984,
-          984,
-          987
-        ],
-        "percentiles": {
-          "mean": 960.1,
-          "median": 957.0,
-          "minimum": 953,
-          "p90": 976,
-          "p99": 987
-        },
-        "run": {
-          "dim": 256,
-          "loops_per_measurement": 200,
-          "num_doc_vectors": 16,
-          "num_measurements": 50,
-          "num_query_vectors": 32,
-          "operation": "max_sim"
-        }
-      },
-      {
-        "latencies": [
-          1016,
-          1016,
-          1016,
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1018,
-          1018,
-          1018,
-          1018,
-          1018,
-          1018,
-          1018,
-          1018,
-          1019,
-          1019,
-          1019,
-          1019,
-          1019,
-          1019,
-          1019,
-          1021,
-          1021,
-          1023,
-          1023,
-          1025,
-          1032,
-          1044,
-          1045,
-          1045,
-          1045,
-          1047,
-          1052,
-          1058,
-          1061
-        ],
-        "percentiles": {
-          "mean": 1023.46,
-          "median": 1018.0,
-          "minimum": 1016,
-          "p90": 1045,
-          "p99": 1061
-        },
-        "run": {
-          "dim": 264,
-          "loops_per_measurement": 50,
-          "num_doc_vectors": 32,
-          "num_measurements": 50,
-          "num_query_vectors": 64,
-          "operation": "max_sim"
-        }
-      },
-      {
-        "latencies": [
-          1858,
-          1858,
-          1860,
-          1860,
-          1860,
-          1860,
-          1860,
-          1860,
-          1860,
-          1860,
-          1860,
-          1861,
-          1861,
-          1861,
-          1861,
-          1861,
-          1861,
-          1861,
-          1862,
-          1863,
-          1863,
-          1864,
-          1865,
-          1867,
-          1868,
-          1872,
-          1873,
-          1876,
-          1878,
-          1881,
-          1882,
-          1883,
-          1888,
-          1889,
-          1889,
-          1890,
-          1890,
-          1890,
-          1891,
-          1892,
-          1905,
-          1906,
-          1908,
-          1934,
-          1962,
-          1967,
-          1974,
-          1988,
-          2004,
-          2014
-        ],
-        "percentiles": {
-          "mean": 1887.22,
-          "median": 1870.0,
-          "minimum": 1858,
-          "p90": 1967,
-          "p99": 2014
-        },
-        "run": {
-          "dim": 128,
-          "loops_per_measurement": 10,
-          "num_doc_vectors": 1250,
-          "num_measurements": 50,
-          "num_query_vectors": 32,
-          "operation": "max_sim"
-        }
-      },
-      {
-        "latencies": [
-          3177,
-          3177,
-          3177,
-          3179,
-          3192,
-          3201,
-          3212,
-          3222,
-          3251,
-          3251,
-          3255,
-          3256,
-          3256,
-          3321,
-          3381,
-          3399,
-          3400,
-          3419,
-          3422,
-          3445
-        ],
-        "percentiles": {
-          "mean": 3279.65,
-          "median": 3253.0,
-          "minimum": 3177,
-          "p90": 3422,
-          "p99": 3445
-        },
-        "run": {
-          "dim": 512,
-          "loops_per_measurement": 2,
-          "num_doc_vectors": 1250,
-          "num_measurements": 20,
-          "num_query_vectors": 64,
-          "operation": "max_sim"
-        }
-      },
-      {
-        "latencies": [
-          1783,
-          1784,
-          1787,
-          1791,
-          1791,
-          1791,
-          1813,
-          1838,
-          1853,
-          1868,
-          1871,
-          1882,
-          1882,
-          1884,
-          1890,
-          1899,
-          1899,
-          1899,
-          1900,
-          1901,
-          1905,
-          1906,
-          1908,
-          1909,
-          1911,
-          1911,
-          1911,
-          1911,
-          1914,
-          1915,
-          1915,
-          1916,
-          1916,
-          1917,
-          1919,
-          1922,
-          1922,
-          1923,
-          1923,
-          1925,
-          1927,
-          1927,
-          1928,
-          1929,
-          1929,
-          1933,
-          1937,
-          1938,
-          1940,
-          1983
-        ],
-        "percentiles": {
-          "mean": 1893.52,
-          "median": 1911.0,
-          "minimum": 1783,
-          "p90": 1933,
-          "p99": 1983
-        },
-        "run": {
-          "dim": 128,
-          "loops_per_measurement": 200,
-          "num_doc_vectors": 32,
-          "num_measurements": 50,
-          "num_query_vectors": 64,
-          "operation": "max_sim"
-        }
-      },
-      {
-        "latencies": [
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1017,
-          1020,
-          1023,
-          1023,
-          1025,
-          1028,
-          1033,
-          1033,
-          1034,
-          1037,
-          1038,
-          1040,
-          1043,
-          1044,
-          1052,
-          1052,
-          1057,
-          1060,
-          1063,
-          1078,
-          1088,
-          1088,
-          1088,
-          1088,
-          1088,
-          1088,
-          1088,
-          1088,
-          1090,
-          1090,
-          1090,
-          1092,
-          1093,
-          1093,
-          1094,
-          1094
-        ],
-        "percentiles": {
-          "mean": 1049.56,
-          "median": 1039.0,
-          "minimum": 1017,
-          "p90": 1092,
-          "p99": 1094
-        },
-        "run": {
-          "dim": 512,
-          "loops_per_measurement": 50,
-          "num_doc_vectors": 32,
-          "num_measurements": 50,
-          "num_query_vectors": 32,
-          "operation": "max_sim"
-        }
-      }
-    ]
-  },
-  {
-    "input": {
-      "content": {
-        "element_type": "float16",
-        "implementation": "optimized",
-        "runs": [
-          {
-            "dim": 256,
-            "loops_per_measurement": 100,
-            "num_doc_vectors": 64,
-            "num_measurements": 50,
-            "num_query_vectors": 16,
-            "operation": "chamfer"
-          },
-          {
-            "dim": 128,
-            "loops_per_measurement": 10,
-            "num_doc_vectors": 1250,
-            "num_measurements": 50,
-            "num_query_vectors": 32,
-            "operation": "chamfer"
-          },
-          {
-            "dim": 256,
-            "loops_per_measurement": 100,
-            "num_doc_vectors": 64,
-            "num_measurements": 50,
-            "num_query_vectors": 16,
-            "operation": "max_sim"
-          },
-          {
-            "dim": 128,
-            "loops_per_measurement": 10,
-            "num_doc_vectors": 1250,
-            "num_measurements": 50,
-            "num_query_vectors": 32,
-            "operation": "max_sim"
-          }
-        ]
-      },
-      "type": "multi-vector-op"
-    },
-    "results": [
-      {
-        "latencies": [
-          1734,
-          1734,
-          1736,
-          1736,
-          1737,
-          1737,
-          1737,
-          1738,
-          1738,
-          1738,
-          1738,
-          1739,
-          1740,
-          1740,
-          1741,
-          1744,
-          1744,
-          1751,
-          1751,
-          1753,
-          1754,
-          1754,
-          1756,
-          1759,
-          1761,
-          1764,
-          1767,
-          1767,
-          1767,
-          1768,
-          1768,
-          1769,
-          1769,
-          1773,
-          1774,
-          1775,
-          1779,
-          1787,
-          1794,
-          1808,
-          1822,
-          1825,
-          1829,
-          1829,
-          1844,
-          1846,
-          1852,
-          1859,
-          1903,
-          2194
-        ],
-        "percentiles": {
-          "mean": 1780.44,
-          "median": 1762.5,
-          "minimum": 1734,
-          "p90": 1846,
-          "p99": 2194
-        },
-        "run": {
-          "dim": 256,
-          "loops_per_measurement": 100,
-          "num_doc_vectors": 64,
-          "num_measurements": 50,
-          "num_query_vectors": 16,
-          "operation": "chamfer"
-        }
-      },
-      {
-        "latencies": [
-          2130,
-          2130,
-          2130,
-          2131,
-          2133,
-          2133,
-          2140,
-          2142,
-          2149,
-          2151,
-          2158,
-          2160,
-          2163,
-          2164,
-          2166,
-          2167,
-          2167,
-          2168,
-          2171,
-          2173,
-          2174,
-          2176,
-          2177,
-          2178,
-          2178,
-          2181,
-          2184,
-          2189,
-          2195,
-          2195,
-          2197,
-          2198,
-          2198,
-          2201,
-          2203,
-          2207,
-          2215,
-          2217,
-          2220,
-          2229,
-          2240,
-          2242,
-          2243,
-          2249,
-          2250,
-          2291,
-          2305,
-          2438,
-          2613,
-          2643
-        ],
-        "percentiles": {
-          "mean": 2209.04,
-          "median": 2179.5,
-          "minimum": 2130,
-          "p90": 2291,
-          "p99": 2643
-        },
-        "run": {
-          "dim": 128,
-          "loops_per_measurement": 10,
-          "num_doc_vectors": 1250,
-          "num_measurements": 50,
-          "num_query_vectors": 32,
-          "operation": "chamfer"
-        }
-      },
-      {
-        "latencies": [
-          1731,
-          1733,
-          1737,
-          1737,
-          1737,
-          1741,
-          1741,
-          1745,
-          1745,
-          1750,
-          1750,
-          1750,
-          1750,
-          1751,
-          1754,
-          1754,
-          1755,
-          1758,
-          1758,
-          1759,
-          1761,
-          1761,
-          1766,
-          1768,
-          1770,
-          1771,
-          1771,
-          1772,
-          1773,
-          1773,
-          1775,
-          1776,
-          1776,
-          1778,
-          1785,
-          1788,
-          1789,
-          1791,
-          1795,
-          1800,
-          1804,
-          1808,
-          1814,
-          1822,
-          1832,
-          1833,
-          1834,
-          1864,
-          1867,
-          1869
-        ],
-        "percentiles": {
-          "mean": 1776.44,
-          "median": 1770.5,
-          "minimum": 1731,
-          "p90": 1833,
-          "p99": 1869
-        },
-        "run": {
-          "dim": 256,
-          "loops_per_measurement": 100,
-          "num_doc_vectors": 64,
-          "num_measurements": 50,
-          "num_query_vectors": 16,
-          "operation": "max_sim"
-        }
-      },
-      {
-        "latencies": [
-          2127,
-          2127,
-          2129,
-          2130,
-          2132,
-          2141,
-          2142,
-          2142,
-          2147,
-          2148,
-          2149,
-          2150,
-          2154,
-          2154,
-          2159,
-          2162,
-          2166,
-          2168,
-          2170,
-          2173,
-          2177,
-          2180,
-          2180,
-          2181,
-          2181,
-          2182,
-          2183,
-          2187,
-          2196,
-          2196,
-          2199,
-          2200,
-          2204,
-          2211,
-          2213,
-          2216,
-          2224,
-          2255,
-          2256,
-          2271,
-          2354,
-          2488,
-          2493,
-          2495,
-          2498,
-          2505,
-          2525,
-          2653,
-          2657,
-          3515
-        ],
-        "percentiles": {
-          "mean": 2264.9,
-          "median": 2181.5,
-          "minimum": 2127,
-          "p90": 2505,
-          "p99": 3515
-        },
-        "run": {
-          "dim": 128,
-          "loops_per_measurement": 10,
-          "num_doc_vectors": 1250,
-          "num_measurements": 50,
-          "num_query_vectors": 32,
-          "operation": "max_sim"
-        }
-      }
-    ]
-  },
-  {
-    "input": {
-      "content": {
-        "element_type": "float32",
-        "implementation": "reference",
-        "runs": [
-          {
-            "dim": 128,
-            "loops_per_measurement": 50,
-            "num_doc_vectors": 32,
-            "num_measurements": 50,
-            "num_query_vectors": 8,
-            "operation": "chamfer"
-          },
-          {
-            "dim": 384,
-            "loops_per_measurement": 2,
-            "num_doc_vectors": 128,
-            "num_measurements": 50,
-            "num_query_vectors": 32,
-            "operation": "chamfer"
-          },
-          {
-            "dim": 128,
-            "loops_per_measurement": 50,
-            "num_doc_vectors": 32,
-            "num_measurements": 50,
-            "num_query_vectors": 8,
-            "operation": "max_sim"
-          },
-          {
-            "dim": 384,
-            "loops_per_measurement": 2,
-            "num_doc_vectors": 128,
-            "num_measurements": 50,
-            "num_query_vectors": 32,
-            "operation": "max_sim"
-          }
-        ]
-      },
-      "type": "multi-vector-op"
-    },
-    "results": [
-      {
-        "latencies": [
-          64,
-          64,
-          64,
-          66,
-          66,
-          66,
-          66,
-          66,
-          66,
-          66,
-          66,
-          66,
-          66,
-          66,
-          66,
-          66,
-          66,
-          66,
-          66,
-          66,
-          66,
-          66,
-          66,
-          66,
-          66,
-          66,
-          66,
-          66,
-          66,
-          66,
-          66,
-          66,
-          66,
-          66,
-          66,
-          66,
-          67,
-          67,
-          67,
-          67,
-          67,
-          67,
-          67,
-          67,
-          67,
-          68,
-          68,
-          69,
-          71,
-          127
-        ],
-        "percentiles": {
-          "mean": 67.52,
-          "median": 66.0,
-          "minimum": 64,
-          "p90": 68,
-          "p99": 127
-        },
-        "run": {
-          "dim": 128,
-          "loops_per_measurement": 50,
-          "num_doc_vectors": 32,
-          "num_measurements": 50,
-          "num_query_vectors": 8,
-          "operation": "chamfer"
-        }
-      },
-      {
-        "latencies": [
-          130,
-          130,
-          130,
-          130,
-          130,
-          130,
-          130,
-          130,
-          130,
-          130,
-          131,
-          131,
-          131,
-          131,
-          132,
-          132,
-          133,
-          133,
-          135,
-          136,
-          136,
-          137,
-          138,
-          138,
-          138,
-          138,
-          138,
-          138,
-          138,
-          138,
-          138,
-          138,
-          138,
-          138,
-          138,
-          138,
-          138,
-          138,
-          139,
-          139,
-          139,
-          139,
-          139,
-          140,
-          140,
-          140,
-          141,
-          143,
-          147,
-          161
-        ],
-        "percentiles": {
-          "mean": 136.26,
-          "median": 138.0,
-          "minimum": 130,
-          "p90": 140,
-          "p99": 161
-        },
-        "run": {
-          "dim": 384,
-          "loops_per_measurement": 2,
-          "num_doc_vectors": 128,
-          "num_measurements": 50,
-          "num_query_vectors": 32,
-          "operation": "chamfer"
-        }
-      },
-      {
-        "latencies": [
-          62,
-          62,
-          62,
-          62,
-          62,
-          63,
-          63,
-          63,
-          63,
-          63,
-          63,
-          63,
-          63,
-          63,
-          63,
-          64,
-          64,
-          65,
-          66,
-          66,
-          66,
-          66,
-          66,
-          66,
-          66,
-          66,
-          66,
-          66,
-          66,
-          66,
-          66,
-          66,
-          66,
-          66,
-          66,
-          66,
-          67,
-          67,
-          67,
-          67,
-          67,
-          67,
-          67,
-          68,
-          68,
-          69,
-          71,
-          72,
-          78,
-          106
-        ],
-        "percentiles": {
-          "mean": 66.44,
-          "median": 66.0,
-          "minimum": 62,
-          "p90": 69,
-          "p99": 106
-        },
-        "run": {
-          "dim": 128,
-          "loops_per_measurement": 50,
-          "num_doc_vectors": 32,
-          "num_measurements": 50,
-          "num_query_vectors": 8,
-          "operation": "max_sim"
-        }
-      },
-      {
-        "latencies": [
-          130,
-          131,
-          131,
-          131,
-          131,
-          131,
-          131,
-          131,
-          131,
-          131,
-          131,
-          131,
-          131,
-          131,
-          131,
-          131,
-          131,
-          131,
-          131,
-          131,
-          131,
-          131,
-          131,
-          132,
-          132,
-          132,
-          132,
-          132,
-          133,
-          133,
-          134,
-          134,
-          135,
-          135,
-          135,
-          136,
-          136,
-          137,
-          139,
-          139,
-          140,
-          142,
-          142,
-          143,
-          144,
-          145,
-          145,
-          147,
-          155,
-          158
-        ],
-        "percentiles": {
-          "mean": 135.18,
-          "median": 132.0,
-          "minimum": 130,
-          "p90": 145,
-          "p99": 158
-        },
-        "run": {
-          "dim": 384,
-          "loops_per_measurement": 2,
-          "num_doc_vectors": 128,
-          "num_measurements": 50,
-          "num_query_vectors": 32,
-          "operation": "max_sim"
-        }
-      }
-    ]
-  },
-  {
-    "input": {
-      "content": {
-        "element_type": "float16",
-        "implementation": "reference",
-        "runs": [
-          {
-            "dim": 128,
-            "loops_per_measurement": 50,
-            "num_doc_vectors": 32,
-            "num_measurements": 50,
-            "num_query_vectors": 8,
-            "operation": "chamfer"
-          },
-          {
-            "dim": 384,
-            "loops_per_measurement": 2,
-            "num_doc_vectors": 128,
-            "num_measurements": 50,
-            "num_query_vectors": 32,
-            "operation": "max_sim"
-          }
-        ]
-      },
-      "type": "multi-vector-op"
-    },
-    "results": [
-      {
-        "latencies": [
-          73,
-          73,
-          73,
-          73,
-          73,
-          73,
-          73,
-          73,
-          73,
-          73,
-          73,
-          73,
-          73,
-          73,
-          73,
-          73,
-          73,
-          74,
-          74,
-          74,
-          74,
-          74,
-          74,
-          75,
-          75,
-          76,
-          76,
-          76,
-          76,
-          76,
-          76,
-          77,
-          77,
-          77,
-          77,
-          77,
-          77,
-          77,
-          77,
-          77,
-          78,
-          78,
-          78,
-          79,
-          80,
-          80,
-          80,
-          84,
-          87,
-          92
-        ],
-        "percentiles": {
-          "mean": 76.0,
-          "median": 75.5,
-          "minimum": 73,
-          "p90": 80,
-          "p99": 92
-        },
-        "run": {
-          "dim": 128,
-          "loops_per_measurement": 50,
-          "num_doc_vectors": 32,
-          "num_measurements": 50,
-          "num_query_vectors": 8,
-          "operation": "chamfer"
-        }
-      },
-      {
-        "latencies": [
-          135,
-          135,
-          135,
-          135,
-          135,
-          135,
-          135,
-          135,
-          136,
-          136,
-          137,
-          138,
-          140,
-          141,
-          141,
-          141,
-          141,
-          141,
-          142,
-          142,
-          142,
-          142,
-          142,
-          142,
-          142,
-          142,
-          142,
-          142,
-          142,
-          142,
-          142,
-          142,
-          142,
-          142,
-          143,
-          143,
-          143,
-          144,
-          144,
-          145,
-          145,
-          145,
-          147,
-          150,
-          151,
-          151,
-          153,
-          154,
-          158,
-          158
-        ],
-        "percentiles": {
-          "mean": 142.36,
-          "median": 142.0,
-          "minimum": 135,
-          "p90": 151,
-          "p99": 158
-        },
-        "run": {
-          "dim": 384,
-          "loops_per_measurement": 2,
-          "num_doc_vectors": 128,
-          "num_measurements": 50,
-          "num_query_vectors": 32,
-          "operation": "max_sim"
-        }
-      }
-    ]
-  }
-]
\ No newline at end of file

From 3a89c3750bef66322e5e5c3f90e91d43e941a74b Mon Sep 17 00:00:00 2001
From: Suryansh Gupta <suryansh.gupta2000@gmail.com>
Date: Thu, 7 May 2026 12:09:00 +0530
Subject: [PATCH 05/13] Add Cargo.lock

---
 Cargo.lock | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Cargo.lock b/Cargo.lock
index c7b68684e..e179d3320 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -699,7 +699,7 @@ dependencies = [
 
 [[package]]
 name = "diskann-benchmark-multi-vector"
-version = "0.50.1"
+version = "0.51.0"
 dependencies = [
  "anyhow",
  "diskann-benchmark-runner",

From 96d17b30378159ffdfc04b5afbeb0bf225992de2 Mon Sep 17 00:00:00 2001
From: Suryansh Gupta <suryansh.gupta2000@gmail.com>
Date: Fri, 8 May 2026 01:48:37 +0530
Subject: [PATCH 06/13] Remove unused scalar benchmark config file

---
 .../graph_index_scalar_oai_large.json         | 115 ------------------
 1 file changed, 115 deletions(-)
 delete mode 100644 diskann-benchmark/perf_test_inputs/graph_index_scalar_oai_large.json

diff --git a/diskann-benchmark/perf_test_inputs/graph_index_scalar_oai_large.json b/diskann-benchmark/perf_test_inputs/graph_index_scalar_oai_large.json
deleted file mode 100644
index 09752477a..000000000
--- a/diskann-benchmark/perf_test_inputs/graph_index_scalar_oai_large.json
+++ /dev/null
@@ -1,115 +0,0 @@
-{
-    "search_directories": [
-      "/mnt/nvme/s"
-    ],
-    "jobs": [
-      {
-        "type": "graph-index-build-sq",
-        "content": {
-          "build": {
-            "data_type": "float16",
-            "data": "SentenceChunk_OAILarge_1M_normalized_1000000.bin",
-            "distance": "squared_l2",
-            "max_degree": 32,
-            "l_build": 100,
-            "alpha": 1.2,
-            "backedge_ratio": 1.0,
-            "num_threads": 8,
-            "multi_insert":null,
-            "search_phase": {
-              "queries": "SentenceChunk_OAILarge_query_normalized_6809.bin",
-              "groundtruth": "SentenceChunk-1M-gt-6k-recall-at2000",
-              "reps": 2,
-              "num_threads": [
-                8
-              ],
-              "runs": [
-                {
-                  "search_n": 10,
-                  "search_l": [
-                    50
-                  ],
-                  "recall_k": 10
-                }
-              ]
-            }
-          },
-          "num_bits": 1,
-          "standard_deviations": 2,
-          "use_fp_for_search": true
-        }
-      },
-      {
-        "type": "graph-index-build-sq",
-        "content": {
-          "build": {
-            "data_type": "float16",
-            "data": "SentenceChunk_OAILarge_1M_normalized_1000000.bin",
-            "distance": "squared_l2",
-            "max_degree": 32,
-            "l_build": 100,
-            "alpha": 1.2,
-            "backedge_ratio": 1.0,
-            "num_threads": 8,
-            "multi_insert":null,
-            "search_phase": {
-              "queries": "SentenceChunk_OAILarge_query_normalized_6809.bin",
-              "groundtruth": "SentenceChunk-1M-gt-6k-recall-at2000",
-              "reps": 2,
-              "num_threads": [
-                8
-              ],
-              "runs": [
-                {
-                  "search_n": 10,
-                  "search_l": [
-                    50
-                  ],
-                  "recall_k": 10
-                }
-              ]
-            }
-          },
-          "num_bits": 4,
-          "standard_deviations": 2,
-          "use_fp_for_search": true
-        }
-      },
-      {
-        "type": "graph-index-build-sq",
-        "content": {
-          "build": {
-            "data_type": "float16",
-            "data": "SentenceChunk_OAILarge_1M_normalized_1000000.bin",
-            "distance": "squared_l2",
-            "max_degree": 32,
-            "l_build": 100,
-            "alpha": 1.2,
-            "backedge_ratio": 1.0,
-            "num_threads": 8,
-            "multi_insert":null,
-            "search_phase": {
-              "queries": "SentenceChunk_OAILarge_query_normalized_6809.bin",
-              "groundtruth": "SentenceChunk-1M-gt-6k-recall-at2000",
-              "reps": 1,
-              "num_threads": [
-                8
-              ],
-              "runs": [
-                {
-                  "search_n": 10,
-                  "search_l": [
-                    50
-                  ],
-                  "recall_k": 10
-                }
-              ]
-            }
-          },
-          "num_bits": 8,
-          "standard_deviations": 2,
-          "use_fp_for_search": true
-        }
-      }
-    ]
-  }
\ No newline at end of file

From 6b33719c7b082fb6142d0b372c0c227c501fdc8c Mon Sep 17 00:00:00 2001
From: Suryansh Gupta <suryansh.gupta2000@gmail.com>
Date: Fri, 8 May 2026 01:51:13 +0530
Subject: [PATCH 07/13] Revert "Remove unused scalar benchmark config file"

This reverts commit 96d17b30378159ffdfc04b5afbeb0bf225992de2.
---
 .../graph_index_scalar_oai_large.json         | 115 ++++++++++++++++++
 1 file changed, 115 insertions(+)
 create mode 100644 diskann-benchmark/perf_test_inputs/graph_index_scalar_oai_large.json

diff --git a/diskann-benchmark/perf_test_inputs/graph_index_scalar_oai_large.json b/diskann-benchmark/perf_test_inputs/graph_index_scalar_oai_large.json
new file mode 100644
index 000000000..09752477a
--- /dev/null
+++ b/diskann-benchmark/perf_test_inputs/graph_index_scalar_oai_large.json
@@ -0,0 +1,115 @@
+{
+    "search_directories": [
+      "/mnt/nvme/s"
+    ],
+    "jobs": [
+      {
+        "type": "graph-index-build-sq",
+        "content": {
+          "build": {
+            "data_type": "float16",
+            "data": "SentenceChunk_OAILarge_1M_normalized_1000000.bin",
+            "distance": "squared_l2",
+            "max_degree": 32,
+            "l_build": 100,
+            "alpha": 1.2,
+            "backedge_ratio": 1.0,
+            "num_threads": 8,
+            "multi_insert":null,
+            "search_phase": {
+              "queries": "SentenceChunk_OAILarge_query_normalized_6809.bin",
+              "groundtruth": "SentenceChunk-1M-gt-6k-recall-at2000",
+              "reps": 2,
+              "num_threads": [
+                8
+              ],
+              "runs": [
+                {
+                  "search_n": 10,
+                  "search_l": [
+                    50
+                  ],
+                  "recall_k": 10
+                }
+              ]
+            }
+          },
+          "num_bits": 1,
+          "standard_deviations": 2,
+          "use_fp_for_search": true
+        }
+      },
+      {
+        "type": "graph-index-build-sq",
+        "content": {
+          "build": {
+            "data_type": "float16",
+            "data": "SentenceChunk_OAILarge_1M_normalized_1000000.bin",
+            "distance": "squared_l2",
+            "max_degree": 32,
+            "l_build": 100,
+            "alpha": 1.2,
+            "backedge_ratio": 1.0,
+            "num_threads": 8,
+            "multi_insert":null,
+            "search_phase": {
+              "queries": "SentenceChunk_OAILarge_query_normalized_6809.bin",
+              "groundtruth": "SentenceChunk-1M-gt-6k-recall-at2000",
+              "reps": 2,
+              "num_threads": [
+                8
+              ],
+              "runs": [
+                {
+                  "search_n": 10,
+                  "search_l": [
+                    50
+                  ],
+                  "recall_k": 10
+                }
+              ]
+            }
+          },
+          "num_bits": 4,
+          "standard_deviations": 2,
+          "use_fp_for_search": true
+        }
+      },
+      {
+        "type": "graph-index-build-sq",
+        "content": {
+          "build": {
+            "data_type": "float16",
+            "data": "SentenceChunk_OAILarge_1M_normalized_1000000.bin",
+            "distance": "squared_l2",
+            "max_degree": 32,
+            "l_build": 100,
+            "alpha": 1.2,
+            "backedge_ratio": 1.0,
+            "num_threads": 8,
+            "multi_insert":null,
+            "search_phase": {
+              "queries": "SentenceChunk_OAILarge_query_normalized_6809.bin",
+              "groundtruth": "SentenceChunk-1M-gt-6k-recall-at2000",
+              "reps": 1,
+              "num_threads": [
+                8
+              ],
+              "runs": [
+                {
+                  "search_n": 10,
+                  "search_l": [
+                    50
+                  ],
+                  "recall_k": 10
+                }
+              ]
+            }
+          },
+          "num_bits": 8,
+          "standard_deviations": 2,
+          "use_fp_for_search": true
+        }
+      }
+    ]
+  }
\ No newline at end of file

From d06df7ee59ebb69009837dbdbc5bcd8bcaedfc84 Mon Sep 17 00:00:00 2001
From: Suryansh Gupta <suryansh.gupta2000@gmail.com>
Date: Tue, 12 May 2026 20:47:04 +0530
Subject: [PATCH 08/13] Fold the new crate to existing diskann-benchmark crate

---
 Cargo.lock                                    |  17 -
 Cargo.toml                                    |   1 -
 diskann-benchmark-multi-vector/Cargo.toml     |  30 -
 diskann-benchmark-multi-vector/README.md      | 136 ---
 diskann-benchmark-multi-vector/src/bin.rs     |  96 --
 diskann-benchmark-multi-vector/src/lib.rs     | 938 ------------------
 diskann-benchmark/Cargo.toml                  |   3 +
 .../example/multi-vector-test.json            |   0
 .../example}/multi-vector.json                |   0
 .../multi-vector-tolerance.json               |   0
 diskann-benchmark/src/backend/mod.rs          |   2 +
 diskann-benchmark/src/backend/multi_vector.rs | 806 +++++++++++++++
 diskann-benchmark/src/inputs/mod.rs           |   2 +
 diskann-benchmark/src/inputs/multi_vector.rs  | 190 ++++
 diskann-benchmark/src/main.rs                 |  86 ++
 .../src/multi_vector/matrix.rs                |  44 +
 diskann-quantization/src/multi_vector/mod.rs  |   4 +-
 17 files changed, 1135 insertions(+), 1220 deletions(-)
 delete mode 100644 diskann-benchmark-multi-vector/Cargo.toml
 delete mode 100644 diskann-benchmark-multi-vector/README.md
 delete mode 100644 diskann-benchmark-multi-vector/src/bin.rs
 delete mode 100644 diskann-benchmark-multi-vector/src/lib.rs
 rename diskann-benchmark-multi-vector/examples/test.json => diskann-benchmark/example/multi-vector-test.json (100%)
 rename {diskann-benchmark-multi-vector/examples => diskann-benchmark/example}/multi-vector.json (100%)
 rename diskann-benchmark-multi-vector/examples/tolerance.json => diskann-benchmark/perf_test_inputs/multi-vector-tolerance.json (100%)
 create mode 100644 diskann-benchmark/src/backend/multi_vector.rs
 create mode 100644 diskann-benchmark/src/inputs/multi_vector.rs

diff --git a/Cargo.lock b/Cargo.lock
index e179d3320..1713f4b87 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -697,23 +697,6 @@ dependencies = [
  "tokio",
 ]
 
-[[package]]
-name = "diskann-benchmark-multi-vector"
-version = "0.51.0"
-dependencies = [
- "anyhow",
- "diskann-benchmark-runner",
- "diskann-quantization",
- "diskann-utils",
- "diskann-vector",
- "half",
- "rand 0.9.4",
- "serde",
- "serde_json",
- "tempfile",
- "thiserror 2.0.17",
-]
-
 [[package]]
 name = "diskann-benchmark-runner"
 version = "0.51.0"
diff --git a/Cargo.toml b/Cargo.toml
index cce02b501..6353773c9 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -21,7 +21,6 @@ members = [
     "diskann-benchmark-runner",
     "diskann-benchmark-core",
     "diskann-benchmark-simd",
-    "diskann-benchmark-multi-vector",
     "diskann-benchmark",
     "diskann-tools",
     "vectorset",
diff --git a/diskann-benchmark-multi-vector/Cargo.toml b/diskann-benchmark-multi-vector/Cargo.toml
deleted file mode 100644
index f8eb937e1..000000000
--- a/diskann-benchmark-multi-vector/Cargo.toml
+++ /dev/null
@@ -1,30 +0,0 @@
-[package]
-name = "diskann-benchmark-multi-vector"
-version.workspace = true
-description.workspace = true
-authors.workspace = true
-documentation.workspace = true
-license.workspace = true
-edition.workspace = true
-
-[[bin]]
-name = "benchmark-multi-vector"
-path = "src/bin.rs"
-
-[dependencies]
-anyhow.workspace = true
-diskann-utils = { workspace = true, default-features = false }
-half = { workspace = true, features = ["rand_distr"] }
-diskann-benchmark-runner = { workspace = true }
-diskann-quantization = { workspace = true }
-diskann-vector = { workspace = true }
-rand.workspace = true
-serde = { workspace = true, features = ["derive"] }
-serde_json.workspace = true
-thiserror.workspace = true
-
-[lints]
-workspace = true
-
-[dev-dependencies]
-tempfile.workspace = true
diff --git a/diskann-benchmark-multi-vector/README.md b/diskann-benchmark-multi-vector/README.md
deleted file mode 100644
index 014a393a1..000000000
--- a/diskann-benchmark-multi-vector/README.md
+++ /dev/null
@@ -1,136 +0,0 @@
-# diskann-benchmark-multi-vector
-
-Benchmarks and regression detection for the **multi-vector distance
-operations** exposed by `diskann-quantization` — `Chamfer` and `MaxSim` —
-across `f32` and `f16` element types.
-
-## Layout
-
-- `src/lib.rs` — benchmark library: input/tolerance schemas, kernel
-  dispatch, regression checker.
-- `src/bin.rs` — `benchmark-multi-vector` CLI entry point.
-- `examples/multi-vector.json` — full benchmark matrix covering both
-  operations across the registered kernels and a representative range of
-  shapes.
-- `examples/test.json` — minimal smoke configuration consumed by the
-  integration tests.
-- `examples/tolerance.json` — default regression thresholds.
-
-## Registered kernels
-
-The crate registers four kernels — one per `(element_type, implementation)`
-pair:
-
-| Tag                              | Element | Implementation       |
-| -------------------------------- | ------- | -------------------- |
-| `multi-vector-op-f32-optimized`  | `f32`   | `QueryComputer`      |
-| `multi-vector-op-f16-optimized`  | `f16`   | `QueryComputer`      |
-| `multi-vector-op-f32-reference`  | `f32`   | `Chamfer` / `MaxSim` |
-| `multi-vector-op-f16-reference`  | `f16`   | `Chamfer` / `MaxSim` |
-
-The **optimized** path constructs a `QueryComputer` once per shape (which
-internally selects the best available SIMD kernel for the host) and calls
-`chamfer` / `max_sim` inside the timed loop. The **reference** path drives
-the `Chamfer` / `MaxSim` fallback used by the `multi_vector` unit tests —
-useful both as a numerical ground truth and as a baseline to measure SIMD
-speedups against.
-
-## Time normalization
-
-Per-measurement latency is normalized to **nanoseconds per inner-product
-call**, abbreviated `ns/IP`:
-
-```
-ns/IP = min_latency_µs * 1000 / (Q * D * loops_per_measurement)
-```
-
-Two important properties:
-
-- **Independent of `Q`, `D`, and `loops_per_measurement`.** Reshaping the
-  benchmark or scaling the loop budget leaves the metric unchanged, so
-  cache-residency effects and SIMD utilization show up directly.
-- **Approximately linear in `Dim`.** Each inner-product call is itself an
-  O(`Dim`) operation, so `ns/IP` grows with `Dim` — that is why the table
-  headers read `ns/IP @ Dim`. Compare across rows with the same `Dim`; to
-  compare across different `Dim`s, divide further by `Dim` to recover ns
-  per scalar multiply.
-
-This is the right metric for the two things this crate cares about:
-detecting per-shape regressions (the `Dim` factor cancels) and comparing
-optimized vs. reference at a fixed shape.
-
-## Usage
-
-All examples below assume you are inside the crate directory and use a
-small shell function for brevity:
-
-```bash
-bench() { cargo run --release -p diskann-benchmark-multi-vector --bin benchmark-multi-vector -- "$@"; }
-```
-
-### Run benchmarks
-
-`run` executes every job in the input file and writes per-measurement
-latencies plus percentiles to the output file:
-
-```bash
-bench run --input-file examples/multi-vector.json --output-file before.json
-```
-
-### Regression check workflow
-
-The check workflow is **two-phase**: validate the tolerance file once, then
-compare two recorded result files.
-
-**Phase 1 — preflight.** No benchmarks are executed. The verifier confirms
-that every entry in `tolerance.json` matches at least one job in the input
-file, and that every job is matched by exactly one entry. Run it whenever
-you edit `tolerance.json`:
-
-```bash
-bench check verify \
-  --tolerances examples/tolerance.json \
-  --input-file examples/multi-vector.json
-```
-
-**Phase 2 — comparison.** Record results before and after a code change,
-then compare. The command exits non-zero if any run regresses past its
-tolerance:
-
-```bash
-# On the baseline commit
-bench run --input-file examples/multi-vector.json --output-file before.json
-
-# On the change commit
-bench run --input-file examples/multi-vector.json --output-file after.json
-
-# Compare
-bench check run \
-  --tolerances examples/tolerance.json \
-  --input-file examples/multi-vector.json \
-  --before before.json --after after.json \
-  --output-file checks.json
-```
-
-A run **fails** when its post-change `ns/IP` minimum exceeds the
-baseline minimum by more than `min_time_regression` (default `0.05` =
-5%). Improvements (negative change) always pass.
-
-### How tolerances are matched to jobs
-
-Each entry in `tolerance.json` has the shape `{ input, tolerance }`. The
-`input` block acts as a **partial template** against the jobs in the input
-file: any field present must match; missing fields are wildcards.
-
-The shipped `tolerance.json` uses an empty `"content": {}`, which matches
-every `multi-vector-op` job — so a single 5% threshold applies to all four
-kernels. To apply different thresholds per implementation, add more
-specific entries, e.g.:
-
-```json
-{ "input":     { "type": "multi-vector-op", "content": { "implementation": "reference" } },
-  "tolerance": { "type": "multi-vector-tolerance", "content": { "min_time_regression": 0.10 } } }
-```
-
-`check verify` will reject the file if entries overlap or leave any job
-unmatched.
diff --git a/diskann-benchmark-multi-vector/src/bin.rs b/diskann-benchmark-multi-vector/src/bin.rs
deleted file mode 100644
index d595533e7..000000000
--- a/diskann-benchmark-multi-vector/src/bin.rs
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright (c) Microsoft Corporation.
- * Licensed under the MIT license.
- */
-
-use diskann_benchmark_multi_vector::{register, MultiVectorOp};
-use diskann_benchmark_runner::{output, registry, App, Output};
-
-pub fn main() -> anyhow::Result<()> {
-    // Create the pocket bench application.
-    let app = App::parse();
-    main_inner(&app, &mut output::default())
-}
-
-fn main_inner(app: &App, output: &mut dyn Output) -> anyhow::Result<()> {
-    // Register inputs and benchmarks.
-    let mut inputs = registry::Inputs::new();
-    inputs.register::<MultiVectorOp>()?;
-
-    let mut benchmarks = registry::Benchmarks::new();
-    register(&mut benchmarks);
-
-    // Here we go!
-    app.run(&inputs, &benchmarks, output)
-}
-
-///////////
-// Tests //
-///////////
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    use std::path::{Path, PathBuf};
-
-    use diskann_benchmark_runner::app::{Check, Commands};
-
-    fn run_integration_test(input_file: &Path, output_file: &Path) {
-        let commands = Commands::Run {
-            input_file: input_file.to_str().unwrap().into(),
-            output_file: output_file.to_str().unwrap().into(),
-            dry_run: false,
-            allow_debug: true,
-        };
-
-        let app = App::from_commands(commands);
-
-        let mut output = output::Memory::new();
-        main_inner(&app, &mut output).unwrap();
-        println!(
-            "output = {}",
-            String::from_utf8(output.into_inner()).unwrap()
-        );
-
-        assert!(output_file.exists());
-    }
-
-    fn run_check_test(input_file: &Path, tolerances: &Path) -> String {
-        let commands = Commands::Check(Check::Verify {
-            tolerances: tolerances.to_str().unwrap().into(),
-            input_file: input_file.to_str().unwrap().into(),
-        });
-
-        let app = App::from_commands(commands);
-
-        let mut output = output::Memory::new();
-        main_inner(&app, &mut output).unwrap();
-        String::from_utf8(output.into_inner()).unwrap()
-    }
-
-    #[test]
-    fn integration_test() {
-        let input_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
-            .join("examples")
-            .join("test.json");
-
-        let tempdir = tempfile::tempdir().unwrap();
-        let output_path = tempdir.path().join("output.json");
-
-        run_integration_test(&input_path, &output_path);
-    }
-
-    #[test]
-    fn check_verify() {
-        let input_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
-            .join("examples")
-            .join("test.json");
-        let tolerance_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
-            .join("examples")
-            .join("tolerance.json");
-
-        let stdout = run_check_test(&input_path, &tolerance_path);
-        println!("stdout = {}", stdout);
-    }
-}
diff --git a/diskann-benchmark-multi-vector/src/lib.rs b/diskann-benchmark-multi-vector/src/lib.rs
deleted file mode 100644
index df08d93dd..000000000
--- a/diskann-benchmark-multi-vector/src/lib.rs
+++ /dev/null
@@ -1,938 +0,0 @@
-/*
- * Copyright (c) Microsoft Corporation.
- * Licensed under the MIT license.
- */
-
-//! Multi-vector distance benchmarks with regression detection.
-
-use std::{io::Write, num::NonZeroUsize};
-
-use diskann_quantization::multi_vector::{Chamfer, MatRef, MaxSim, QueryComputer, Standard};
-use diskann_vector::distance::InnerProduct;
-use diskann_vector::{DistanceFunctionMut, PureDistanceFunction};
-use half::f16;
-use rand::{
-    distr::{Distribution, StandardUniform},
-    rngs::StdRng,
-    SeedableRng,
-};
-use serde::{Deserialize, Serialize};
-use thiserror::Error;
-
-use diskann_benchmark_runner::{
-    benchmark::{PassFail, Regression},
-    dispatcher::{Description, DispatchRule, FailureScore, MatchScore},
-    utils::{
-        datatype::{self, DataType},
-        num::{relative_change, NonNegativeFinite},
-        percentiles, MicroSeconds,
-    },
-    Any, Benchmark, CheckDeserialization, Checker, Input,
-};
-
-////////////////
-// Public API //
-////////////////
-
-/// Register all multi-vector benchmarks with the runner's dispatcher.
-pub fn register(dispatcher: &mut diskann_benchmark_runner::registry::Benchmarks) {
-    register_benchmarks_impl(dispatcher)
-}
-
-///////////
-// Utils //
-///////////
-
-#[derive(Debug, Clone, Copy)]
-struct DisplayWrapper<'a, T: ?Sized>(&'a T);
-
-impl<T: ?Sized> std::ops::Deref for DisplayWrapper<'_, T> {
-    type Target = T;
-    fn deref(&self) -> &T {
-        self.0
-    }
-}
-
-////////////
-// Inputs //
-////////////
-
-/// The two distance operations exposed by [`QueryComputer`].
-#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
-#[serde(rename_all = "snake_case")]
-pub enum Operation {
-    Chamfer,
-    MaxSim,
-}
-
-impl std::fmt::Display for Operation {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        let st = match self {
-            Self::Chamfer => "chamfer",
-            Self::MaxSim => "max_sim",
-        };
-        write!(f, "{}", st)
-    }
-}
-
-/// Which implementation tier to benchmark.
-#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
-#[serde(rename_all = "kebab-case")]
-enum Implementation {
-    Optimized,
-    Reference,
-}
-
-impl std::fmt::Display for Implementation {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        let st = match self {
-            Self::Optimized => "optimized",
-            Self::Reference => "reference",
-        };
-        write!(f, "{}", st)
-    }
-}
-
-/// One benchmark configuration: a single (operation, shape) measurement.
-#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
-struct Run {
-    operation: Operation,
-    num_query_vectors: NonZeroUsize,
-    num_doc_vectors: NonZeroUsize,
-    dim: NonZeroUsize,
-    loops_per_measurement: NonZeroUsize,
-    num_measurements: NonZeroUsize,
-}
-
-/// A complete multi-vector benchmark job.
-#[derive(Debug, Serialize, Deserialize)]
-pub struct MultiVectorOp {
-    element_type: DataType,
-    implementation: Implementation,
-    runs: Vec<Run>,
-}
-
-impl CheckDeserialization for MultiVectorOp {
-    fn check_deserialization(&mut self, _checker: &mut Checker) -> Result<(), anyhow::Error> {
-        Ok(())
-    }
-}
-
-macro_rules! write_field {
-    ($f:ident, $field:tt, $($expr:tt)*) => {
-        writeln!($f, "{:>18}: {}", $field, $($expr)*)
-    }
-}
-
-impl MultiVectorOp {
-    fn summarize_fields(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write_field!(f, "element type", self.element_type)?;
-        write_field!(f, "implementation", self.implementation)?;
-        write_field!(f, "number of runs", self.runs.len())?;
-        Ok(())
-    }
-}
-
-impl std::fmt::Display for MultiVectorOp {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        writeln!(f, "Multi-Vector Operation\n")?;
-        write_field!(f, "tag", Self::tag())?;
-        self.summarize_fields(f)
-    }
-}
-
-impl Input for MultiVectorOp {
-    fn tag() -> &'static str {
-        "multi-vector-op"
-    }
-
-    fn try_deserialize(
-        serialized: &serde_json::Value,
-        checker: &mut Checker,
-    ) -> anyhow::Result<Any> {
-        checker.any(Self::deserialize(serialized)?)
-    }
-
-    fn example() -> anyhow::Result<serde_json::Value> {
-        const NUM_QUERY_VECTORS: NonZeroUsize = NonZeroUsize::new(32).unwrap();
-        const NUM_DOC_VECTORS: NonZeroUsize = NonZeroUsize::new(64).unwrap();
-        const DIM: NonZeroUsize = NonZeroUsize::new(128).unwrap();
-        const LOOPS_PER_MEASUREMENT: NonZeroUsize = NonZeroUsize::new(200).unwrap();
-        const NUM_MEASUREMENTS: NonZeroUsize = NonZeroUsize::new(100).unwrap();
-
-        let runs = vec![
-            Run {
-                operation: Operation::Chamfer,
-                num_query_vectors: NUM_QUERY_VECTORS,
-                num_doc_vectors: NUM_DOC_VECTORS,
-                dim: DIM,
-                loops_per_measurement: LOOPS_PER_MEASUREMENT,
-                num_measurements: NUM_MEASUREMENTS,
-            },
-            Run {
-                operation: Operation::MaxSim,
-                num_query_vectors: NUM_QUERY_VECTORS,
-                num_doc_vectors: NUM_DOC_VECTORS,
-                dim: DIM,
-                loops_per_measurement: LOOPS_PER_MEASUREMENT,
-                num_measurements: NUM_MEASUREMENTS,
-            },
-        ];
-
-        Ok(serde_json::to_value(&Self {
-            element_type: DataType::Float32,
-            implementation: Implementation::Optimized,
-            runs,
-        })?)
-    }
-}
-
-//////////////////////
-// Regression Check //
-//////////////////////
-
-/// Tolerance thresholds for multi-vector benchmark regression detection.
-///
-/// Each field specifies the maximum allowed relative increase in the corresponding metric.
-/// For example, a value of `0.05` means a 5% increase is tolerated.
-#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
-struct MultiVectorTolerance {
-    min_time_regression: NonNegativeFinite,
-}
-
-impl CheckDeserialization for MultiVectorTolerance {
-    fn check_deserialization(&mut self, _checker: &mut Checker) -> Result<(), anyhow::Error> {
-        Ok(())
-    }
-}
-
-impl Input for MultiVectorTolerance {
-    fn tag() -> &'static str {
-        "multi-vector-tolerance"
-    }
-
-    fn try_deserialize(
-        serialized: &serde_json::Value,
-        checker: &mut Checker,
-    ) -> anyhow::Result<Any> {
-        checker.any(Self::deserialize(serialized)?)
-    }
-
-    fn example() -> anyhow::Result<serde_json::Value> {
-        const EXAMPLE: NonNegativeFinite = match NonNegativeFinite::new(0.05) {
-            Ok(v) => v,
-            Err(_) => panic!("use a non-negative finite please"),
-        };
-
-        Ok(serde_json::to_value(MultiVectorTolerance {
-            min_time_regression: EXAMPLE,
-        })?)
-    }
-}
-
-/// Per-run comparison result showing before/after percentile differences.
-#[derive(Debug, Serialize)]
-struct Comparison {
-    run: Run,
-    tolerance: MultiVectorTolerance,
-    before_min: f64,
-    after_min: f64,
-}
-
-/// Aggregated result of the regression check across all runs.
-#[derive(Debug, Serialize)]
-struct CheckResult {
-    checks: Vec<Comparison>,
-}
-
-impl std::fmt::Display for CheckResult {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        let header = [
-            "Operation",
-            "Q",
-            "D",
-            "Dim",
-            "Min Before (ns/IP @ Dim)",
-            "Min After (ns/IP @ Dim)",
-            "Change (%)",
-            "Remark",
-        ];
-
-        let mut table = diskann_benchmark_runner::utils::fmt::Table::new(header, self.checks.len());
-
-        for (i, c) in self.checks.iter().enumerate() {
-            let mut row = table.row(i);
-            let change = relative_change(c.before_min, c.after_min);
-
-            row.insert(c.run.operation, 0);
-            row.insert(c.run.num_query_vectors, 1);
-            row.insert(c.run.num_doc_vectors, 2);
-            row.insert(c.run.dim, 3);
-            row.insert(format!("{:.3}", c.before_min), 4);
-            row.insert(format!("{:.3}", c.after_min), 5);
-            match change {
-                Ok(change) => {
-                    row.insert(format!("{:.3} %", change * 100.0), 6);
-                    if change > c.tolerance.min_time_regression.get() {
-                        row.insert("FAIL", 7);
-                    }
-                }
-                Err(err) => {
-                    row.insert("invalid", 6);
-                    row.insert(err, 7);
-                }
-            }
-        }
-
-        table.fmt(f)
-    }
-}
-
-////////////////////////////
-// Benchmark Registration //
-////////////////////////////
-
-fn register_benchmarks_impl(dispatcher: &mut diskann_benchmark_runner::registry::Benchmarks) {
-    macro_rules! register {
-        ($impl:ident, $t:ty, $tag:literal) => {
-            dispatcher.register_regression($tag, Kernel::<$impl, $t>::new());
-        };
-    }
-
-    // Optimized (architecture-dispatched QueryComputer).
-    register!(Optimized, f32, "multi-vector-op-f32-optimized");
-    register!(Optimized, f16, "multi-vector-op-f16-optimized");
-
-    // Reference (Chamfer / MaxSim fallback path).
-    register!(Reference, f32, "multi-vector-op-f32-reference");
-    register!(Reference, f16, "multi-vector-op-f16-reference");
-}
-
-//////////////
-// Dispatch //
-//////////////
-
-/// Dispatch marker for the [`QueryComputer`] implementation.
-#[derive(Debug)]
-struct Optimized;
-
-/// Dispatch marker for the [`Chamfer`] / [`MaxSim`] fallback.
-#[derive(Debug)]
-struct Reference;
-
-/// A multi-vector benchmark.
-struct Kernel<I, T> {
-    _type: std::marker::PhantomData<(I, T)>,
-}
-
-impl<I, T> Kernel<I, T> {
-    fn new() -> Self {
-        Self {
-            _type: std::marker::PhantomData,
-        }
-    }
-}
-
-#[derive(Debug, Error)]
-#[error("this kernel handles a different implementation than {0}")]
-pub(crate) struct ImplementationMismatch(Implementation);
-
-macro_rules! impl_dispatch_rule {
-    ($marker:ident, $variant:ident, $description:literal) => {
-        impl DispatchRule<Implementation> for $marker {
-            type Error = ImplementationMismatch;
-
-            fn try_match(from: &Implementation) -> Result<MatchScore, FailureScore> {
-                if *from == Implementation::$variant {
-                    Ok(MatchScore(0))
-                } else {
-                    Err(FailureScore(1))
-                }
-            }
-
-            fn convert(from: Implementation) -> Result<Self, Self::Error> {
-                if from == Implementation::$variant {
-                    Ok($marker)
-                } else {
-                    Err(ImplementationMismatch(from))
-                }
-            }
-
-            fn description(
-                f: &mut std::fmt::Formatter<'_>,
-                from: Option<&Implementation>,
-            ) -> std::fmt::Result {
-                match from {
-                    None => write!(f, $description),
-                    Some(impl_) => {
-                        if Self::try_match(impl_).is_ok() {
-                            write!(f, "matched {}", impl_)
-                        } else {
-                            write!(f, "expected {}, got {}", Implementation::$variant, impl_)
-                        }
-                    }
-                }
-            }
-        }
-    };
-}
-
-impl_dispatch_rule!(
-    Optimized,
-    Optimized,
-    "QueryComputer (architecture-dispatched)"
-);
-impl_dispatch_rule!(Reference, Reference, "Chamfer / MaxSim fallback");
-
-impl<I, T> Benchmark for Kernel<I, T>
-where
-    datatype::Type<T>: DispatchRule<datatype::DataType>,
-    I: DispatchRule<Implementation, Error = ImplementationMismatch> + 'static,
-    Kernel<I, T>: RunBenchmark<I>,
-    T: 'static,
-{
-    type Input = MultiVectorOp;
-    type Output = Vec<RunResult>;
-
-    fn try_match(&self, from: &MultiVectorOp) -> Result<MatchScore, FailureScore> {
-        let mut failscore: Option<u32> = None;
-        if datatype::Type::<T>::try_match(&from.element_type).is_err() {
-            *failscore.get_or_insert(0) += 10;
-        }
-        if let Err(FailureScore(score)) = I::try_match(&from.implementation) {
-            *failscore.get_or_insert(0) += 2 + score;
-        }
-
-        match failscore {
-            None => Ok(MatchScore(0)),
-            Some(score) => Err(FailureScore(score)),
-        }
-    }
-
-    fn run(
-        &self,
-        input: &MultiVectorOp,
-        _: diskann_benchmark_runner::Checkpoint<'_>,
-        mut output: &mut dyn diskann_benchmark_runner::Output,
-    ) -> anyhow::Result<Self::Output> {
-        // The dispatcher only invokes `run` after `try_match` has already accepted
-        // the input, so a failure here would indicate a dispatcher bug.
-        I::convert(input.implementation).expect("try_match accepted the input");
-        writeln!(output, "{}", input)?;
-        let results = self.run_benchmark(input)?;
-        writeln!(output, "\n\n{}", DisplayWrapper(&*results))?;
-        Ok(results)
-    }
-
-    fn description(
-        &self,
-        f: &mut std::fmt::Formatter<'_>,
-        input: Option<&MultiVectorOp>,
-    ) -> std::fmt::Result {
-        match input {
-            None => {
-                writeln!(
-                    f,
-                    "- Element Type: {}",
-                    Description::<datatype::DataType, datatype::Type<T>>::new()
-                )?;
-                writeln!(
-                    f,
-                    "- Implementation: {}",
-                    Description::<Implementation, I>::new()
-                )?;
-            }
-            Some(input) => {
-                if let Err(err) = datatype::Type::<T>::try_match_verbose(&input.element_type) {
-                    writeln!(f, "\n    - Mismatched element type: {}", err)?;
-                }
-                if let Err(err) = I::try_match_verbose(&input.implementation) {
-                    writeln!(f, "\n    - Mismatched implementation: {}", err)?;
-                }
-            }
-        }
-        Ok(())
-    }
-}
-
-impl<I, T> Regression for Kernel<I, T>
-where
-    datatype::Type<T>: DispatchRule<datatype::DataType>,
-    I: DispatchRule<Implementation, Error = ImplementationMismatch> + 'static,
-    Kernel<I, T>: RunBenchmark<I>,
-    T: 'static,
-{
-    type Tolerances = MultiVectorTolerance;
-    type Pass = CheckResult;
-    type Fail = CheckResult;
-
-    fn check(
-        &self,
-        tolerance: &MultiVectorTolerance,
-        _input: &MultiVectorOp,
-        before: &Vec<RunResult>,
-        after: &Vec<RunResult>,
-    ) -> anyhow::Result<PassFail<CheckResult, CheckResult>> {
-        anyhow::ensure!(
-            before.len() == after.len(),
-            "before has {} runs but after has {}",
-            before.len(),
-            after.len(),
-        );
-
-        let mut passed = true;
-        let checks: Vec<Comparison> = std::iter::zip(before.iter(), after.iter())
-            .enumerate()
-            .map(|(i, (b, a))| {
-                anyhow::ensure!(b.run == a.run, "run {i} mismatched");
-
-                let computations_per_latency = b.computations_per_latency() as f64;
-
-                let before_min = b.percentiles.minimum.as_f64() * 1000.0 / computations_per_latency;
-                let after_min = a.percentiles.minimum.as_f64() * 1000.0 / computations_per_latency;
-
-                let comparison = Comparison {
-                    run: b.run.clone(),
-                    tolerance: *tolerance,
-                    before_min,
-                    after_min,
-                };
-
-                match relative_change(before_min, after_min) {
-                    Ok(change) => {
-                        if change > tolerance.min_time_regression.get() {
-                            passed = false;
-                        }
-                    }
-                    Err(_) => passed = false,
-                };
-
-                Ok(comparison)
-            })
-            .collect::<anyhow::Result<Vec<Comparison>>>()?;
-
-        let check = CheckResult { checks };
-
-        if passed {
-            Ok(PassFail::Pass(check))
-        } else {
-            Ok(PassFail::Fail(check))
-        }
-    }
-}
-
-///////////////
-// Benchmark //
-///////////////
-
-trait RunBenchmark<I> {
-    fn run_benchmark(&self, input: &MultiVectorOp) -> Result<Vec<RunResult>, anyhow::Error>;
-}
-
-#[derive(Debug, Serialize, Deserialize)]
-struct RunResult {
-    /// The configuration for this run.
-    run: Run,
-    /// Per-measurement latencies (over `loops_per_measurement` calls).
-    latencies: Vec<MicroSeconds>,
-    /// Latency percentiles.
-    percentiles: percentiles::Percentiles<MicroSeconds>,
-}
-
-impl RunResult {
-    fn computations_per_latency(&self) -> usize {
-        self.run.num_query_vectors.get()
-            * self.run.num_doc_vectors.get()
-            * self.run.loops_per_measurement.get()
-    }
-}
-
-impl std::fmt::Display for DisplayWrapper<'_, [RunResult]> {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        if self.is_empty() {
-            return Ok(());
-        }
-
-        // ns/IP is normalized as `min_latency_us * 1000 / (Q * D * loops)` and is
-        // approximately linear in `dim`. Compare across rows with the same `Dim`;
-        // divide further by `Dim` to recover ns per scalar multiply.
-        writeln!(
-            f,
-            "ns/IP = time per (query, doc) inner-product call (~ linear in Dim)"
-        )?;
-
-        let header = [
-            "Operation",
-            "Q",
-            "D",
-            "Dim",
-            "Min Time (ns/IP @ Dim)",
-            "Mean Time (ns/IP @ Dim)",
-            "Loops",
-            "Measurements",
-        ];
-
-        let mut table = diskann_benchmark_runner::utils::fmt::Table::new(header, self.len());
-
-        self.iter().enumerate().for_each(|(row, r)| {
-            let mut row = table.row(row);
-
-            let min_latency = r
-                .latencies
-                .iter()
-                .min()
-                .copied()
-                .unwrap_or(MicroSeconds::new(u64::MAX));
-            let mean_latency = r.percentiles.mean;
-
-            let computations_per_latency = r.computations_per_latency() as f64;
-
-            // Convert time from micro-seconds to nano-seconds per inner-product call
-            // (one (query, doc) pair, ~ linear in dim).
-            let min_time = min_latency.as_f64() / computations_per_latency * 1000.0;
-            let mean_time = mean_latency / computations_per_latency * 1000.0;
-
-            row.insert(r.run.operation, 0);
-            row.insert(r.run.num_query_vectors, 1);
-            row.insert(r.run.num_doc_vectors, 2);
-            row.insert(r.run.dim, 3);
-            row.insert(format!("{:.3}", min_time), 4);
-            row.insert(format!("{:.3}", mean_time), 5);
-            row.insert(r.run.loops_per_measurement, 6);
-            row.insert(r.run.num_measurements, 7);
-        });
-
-        table.fmt(f)
-    }
-}
-
-fn run_loops<F>(run: &Run, mut body: F) -> RunResult
-where
-    F: FnMut(),
-{
-    let mut latencies = Vec::with_capacity(run.num_measurements.get());
-
-    for _ in 0..run.num_measurements.get() {
-        let start = std::time::Instant::now();
-        for _ in 0..run.loops_per_measurement.get() {
-            body();
-        }
-        latencies.push(start.elapsed().into());
-    }
-
-    let percentiles = percentiles::compute_percentiles(&mut latencies).unwrap();
-    RunResult {
-        run: run.clone(),
-        latencies,
-        percentiles,
-    }
-}
-
-///////////////////
-// Data fixtures //
-///////////////////
-
-const RNG_SEED: u64 = 0x12345;
-
-struct Data<T> {
-    query_data: Box<[T]>,
-    doc_data: Box<[T]>,
-}
-
-impl<T: Copy> Data<T>
-where
-    StandardUniform: Distribution<T>,
-{
-    fn new(run: &Run) -> Self {
-        let mut rng = StdRng::seed_from_u64(RNG_SEED);
-        let query_data: Box<[T]> = (0..run.num_query_vectors.get() * run.dim.get())
-            .map(|_| StandardUniform.sample(&mut rng))
-            .collect();
-        let doc_data: Box<[T]> = (0..run.num_doc_vectors.get() * run.dim.get())
-            .map(|_| StandardUniform.sample(&mut rng))
-            .collect();
-
-        Self {
-            query_data,
-            doc_data,
-        }
-    }
-
-    fn query(&self, run: &Run) -> MatRef<'_, Standard<T>> {
-        MatRef::new(
-            Standard::new(run.num_query_vectors.get(), run.dim.get()).unwrap(),
-            &self.query_data,
-        )
-        .unwrap()
-    }
-
-    fn doc(&self, run: &Run) -> MatRef<'_, Standard<T>> {
-        MatRef::new(
-            Standard::new(run.num_doc_vectors.get(), run.dim.get()).unwrap(),
-            &self.doc_data,
-        )
-        .unwrap()
-    }
-}
-
-/////////////////////
-// Implementations //
-/////////////////////
-
-fn run_optimized<T>(input: &MultiVectorOp) -> anyhow::Result<Vec<RunResult>>
-where
-    T: Copy,
-    StandardUniform: Distribution<T>,
-    QueryComputer<T>: NewFromMatRef<T>,
-{
-    let mut results = Vec::with_capacity(input.runs.len());
-    for run in input.runs.iter() {
-        let data = Data::<T>::new(run);
-        // `QueryComputer` performs query-side precomputation that is intentionally
-        // amortized across many `chamfer` / `max_sim` calls; construct it once per
-        // shape, outside the timed loop.
-        let computer = <QueryComputer<T> as NewFromMatRef<T>>::new_from(data.query(run));
-        let doc = data.doc(run);
-
-        let result = match run.operation {
-            Operation::Chamfer => run_loops(run, || {
-                let v = computer.chamfer(doc);
-                std::hint::black_box(v);
-            }),
-            Operation::MaxSim => {
-                let mut scores = vec![0.0f32; run.num_query_vectors.get()];
-                run_loops(run, || {
-                    computer.max_sim(doc, &mut scores);
-                    std::hint::black_box(&mut scores);
-                })
-            }
-        };
-        results.push(result);
-    }
-    Ok(results)
-}
-
-/// Drive the [`Chamfer`] / [`MaxSim`] fallback path.
-fn run_reference<T>(input: &MultiVectorOp) -> anyhow::Result<Vec<RunResult>>
-where
-    T: Copy,
-    StandardUniform: Distribution<T>,
-    InnerProduct: for<'a, 'b> PureDistanceFunction<&'a [T], &'b [T], f32>,
-{
-    let mut results = Vec::with_capacity(input.runs.len());
-    for run in input.runs.iter() {
-        let data = Data::<T>::new(run);
-        let doc = data.doc(run);
-        // Hoist out of the timed loop to mirror the optimized path's
-        // per-shape precomputation.
-        let query: diskann_quantization::multi_vector::distance::QueryMatRef<'_, _> =
-            data.query(run).into();
-
-        let result = match run.operation {
-            Operation::Chamfer => run_loops(run, || {
-                let v = Chamfer::evaluate(query, doc);
-                std::hint::black_box(v);
-            }),
-            Operation::MaxSim => {
-                let mut scores = vec![0.0f32; run.num_query_vectors.get()];
-                let mut max_sim = MaxSim::new(&mut scores).unwrap();
-                run_loops(run, || {
-                    let _ = max_sim.evaluate(query, doc);
-                    std::hint::black_box(max_sim.scores_mut());
-                })
-            }
-        };
-        results.push(result);
-    }
-    Ok(results)
-}
-
-/// Element-type-erasing constructor for [`QueryComputer`].
-trait NewFromMatRef<T: Copy> {
-    fn new_from(query: MatRef<'_, Standard<T>>) -> QueryComputer<T>;
-}
-
-macro_rules! impl_kernel_for {
-    ($t:ty) => {
-        impl NewFromMatRef<$t> for QueryComputer<$t> {
-            fn new_from(query: MatRef<'_, Standard<$t>>) -> QueryComputer<$t> {
-                QueryComputer::<$t>::new(query)
-            }
-        }
-
-        impl RunBenchmark<Optimized> for Kernel<Optimized, $t> {
-            fn run_benchmark(
-                &self,
-                input: &MultiVectorOp,
-            ) -> Result<Vec<RunResult>, anyhow::Error> {
-                run_optimized::<$t>(input)
-            }
-        }
-
-        impl RunBenchmark<Reference> for Kernel<Reference, $t> {
-            fn run_benchmark(
-                &self,
-                input: &MultiVectorOp,
-            ) -> Result<Vec<RunResult>, anyhow::Error> {
-                run_reference::<$t>(input)
-            }
-        }
-    };
-}
-
-impl_kernel_for!(f32);
-impl_kernel_for!(f16);
-
-///////////
-// Tests //
-///////////
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    use diskann_benchmark_runner::{
-        benchmark::{PassFail, Regression},
-        utils::percentiles::compute_percentiles,
-    };
-
-    fn tiny_run(operation: Operation) -> Run {
-        Run {
-            operation,
-            num_query_vectors: NonZeroUsize::new(2).unwrap(),
-            num_doc_vectors: NonZeroUsize::new(2).unwrap(),
-            dim: NonZeroUsize::new(4).unwrap(),
-            loops_per_measurement: NonZeroUsize::new(1).unwrap(),
-            num_measurements: NonZeroUsize::new(1).unwrap(),
-        }
-    }
-
-    fn tiny_op() -> MultiVectorOp {
-        MultiVectorOp {
-            element_type: DataType::Float32,
-            implementation: Implementation::Optimized,
-            runs: vec![tiny_run(Operation::Chamfer)],
-        }
-    }
-
-    fn tiny_result(operation: Operation, minimum: u64) -> RunResult {
-        let run = tiny_run(operation);
-        let minimum = MicroSeconds::new(minimum);
-        let mut latencies = vec![minimum];
-        let percentiles = compute_percentiles(&mut latencies).unwrap();
-        RunResult {
-            run,
-            latencies,
-            percentiles,
-        }
-    }
-
-    fn tolerance(limit: f64) -> MultiVectorTolerance {
-        MultiVectorTolerance {
-            min_time_regression: NonNegativeFinite::new(limit).unwrap(),
-        }
-    }
-
-    #[test]
-    fn check_rejects_mismatched_runs() {
-        let kernel = Kernel::<Optimized, f32>::new();
-
-        let err = kernel
-            .check(
-                &tolerance(0.0),
-                &tiny_op(),
-                &vec![tiny_result(Operation::Chamfer, 100)],
-                &vec![tiny_result(Operation::MaxSim, 100)],
-            )
-            .unwrap_err();
-
-        assert_eq!(err.to_string(), "run 0 mismatched");
-    }
-
-    #[test]
-    fn check_allows_negative_relative_change() {
-        let kernel = Kernel::<Optimized, f32>::new();
-
-        let result = kernel
-            .check(
-                &tolerance(0.0),
-                &tiny_op(),
-                &vec![tiny_result(Operation::Chamfer, 100)],
-                &vec![tiny_result(Operation::Chamfer, 95)],
-            )
-            .unwrap();
-
-        assert!(matches!(result, PassFail::Pass(_)));
-    }
-
-    #[test]
-    fn check_passes_on_tolerance_boundary() {
-        let kernel = Kernel::<Optimized, f32>::new();
-
-        let result = kernel
-            .check(
-                &tolerance(0.05),
-                &tiny_op(),
-                &vec![tiny_result(Operation::Chamfer, 100)],
-                &vec![tiny_result(Operation::Chamfer, 105)],
-            )
-            .unwrap();
-
-        assert!(matches!(result, PassFail::Pass(_)));
-    }
-
-    #[test]
-    fn check_fails_above_tolerance_boundary() {
-        let kernel = Kernel::<Optimized, f32>::new();
-
-        let result = kernel
-            .check(
-                &tolerance(0.05),
-                &tiny_op(),
-                &vec![tiny_result(Operation::Chamfer, 100)],
-                &vec![tiny_result(Operation::Chamfer, 106)],
-            )
-            .unwrap();
-
-        assert!(matches!(result, PassFail::Fail(_)));
-    }
-
-    #[test]
-    fn check_result_display_includes_failure_details() {
-        let check = CheckResult {
-            checks: vec![Comparison {
-                run: tiny_run(Operation::Chamfer),
-                tolerance: tolerance(0.05),
-                before_min: 100.0,
-                after_min: 106.0,
-            }],
-        };
-
-        let rendered = check.to_string();
-        assert!(rendered.contains("Operation"), "rendered = {rendered}");
-        assert!(rendered.contains("chamfer"), "rendered = {rendered}");
-        assert!(rendered.contains("100.000"), "rendered = {rendered}");
-        assert!(rendered.contains("106.000"), "rendered = {rendered}");
-        assert!(rendered.contains("6.000 %"), "rendered = {rendered}");
-        assert!(rendered.contains("FAIL"), "rendered = {rendered}");
-    }
-
-    /// A "before" value of 0 means the measurement was too fast to obtain a
-    /// reliable signal, so we *could* be letting a regression through. We
-    /// require at least a non-zero value.
-    #[test]
-    fn zero_values_rejected() {
-        let kernel = Kernel::<Optimized, f32>::new();
-
-        let result = kernel
-            .check(
-                &tolerance(0.05),
-                &tiny_op(),
-                &vec![tiny_result(Operation::Chamfer, 0)],
-                &vec![tiny_result(Operation::Chamfer, 0)],
-            )
-            .unwrap();
-
-        assert!(matches!(result, PassFail::Fail(_)));
-    }
-}
diff --git a/diskann-benchmark/Cargo.toml b/diskann-benchmark/Cargo.toml
index bebaf4b8e..efd058ffb 100644
--- a/diskann-benchmark/Cargo.toml
+++ b/diskann-benchmark/Cargo.toml
@@ -63,6 +63,9 @@ scalar-quantization = []
 # Enable minmax-quantization based algorithms
 minmax-quantization = []
 
+# Enable multi-vector distance benchmarks (Chamfer / MaxSim)
+multi-vector = []
+
 # Enable Disk Index benchmarks
 disk-index = [
     "diskann-disk/perf_test",
diff --git a/diskann-benchmark-multi-vector/examples/test.json b/diskann-benchmark/example/multi-vector-test.json
similarity index 100%
rename from diskann-benchmark-multi-vector/examples/test.json
rename to diskann-benchmark/example/multi-vector-test.json
diff --git a/diskann-benchmark-multi-vector/examples/multi-vector.json b/diskann-benchmark/example/multi-vector.json
similarity index 100%
rename from diskann-benchmark-multi-vector/examples/multi-vector.json
rename to diskann-benchmark/example/multi-vector.json
diff --git a/diskann-benchmark-multi-vector/examples/tolerance.json b/diskann-benchmark/perf_test_inputs/multi-vector-tolerance.json
similarity index 100%
rename from diskann-benchmark-multi-vector/examples/tolerance.json
rename to diskann-benchmark/perf_test_inputs/multi-vector-tolerance.json
diff --git a/diskann-benchmark/src/backend/mod.rs b/diskann-benchmark/src/backend/mod.rs
index 24fe91d7e..0d1c61345 100644
--- a/diskann-benchmark/src/backend/mod.rs
+++ b/diskann-benchmark/src/backend/mod.rs
@@ -7,10 +7,12 @@ mod disk_index;
 mod exhaustive;
 mod filters;
 mod index;
+mod multi_vector;
 
 pub(crate) fn register_benchmarks(registry: &mut diskann_benchmark_runner::registry::Benchmarks) {
     exhaustive::register_benchmarks(registry);
     disk_index::register_benchmarks(registry);
     index::register_benchmarks(registry);
     filters::register_benchmarks(registry);
+    multi_vector::register_benchmarks(registry);
 }
diff --git a/diskann-benchmark/src/backend/multi_vector.rs b/diskann-benchmark/src/backend/multi_vector.rs
new file mode 100644
index 000000000..cfdb77f33
--- /dev/null
+++ b/diskann-benchmark/src/backend/multi_vector.rs
@@ -0,0 +1,806 @@
+/*
+ * Copyright (c) Microsoft Corporation.
+ * Licensed under the MIT license.
+ */
+
+//! Multi-vector distance benchmarks (Chamfer / MaxSim) with regression detection.
+
+use diskann_benchmark_runner::registry::Benchmarks;
+
+// Create a stub-module if the "multi-vector" feature is disabled.
+crate::utils::stub_impl!("multi-vector", inputs::multi_vector::MultiVectorOp);
+
+pub(super) fn register_benchmarks(benchmarks: &mut Benchmarks) {
+    #[cfg(feature = "multi-vector")]
+    {
+        use half::f16;
+
+        // Optimized (architecture-dispatched QueryComputer).
+        benchmarks.register_regression(
+            "multi-vector-op-f32-optimized",
+            imp::Kernel::<imp::Optimized, f32>::new(),
+        );
+        benchmarks.register_regression(
+            "multi-vector-op-f16-optimized",
+            imp::Kernel::<imp::Optimized, f16>::new(),
+        );
+
+        // Reference (Chamfer / MaxSim fallback path).
+        benchmarks.register_regression(
+            "multi-vector-op-f32-reference",
+            imp::Kernel::<imp::Reference, f32>::new(),
+        );
+        benchmarks.register_regression(
+            "multi-vector-op-f16-reference",
+            imp::Kernel::<imp::Reference, f16>::new(),
+        );
+    }
+
+    // Stub implementation
+    #[cfg(not(feature = "multi-vector"))]
+    imp::register("multi-vector-op", benchmarks);
+}
+
+#[cfg(feature = "multi-vector")]
+mod imp {
+    use std::io::Write;
+
+    use diskann_benchmark_runner::{
+        benchmark::{PassFail, Regression},
+        dispatcher::{DispatchRule, FailureScore, MatchScore},
+        utils::{datatype, num::relative_change, percentiles, MicroSeconds},
+        Benchmark,
+    };
+    use diskann_quantization::multi_vector::{
+        Chamfer, Init, Mat, MatRef, MaxSim, QueryComputer, Standard,
+    };
+    use diskann_vector::distance::InnerProduct;
+    use diskann_vector::{DistanceFunctionMut, PureDistanceFunction};
+    use half::f16;
+    use rand::{
+        distr::{Distribution, StandardUniform},
+        rngs::StdRng,
+        SeedableRng,
+    };
+    use serde::{Deserialize, Serialize};
+
+    use crate::inputs::multi_vector::{
+        Implementation, MultiVectorOp, MultiVectorTolerance, Operation, Run,
+    };
+
+    ///////////
+    // Utils //
+    ///////////
+
+    #[derive(Debug, Clone, Copy)]
+    pub(super) struct DisplayWrapper<'a, T: ?Sized>(pub(super) &'a T);
+
+    impl<T: ?Sized> std::ops::Deref for DisplayWrapper<'_, T> {
+        type Target = T;
+        fn deref(&self) -> &T {
+            self.0
+        }
+    }
+
+    //////////////
+    // Dispatch //
+    //////////////
+
+    /// Dispatch marker for the [`QueryComputer`] implementation.
+    #[derive(Debug)]
+    pub(super) struct Optimized;
+
+    /// Dispatch marker for the [`Chamfer`] / [`MaxSim`] fallback.
+    #[derive(Debug)]
+    pub(super) struct Reference;
+
+    /// A multi-vector benchmark for implementation marker `I` and element type `T`.
+    pub(super) struct Kernel<I, T> {
+        _type: std::marker::PhantomData<(I, T)>,
+    }
+
+    impl<I, T> Kernel<I, T> {
+        pub(super) fn new() -> Self {
+            Self {
+                _type: std::marker::PhantomData,
+            }
+        }
+    }
+
+    /// Pairs the standard `TryFrom<Implementation>` conversion with the static
+    /// description info needed for friendly diagnostics in `Benchmark::description`.
+    pub(super) trait ImplementationMatcher:
+        TryFrom<Implementation, Error = FailureScore> + 'static
+    {
+        /// Human-readable description of which implementation this marker handles.
+        const DESCRIPTION: &'static str;
+        /// The implementation variant this marker expects (for mismatch diagnostics).
+        const EXPECTED: Implementation;
+    }
+
+    impl TryFrom<Implementation> for Optimized {
+        type Error = FailureScore;
+        fn try_from(i: Implementation) -> Result<Self, Self::Error> {
+            match i {
+                Implementation::Optimized => Ok(Self),
+                _ => Err(FailureScore(1)),
+            }
+        }
+    }
+
+    impl ImplementationMatcher for Optimized {
+        const DESCRIPTION: &'static str = "QueryComputer (architecture-dispatched)";
+        const EXPECTED: Implementation = Implementation::Optimized;
+    }
+
+    impl TryFrom<Implementation> for Reference {
+        type Error = FailureScore;
+        fn try_from(i: Implementation) -> Result<Self, Self::Error> {
+            match i {
+                Implementation::Reference => Ok(Self),
+                _ => Err(FailureScore(1)),
+            }
+        }
+    }
+
+    impl ImplementationMatcher for Reference {
+        const DESCRIPTION: &'static str = "Chamfer / MaxSim fallback";
+        const EXPECTED: Implementation = Implementation::Reference;
+    }
+
+    impl<I, T> Benchmark for Kernel<I, T>
+    where
+        datatype::Type<T>: DispatchRule<datatype::DataType>,
+        I: ImplementationMatcher,
+        Kernel<I, T>: RunBenchmark<I>,
+        T: 'static,
+    {
+        type Input = MultiVectorOp;
+        type Output = Vec<RunResult>;
+
+        fn try_match(&self, from: &MultiVectorOp) -> Result<MatchScore, FailureScore> {
+            let mut failscore: Option<u32> = None;
+            if datatype::Type::<T>::try_match(&from.element_type).is_err() {
+                *failscore.get_or_insert(0) += 10;
+            }
+            if let Err(FailureScore(score)) = I::try_from(from.implementation) {
+                *failscore.get_or_insert(0) += 2 + score;
+            }
+
+            match failscore {
+                None => Ok(MatchScore(0)),
+                Some(score) => Err(FailureScore(score)),
+            }
+        }
+
+        fn run(
+            &self,
+            input: &MultiVectorOp,
+            _: diskann_benchmark_runner::Checkpoint<'_>,
+            mut output: &mut dyn diskann_benchmark_runner::Output,
+        ) -> anyhow::Result<Self::Output> {
+            // The dispatcher only invokes `run` after `try_match` has already accepted
+            // the input, so a failure here would indicate a dispatcher bug.
+            I::try_from(input.implementation).expect("try_match accepted the input");
+            writeln!(output, "{}", input)?;
+            let results = self.run_benchmark(input)?;
+            writeln!(output, "\n\n{}", DisplayWrapper(&*results))?;
+            Ok(results)
+        }
+
+        fn description(
+            &self,
+            f: &mut std::fmt::Formatter<'_>,
+            input: Option<&MultiVectorOp>,
+        ) -> std::fmt::Result {
+            match input {
+                None => {
+                    writeln!(
+                        f,
+                        "- Element Type: {}",
+                        diskann_benchmark_runner::dispatcher::Description::<
+                            datatype::DataType,
+                            datatype::Type<T>,
+                        >::new()
+                    )?;
+                    writeln!(f, "- Implementation: {}", I::DESCRIPTION)?;
+                }
+                Some(input) => {
+                    if let Err(err) = datatype::Type::<T>::try_match_verbose(&input.element_type) {
+                        writeln!(f, "\n    - Mismatched element type: {}", err)?;
+                    }
+                    if I::try_from(input.implementation).is_err() {
+                        writeln!(
+                            f,
+                            "\n    - Mismatched implementation: expected {}, got {}",
+                            I::EXPECTED,
+                            input.implementation
+                        )?;
+                    }
+                }
+            }
+            Ok(())
+        }
+    }
+
+    impl<I, T> Regression for Kernel<I, T>
+    where
+        datatype::Type<T>: DispatchRule<datatype::DataType>,
+        I: ImplementationMatcher,
+        Kernel<I, T>: RunBenchmark<I>,
+        T: 'static,
+    {
+        type Tolerances = MultiVectorTolerance;
+        type Pass = CheckResult;
+        type Fail = CheckResult;
+
+        fn check(
+            &self,
+            tolerance: &MultiVectorTolerance,
+            _input: &MultiVectorOp,
+            before: &Vec<RunResult>,
+            after: &Vec<RunResult>,
+        ) -> anyhow::Result<PassFail<CheckResult, CheckResult>> {
+            anyhow::ensure!(
+                before.len() == after.len(),
+                "before has {} runs but after has {}",
+                before.len(),
+                after.len(),
+            );
+
+            let mut passed = true;
+            let checks: Vec<Comparison> = std::iter::zip(before.iter(), after.iter())
+                .enumerate()
+                .map(|(i, (b, a))| {
+                    anyhow::ensure!(b.run == a.run, "run {i} mismatched");
+
+                    let computations_per_latency = b.computations_per_latency() as f64;
+
+                    let before_min =
+                        b.percentiles.minimum.as_f64() * 1000.0 / computations_per_latency;
+                    let after_min =
+                        a.percentiles.minimum.as_f64() * 1000.0 / computations_per_latency;
+
+                    let comparison = Comparison {
+                        run: b.run.clone(),
+                        tolerance: *tolerance,
+                        before_min,
+                        after_min,
+                    };
+
+                    match relative_change(before_min, after_min) {
+                        Ok(change) => {
+                            if change > tolerance.min_time_regression.get() {
+                                passed = false;
+                            }
+                        }
+                        Err(_) => passed = false,
+                    };
+
+                    Ok(comparison)
+                })
+                .collect::<anyhow::Result<Vec<Comparison>>>()?;
+
+            let check = CheckResult { checks };
+
+            if passed {
+                Ok(PassFail::Pass(check))
+            } else {
+                Ok(PassFail::Fail(check))
+            }
+        }
+    }
+
+    //////////////////////
+    // Regression Check //
+    //////////////////////
+
+    /// Per-run comparison of the normalized minimum latency (ns/IP) before and after.
+    #[derive(Debug, Serialize)]
+    pub(super) struct Comparison {
+        run: Run,
+        tolerance: MultiVectorTolerance,
+        before_min: f64,
+        after_min: f64,
+    }
+
+    /// Aggregated result of the regression check across all runs.
+    #[derive(Debug, Serialize)]
+    pub(super) struct CheckResult {
+        checks: Vec<Comparison>,
+    }
+
+    impl std::fmt::Display for CheckResult {
+        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+            let header = [
+                "Operation",
+                "Q",
+                "D",
+                "Dim",
+                "Min Before (ns/IP @ Dim)",
+                "Min After (ns/IP @ Dim)",
+                "Change (%)",
+                "Remark",
+            ];
+
+            let mut table =
+                diskann_benchmark_runner::utils::fmt::Table::new(header, self.checks.len());
+
+            for (i, c) in self.checks.iter().enumerate() {
+                let mut row = table.row(i);
+                let change = relative_change(c.before_min, c.after_min);
+
+                row.insert(c.run.operation, 0);
+                row.insert(c.run.num_query_vectors, 1);
+                row.insert(c.run.num_doc_vectors, 2);
+                row.insert(c.run.dim, 3);
+                row.insert(format!("{:.3}", c.before_min), 4);
+                row.insert(format!("{:.3}", c.after_min), 5);
+                match change {
+                    Ok(change) => {
+                        row.insert(format!("{:.3} %", change * 100.0), 6);
+                        if change > c.tolerance.min_time_regression.get() {
+                            row.insert("FAIL", 7);
+                        }
+                    }
+                    Err(err) => {
+                        row.insert("invalid", 6);
+                        row.insert(err, 7);
+                    }
+                }
+            }
+
+            table.fmt(f)
+        }
+    }
+
+    ///////////////
+    // Benchmark //
+    ///////////////
+
+    pub(super) trait RunBenchmark<I> {
+        fn run_benchmark(&self, input: &MultiVectorOp) -> Result<Vec<RunResult>, anyhow::Error>;
+    }
+
+    #[derive(Debug, Serialize, Deserialize)]
+    pub(super) struct RunResult {
+        /// The configuration for this run.
+        run: Run,
+        /// Per-measurement latencies (over `loops_per_measurement` calls).
+        latencies: Vec<MicroSeconds>,
+        /// Latency percentiles.
+        percentiles: percentiles::Percentiles<MicroSeconds>,
+    }
+
+    impl RunResult {
+        fn computations_per_latency(&self) -> usize {
+            self.run.num_query_vectors.get()
+                * self.run.num_doc_vectors.get()
+                * self.run.loops_per_measurement.get()
+        }
+    }
+
+    impl std::fmt::Display for DisplayWrapper<'_, [RunResult]> {
+        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+            if self.is_empty() {
+                return Ok(());
+            }
+
+            // ns/IP is normalized as `min_latency_us * 1000 / (Q * D * loops)` and is
+            // approximately linear in `dim`. Compare across rows with the same `Dim`;
+            // divide further by `Dim` to recover ns per scalar multiply.
+            writeln!(
+                f,
+                "ns/IP = time per (query, doc) inner-product call (~ linear in Dim)"
+            )?;
+
+            let header = [
+                "Operation",
+                "Q",
+                "D",
+                "Dim",
+                "Min Time (ns/IP @ Dim)",
+                "Mean Time (ns/IP @ Dim)",
+                "Loops",
+                "Measurements",
+            ];
+
+            let mut table = diskann_benchmark_runner::utils::fmt::Table::new(header, self.len());
+
+            self.iter().enumerate().for_each(|(row, r)| {
+                let mut row = table.row(row);
+
+                let min_latency = r
+                    .latencies
+                    .iter()
+                    .min()
+                    .copied()
+                    .unwrap_or(MicroSeconds::new(u64::MAX));
+                let mean_latency = r.percentiles.mean;
+
+                let computations_per_latency = r.computations_per_latency() as f64;
+
+                // Convert time from micro-seconds to nano-seconds per inner-product call
+                // (one (query, doc) pair, ~ linear in dim).
+                let min_time = min_latency.as_f64() / computations_per_latency * 1000.0;
+                let mean_time = mean_latency / computations_per_latency * 1000.0;
+
+                row.insert(r.run.operation, 0);
+                row.insert(r.run.num_query_vectors, 1);
+                row.insert(r.run.num_doc_vectors, 2);
+                row.insert(r.run.dim, 3);
+                row.insert(format!("{:.3}", min_time), 4);
+                row.insert(format!("{:.3}", mean_time), 5);
+                row.insert(r.run.loops_per_measurement, 6);
+                row.insert(r.run.num_measurements, 7);
+            });
+
+            table.fmt(f)
+        }
+    }
+
+    fn run_loops<F>(run: &Run, mut body: F) -> RunResult
+    where
+        F: FnMut(),
+    {
+        let mut latencies = Vec::with_capacity(run.num_measurements.get());
+
+        for _ in 0..run.num_measurements.get() {
+            let start = std::time::Instant::now();
+            for _ in 0..run.loops_per_measurement.get() {
+                body();
+            }
+            latencies.push(start.elapsed().into());
+        }
+
+        let percentiles = percentiles::compute_percentiles(&mut latencies).unwrap();
+        RunResult {
+            run: run.clone(),
+            latencies,
+            percentiles,
+        }
+    }
+
+    ///////////////////
+    // Data fixtures //
+    ///////////////////
+
+    const RNG_SEED: u64 = 0x12345;
+
+    struct Data<T: Copy> {
+        queries: Mat<Standard<T>>,
+        docs: Mat<Standard<T>>,
+    }
+
+    impl<T: Copy> Data<T>
+    where
+        StandardUniform: Distribution<T>,
+    {
+        fn new(run: &Run) -> Self {
+            let mut rng = StdRng::seed_from_u64(RNG_SEED);
+            let queries = Mat::new(
+                Standard::new(run.num_query_vectors.get(), run.dim.get()).unwrap(),
+                Init(|| StandardUniform.sample(&mut rng)),
+            )
+            .unwrap();
+            let docs = Mat::new(
+                Standard::new(run.num_doc_vectors.get(), run.dim.get()).unwrap(),
+                Init(|| StandardUniform.sample(&mut rng)),
+            )
+            .unwrap();
+            Self { queries, docs }
+        }
+    }
+
+    //////////////////////
+    // Distance kernels //
+    //////////////////////
+
+    /// Object-safe abstraction over a per-shape distance executor.
+    ///
+    /// The two implementations ([`OptimizedDistance`] and [`ReferenceDistance`]) share the
+    /// same hot-loop nest in [`run_with_distance`]; dispatching through `&dyn Distance<T>`
+    /// keeps `run_loops` from being monomorphised over the implementation axis.
+    trait Distance<T: Copy> {
+        fn chamfer(&self, doc: MatRef<'_, Standard<T>>) -> f32;
+        fn max_sim(&self, doc: MatRef<'_, Standard<T>>, scores: &mut [f32]);
+    }
+
+    /// Distance executor that drives [`QueryComputer`] (architecture-dispatched SIMD).
+    struct OptimizedDistance<T: Copy>(QueryComputer<T>);
+
+    impl<T: Copy> Distance<T> for OptimizedDistance<T> {
+        fn chamfer(&self, doc: MatRef<'_, Standard<T>>) -> f32 {
+            self.0.chamfer(doc)
+        }
+        fn max_sim(&self, doc: MatRef<'_, Standard<T>>, scores: &mut [f32]) {
+            self.0.max_sim(doc, scores);
+        }
+    }
+
+    /// Distance executor that drives the [`Chamfer`] / [`MaxSim`] fallback path.
+    struct ReferenceDistance<'a, T: Copy>(
+        diskann_quantization::multi_vector::distance::QueryMatRef<'a, Standard<T>>,
+    );
+
+    impl<T: Copy> Distance<T> for ReferenceDistance<'_, T>
+    where
+        InnerProduct: for<'q, 'd> PureDistanceFunction<&'q [T], &'d [T], f32>,
+    {
+        fn chamfer(&self, doc: MatRef<'_, Standard<T>>) -> f32 {
+            Chamfer::evaluate(self.0, doc)
+        }
+        fn max_sim(&self, doc: MatRef<'_, Standard<T>>, scores: &mut [f32]) {
+            // `MaxSim::new` is a non-empty check + pointer wrap, so constructing it per
+            // iteration is free — no need to hoist it out of the loop.
+            let mut max_sim = MaxSim::new(scores).unwrap();
+            let _ = max_sim.evaluate(self.0, doc);
+        }
+    }
+
+    /////////////////////
+    // Implementations //
+    /////////////////////
+
+    /// Shared loop nest. The trait-object dispatch happens once per timed call, i.e. once per
+    /// invocation of the closure handed to `run_loops`; the work inside each `chamfer` /
+    /// `max_sim` call is O(Q*D*dim), so the vtable hop is in the noise.
+    fn run_with_distance<T: Copy>(
+        run: &Run,
+        doc: MatRef<'_, Standard<T>>,
+        dist: &dyn Distance<T>,
+    ) -> RunResult {
+        match run.operation {
+            Operation::Chamfer => run_loops(run, || {
+                let v = dist.chamfer(doc);
+                std::hint::black_box(v);
+            }),
+            Operation::MaxSim => {
+                let mut scores = vec![0.0f32; run.num_query_vectors.get()];
+                run_loops(run, || {
+                    dist.max_sim(doc, &mut scores);
+                    std::hint::black_box(&mut scores);
+                })
+            }
+        }
+    }
+
+    fn run_optimized<T>(input: &MultiVectorOp) -> anyhow::Result<Vec<RunResult>>
+    where
+        T: Copy,
+        StandardUniform: Distribution<T>,
+        QueryComputer<T>: NewFromMatRef<T>,
+        OptimizedDistance<T>: Distance<T>,
+    {
+        let mut results = Vec::with_capacity(input.runs.len());
+        for run in input.runs.iter() {
+            let data = Data::<T>::new(run);
+            // `QueryComputer` performs query-side precomputation that is intentionally
+            // amortized across many `chamfer` / `max_sim` calls; construct it once per
+            // shape, outside the timed loop.
+            let dist = OptimizedDistance(<QueryComputer<T> as NewFromMatRef<T>>::new_from(
+                data.queries.as_view(),
+            ));
+            results.push(run_with_distance(run, data.docs.as_view(), &dist));
+        }
+        Ok(results)
+    }
+
+    /// Drive the [`Chamfer`] / [`MaxSim`] fallback path.
+    fn run_reference<T>(input: &MultiVectorOp) -> anyhow::Result<Vec<RunResult>>
+    where
+        T: Copy,
+        StandardUniform: Distribution<T>,
+        InnerProduct: for<'a, 'b> PureDistanceFunction<&'a [T], &'b [T], f32>,
+        for<'a> ReferenceDistance<'a, T>: Distance<T>,
+    {
+        let mut results = Vec::with_capacity(input.runs.len());
+        for run in input.runs.iter() {
+            let data = Data::<T>::new(run);
+            let dist = ReferenceDistance(data.queries.as_view().into());
+            results.push(run_with_distance(run, data.docs.as_view(), &dist));
+        }
+        Ok(results)
+    }
+
+    /// Constructor shim that lets code generic over `T` build a [`QueryComputer`].
+    ///
+    /// `QueryComputer::new` is defined as an inherent method on the concrete
+    /// `QueryComputer<f32>` / `QueryComputer<half::f16>` types (not a generic), so we need
+    /// this shim trait to let generic code (e.g. `run_optimized<T>`) call it.
+    trait NewFromMatRef<T: Copy> {
+        fn new_from(query: MatRef<'_, Standard<T>>) -> QueryComputer<T>;
+    }
+
+    impl NewFromMatRef<f32> for QueryComputer<f32> {
+        fn new_from(query: MatRef<'_, Standard<f32>>) -> QueryComputer<f32> {
+            QueryComputer::<f32>::new(query)
+        }
+    }
+
+    impl NewFromMatRef<f16> for QueryComputer<f16> {
+        fn new_from(query: MatRef<'_, Standard<f16>>) -> QueryComputer<f16> {
+            QueryComputer::<f16>::new(query)
+        }
+    }
+
+    impl<T> RunBenchmark<Optimized> for Kernel<Optimized, T>
+    where
+        T: Copy + 'static,
+        StandardUniform: Distribution<T>,
+        QueryComputer<T>: NewFromMatRef<T>,
+        OptimizedDistance<T>: Distance<T>,
+    {
+        fn run_benchmark(&self, input: &MultiVectorOp) -> anyhow::Result<Vec<RunResult>> {
+            run_optimized::<T>(input)
+        }
+    }
+
+    impl<T> RunBenchmark<Reference> for Kernel<Reference, T>
+    where
+        T: Copy + 'static,
+        StandardUniform: Distribution<T>,
+        InnerProduct: for<'a, 'b> PureDistanceFunction<&'a [T], &'b [T], f32>,
+        for<'a> ReferenceDistance<'a, T>: Distance<T>,
+    {
+        fn run_benchmark(&self, input: &MultiVectorOp) -> anyhow::Result<Vec<RunResult>> {
+            run_reference::<T>(input)
+        }
+    }
+
+    ///////////
+    // Tests //
+    ///////////
+
+    #[cfg(test)]
+    mod tests {
+        use std::num::NonZeroUsize;
+
+        use diskann_benchmark_runner::{
+            benchmark::{PassFail, Regression},
+            utils::{datatype::DataType, num::NonNegativeFinite, percentiles::compute_percentiles},
+        };
+
+        use super::*;
+
+        fn tiny_run(operation: Operation) -> Run {
+            Run {
+                operation,
+                num_query_vectors: NonZeroUsize::new(2).unwrap(),
+                num_doc_vectors: NonZeroUsize::new(2).unwrap(),
+                dim: NonZeroUsize::new(4).unwrap(),
+                loops_per_measurement: NonZeroUsize::new(1).unwrap(),
+                num_measurements: NonZeroUsize::new(1).unwrap(),
+            }
+        }
+
+        fn tiny_op() -> MultiVectorOp {
+            MultiVectorOp {
+                element_type: DataType::Float32,
+                implementation: Implementation::Optimized,
+                runs: vec![tiny_run(Operation::Chamfer)],
+            }
+        }
+
+        fn tiny_result(operation: Operation, minimum: u64) -> RunResult {
+            let run = tiny_run(operation);
+            let minimum = MicroSeconds::new(minimum);
+            let mut latencies = vec![minimum];
+            let percentiles = compute_percentiles(&mut latencies).unwrap();
+            RunResult {
+                run,
+                latencies,
+                percentiles,
+            }
+        }
+
+        fn tolerance(limit: f64) -> MultiVectorTolerance {
+            MultiVectorTolerance {
+                min_time_regression: NonNegativeFinite::new(limit).unwrap(),
+            }
+        }
+
+        #[test]
+        fn check_rejects_mismatched_runs() {
+            let kernel = Kernel::<Optimized, f32>::new();
+
+            let err = kernel
+                .check(
+                    &tolerance(0.0),
+                    &tiny_op(),
+                    &vec![tiny_result(Operation::Chamfer, 100)],
+                    &vec![tiny_result(Operation::MaxSim, 100)],
+                )
+                .unwrap_err();
+
+            assert_eq!(err.to_string(), "run 0 mismatched");
+        }
+
+        #[test]
+        fn check_allows_negative_relative_change() {
+            let kernel = Kernel::<Optimized, f32>::new();
+
+            let result = kernel
+                .check(
+                    &tolerance(0.0),
+                    &tiny_op(),
+                    &vec![tiny_result(Operation::Chamfer, 100)],
+                    &vec![tiny_result(Operation::Chamfer, 95)],
+                )
+                .unwrap();
+
+            assert!(matches!(result, PassFail::Pass(_)));
+        }
+
+        #[test]
+        fn check_passes_on_tolerance_boundary() {
+            let kernel = Kernel::<Optimized, f32>::new();
+
+            let result = kernel
+                .check(
+                    &tolerance(0.05),
+                    &tiny_op(),
+                    &vec![tiny_result(Operation::Chamfer, 100)],
+                    &vec![tiny_result(Operation::Chamfer, 105)],
+                )
+                .unwrap();
+
+            assert!(matches!(result, PassFail::Pass(_)));
+        }
+
+        #[test]
+        fn check_fails_above_tolerance_boundary() {
+            let kernel = Kernel::<Optimized, f32>::new();
+
+            let result = kernel
+                .check(
+                    &tolerance(0.05),
+                    &tiny_op(),
+                    &vec![tiny_result(Operation::Chamfer, 100)],
+                    &vec![tiny_result(Operation::Chamfer, 106)],
+                )
+                .unwrap();
+
+            assert!(matches!(result, PassFail::Fail(_)));
+        }
+
+        #[test]
+        fn check_result_display_includes_failure_details() {
+            let check = CheckResult {
+                checks: vec![Comparison {
+                    run: tiny_run(Operation::Chamfer),
+                    tolerance: tolerance(0.05),
+                    before_min: 100.0,
+                    after_min: 106.0,
+                }],
+            };
+
+            let rendered = check.to_string();
+            assert!(rendered.contains("Operation"), "rendered = {rendered}");
+            assert!(rendered.contains("chamfer"), "rendered = {rendered}");
+            assert!(rendered.contains("100.000"), "rendered = {rendered}");
+            assert!(rendered.contains("106.000"), "rendered = {rendered}");
+            assert!(rendered.contains("6.000 %"), "rendered = {rendered}");
+            assert!(rendered.contains("FAIL"), "rendered = {rendered}");
+        }
+
+        /// A "before" value of 0 means the measurement was too fast to obtain a
+        /// reliable signal, so we *could* be letting a regression through. We
+        /// require at least a non-zero value.
+        #[test]
+        fn zero_values_rejected() {
+            let kernel = Kernel::<Optimized, f32>::new();
+
+            let result = kernel
+                .check(
+                    &tolerance(0.05),
+                    &tiny_op(),
+                    &vec![tiny_result(Operation::Chamfer, 0)],
+                    &vec![tiny_result(Operation::Chamfer, 0)],
+                )
+                .unwrap();
+
+            assert!(matches!(result, PassFail::Fail(_)));
+        }
+    }
+}
diff --git a/diskann-benchmark/src/inputs/mod.rs b/diskann-benchmark/src/inputs/mod.rs
index 856412e2a..414a0b52e 100644
--- a/diskann-benchmark/src/inputs/mod.rs
+++ b/diskann-benchmark/src/inputs/mod.rs
@@ -7,6 +7,7 @@ pub(crate) mod disk;
 pub(crate) mod exhaustive;
 pub(crate) mod filters;
 pub(crate) mod graph_index;
+pub(crate) mod multi_vector;
 pub(crate) mod save_and_load;
 
 pub(crate) fn register_inputs(
@@ -16,6 +17,7 @@ pub(crate) fn register_inputs(
     exhaustive::register_inputs(registry)?;
     disk::register_inputs(registry)?;
     filters::register_inputs(registry)?;
+    multi_vector::register_inputs(registry)?;
     Ok(())
 }
 
diff --git a/diskann-benchmark/src/inputs/multi_vector.rs b/diskann-benchmark/src/inputs/multi_vector.rs
new file mode 100644
index 000000000..8010162d6
--- /dev/null
+++ b/diskann-benchmark/src/inputs/multi_vector.rs
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) Microsoft Corporation.
+ * Licensed under the MIT license.
+ */
+
+use std::num::NonZeroUsize;
+
+use diskann_benchmark_runner::{
+    utils::{datatype::DataType, num::NonNegativeFinite},
+    CheckDeserialization, Checker,
+};
+use serde::{Deserialize, Serialize};
+
+use crate::inputs::{as_input, Example};
+
+//////////////
+// Registry //
+//////////////
+
+as_input!(MultiVectorOp);
+as_input!(MultiVectorTolerance);
+
+pub(super) fn register_inputs(
+    registry: &mut diskann_benchmark_runner::registry::Inputs,
+) -> anyhow::Result<()> {
+    registry.register::<MultiVectorOp>()?;
+    registry.register::<MultiVectorTolerance>()?;
+    Ok(())
+}
+
+////////////////
+// Enum types //
+////////////////
+
+/// The two distance operations exposed by `QueryComputer`; serialized as `chamfer` / `max_sim`.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+#[serde(rename_all = "snake_case")]
+pub(crate) enum Operation {
+    Chamfer,
+    MaxSim,
+}
+
+impl std::fmt::Display for Operation {
+    /// Writes the snake_case name of the operation (`chamfer` or `max_sim`).
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str(match self {
+            Self::Chamfer => "chamfer",
+            Self::MaxSim => "max_sim",
+        })
+    }
+}
+
+/// Which implementation tier to benchmark; serialized as `optimized` / `reference`.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+#[serde(rename_all = "kebab-case")]
+pub(crate) enum Implementation {
+    Optimized,
+    Reference,
+}
+
+impl std::fmt::Display for Implementation {
+    /// Writes the lowercase name of the implementation tier (`optimized` or `reference`).
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str(match self {
+            Self::Optimized => "optimized",
+            Self::Reference => "reference",
+        })
+    }
+}
+
+/// One benchmark configuration: a single (operation, shape) measurement.
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+pub(crate) struct Run {
+    pub(crate) operation: Operation,
+    pub(crate) num_query_vectors: NonZeroUsize,
+    pub(crate) num_doc_vectors: NonZeroUsize,
+    pub(crate) dim: NonZeroUsize,
+    pub(crate) loops_per_measurement: NonZeroUsize,
+    pub(crate) num_measurements: NonZeroUsize,
+}
+
+///////////////////////
+// Multi-Vector Op   //
+///////////////////////
+
+/// A complete multi-vector benchmark job.
+#[derive(Debug, Serialize, Deserialize)]
+pub(crate) struct MultiVectorOp {
+    pub(crate) element_type: DataType,
+    pub(crate) implementation: Implementation,
+    pub(crate) runs: Vec<Run>,
+}
+
+impl MultiVectorOp {
+    pub(crate) const fn tag() -> &'static str {
+        "multi-vector-op"
+    }
+}
+
+impl CheckDeserialization for MultiVectorOp {
+    fn check_deserialization(&mut self, _checker: &mut Checker) -> Result<(), anyhow::Error> {
+        Ok(())
+    }
+}
+
+impl Example for MultiVectorOp {
+    fn example() -> Self {
+        const NUM_QUERY_VECTORS: NonZeroUsize = NonZeroUsize::new(32).unwrap();
+        const NUM_DOC_VECTORS: NonZeroUsize = NonZeroUsize::new(64).unwrap();
+        const DIM: NonZeroUsize = NonZeroUsize::new(128).unwrap();
+        const LOOPS_PER_MEASUREMENT: NonZeroUsize = NonZeroUsize::new(200).unwrap();
+        const NUM_MEASUREMENTS: NonZeroUsize = NonZeroUsize::new(100).unwrap();
+
+        let runs = vec![
+            Run {
+                operation: Operation::Chamfer,
+                num_query_vectors: NUM_QUERY_VECTORS,
+                num_doc_vectors: NUM_DOC_VECTORS,
+                dim: DIM,
+                loops_per_measurement: LOOPS_PER_MEASUREMENT,
+                num_measurements: NUM_MEASUREMENTS,
+            },
+            Run {
+                operation: Operation::MaxSim,
+                num_query_vectors: NUM_QUERY_VECTORS,
+                num_doc_vectors: NUM_DOC_VECTORS,
+                dim: DIM,
+                loops_per_measurement: LOOPS_PER_MEASUREMENT,
+                num_measurements: NUM_MEASUREMENTS,
+            },
+        ];
+
+        Self {
+            element_type: DataType::Float32,
+            implementation: Implementation::Optimized,
+            runs,
+        }
+    }
+}
+
+macro_rules! write_field {
+    ($f:ident, $field:tt, $($expr:tt)*) => {
+        writeln!($f, "{:>18}: {}", $field, $($expr)*)
+    }
+}
+
+impl std::fmt::Display for MultiVectorOp {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        writeln!(f, "Multi-Vector Operation\n")?;
+        write_field!(f, "tag", Self::tag())?;
+        write_field!(f, "element type", self.element_type)?;
+        write_field!(f, "implementation", self.implementation)?;
+        write_field!(f, "number of runs", self.runs.len())?;
+        Ok(())
+    }
+}
+
+/////////////////////////////
+// Multi-Vector Tolerance  //
+/////////////////////////////
+
+/// Tolerance thresholds for multi-vector benchmark regression detection.
+///
+/// Each field specifies the maximum allowed relative increase in the corresponding metric.
+/// For example, a value of `0.05` means a 5% increase is tolerated.
+#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
+pub(crate) struct MultiVectorTolerance {
+    pub(crate) min_time_regression: NonNegativeFinite,
+}
+
+impl MultiVectorTolerance {
+    pub(crate) const fn tag() -> &'static str {
+        "multi-vector-tolerance"
+    }
+}
+
+impl CheckDeserialization for MultiVectorTolerance {
+    fn check_deserialization(&mut self, _checker: &mut Checker) -> Result<(), anyhow::Error> {
+        Ok(())
+    }
+}
+
+impl Example for MultiVectorTolerance {
+    fn example() -> Self {
+        Self {
+            min_time_regression: NonNegativeFinite::new(0.05)
+                .expect("0.05 is a valid non-negative finite"),
+        }
+    }
+}
diff --git a/diskann-benchmark/src/main.rs b/diskann-benchmark/src/main.rs
index 424e63bb7..c7276f2e1 100644
--- a/diskann-benchmark/src/main.rs
+++ b/diskann-benchmark/src/main.rs
@@ -776,6 +776,92 @@ mod tests {
         assert!(!output_path.exists());
     }
 
+    ///////////////////
+    // Multi-Vector  //
+    ///////////////////
+
+    #[test]
+    fn multi_vector_integration() {
+        let path = example_directory().join("multi-vector-test.json");
+        let tempdir = tempfile::tempdir().unwrap();
+        let output_path = tempdir.path().join("output.json");
+        assert!(!output_path.exists());
+
+        let modified_input_path = tempdir.path().join("input.json");
+
+        let mut raw = value_from_file(&path);
+        prefix_search_directories(&mut raw, &root_directory());
+        save_to_file(&modified_input_path, &raw);
+
+        run_multi_vector_integration(&modified_input_path, &output_path)
+    }
+
+    #[cfg(feature = "multi-vector")]
+    fn run_multi_vector_integration(input_path: &std::path::Path, output_path: &std::path::Path) {
+        let command = Commands::Run {
+            input_file: input_path.to_owned(),
+            output_file: output_path.to_owned(),
+            dry_run: false,
+            allow_debug: true,
+        };
+
+        let cli = Cli::from_commands(command, true);
+        let mut output = Memory::new();
+
+        cli.run(&mut output).unwrap();
+        println!(
+            "output = {}",
+            String::from_utf8(output.into_inner()).unwrap()
+        );
+
+        // Check that the results file is generated.
+        assert!(output_path.exists());
+    }
+
+    #[cfg(not(feature = "multi-vector"))]
+    fn run_multi_vector_integration(input_path: &std::path::Path, output_path: &std::path::Path) {
+        let command = Commands::Run {
+            input_file: input_path.to_owned(),
+            output_file: output_path.to_owned(),
+            dry_run: false,
+            allow_debug: true,
+        };
+        let cli = Cli::from_commands(command, true);
+        let mut output = Memory::new();
+
+        let err = cli.run(&mut output).unwrap_err();
+        println!("err = {:?}", err);
+
+        let output = String::from_utf8(output.into_inner()).unwrap();
+        assert!(output.contains("\"multi-vector\" feature"));
+        println!("output = {}", output);
+
+        // The output file should not have been created because we failed the test.
+        assert!(!output_path.exists());
+    }
+
+    #[test]
+    #[cfg(feature = "multi-vector")]
+    fn multi_vector_check_verify() {
+        let input_path = example_directory().join("multi-vector-test.json");
+        let tolerance_path = project_directory()
+            .join("perf_test_inputs")
+            .join("multi-vector-tolerance.json");
+
+        let command = Commands::Check(diskann_benchmark_runner::app::Check::Verify {
+            tolerances: tolerance_path,
+            input_file: input_path,
+        });
+
+        let cli = Cli::from_commands(command, true);
+        let mut output = Memory::new();
+        cli.run(&mut output).unwrap();
+        println!(
+            "output = {}",
+            String::from_utf8(output.into_inner()).unwrap()
+        );
+    }
+
     #[test]
     fn quiet_suppresses_check_target_warning() {
         let cli = Cli::from_commands(Commands::Skeleton, true);
diff --git a/diskann-quantization/src/multi_vector/matrix.rs b/diskann-quantization/src/multi_vector/matrix.rs
index 70629d44c..bcbafaaa3 100644
--- a/diskann-quantization/src/multi_vector/matrix.rs
+++ b/diskann-quantization/src/multi_vector/matrix.rs
@@ -244,6 +244,18 @@ pub unsafe trait NewOwned<T>: ReprOwned {
 #[derive(Debug, Clone, Copy)]
 pub struct Defaulted;
 
+/// An initializer argument to [`NewOwned`] that invokes the wrapped closure once per element;
+/// for [`Standard`], the results are stored in call order (see the example).
+///
+/// # Example
+/// ```
+/// use diskann_quantization::multi_vector::{Init, Mat, Standard};
+/// let mut n = 0;
+/// let mat = Mat::new(Standard::<i32>::new(1, 4).unwrap(), Init(|| { n += 1; n })).unwrap();
+/// assert_eq!(mat.as_slice(), &[1, 2, 3, 4]);
+/// ```
+pub struct Init<F>(pub F);
+
 /// Create a new [`Mat`] cloned from a view.
 pub trait NewCloned: ReprOwned {
     /// Clone the contents behind `v`, returning a new owning [`Mat`].
@@ -514,6 +526,22 @@ where
     }
 }
 
+// SAFETY: The implementation uses guarantees from `Box` to ensure that the pointer
+// initialized by it is non-null and properly aligned to the underlying type.
+unsafe impl<T, F> NewOwned<Init<F>> for Standard<T>
+where
+    T: Copy,
+    F: FnMut() -> T,
+{
+    type Error = crate::error::Infallible;
+    fn new_owned(self, mut init: Init<F>) -> Result<Mat<Self>, Self::Error> {
+        let b: Box<[T]> = (0..self.num_elements()).map(|_| (init.0)()).collect();
+
+        // SAFETY: By construction, `b` has length `self.num_elements()`.
+        Ok(unsafe { self.box_to_mat(b) })
+    }
+}
+
 // SAFETY: This checks that the slice has the correct length, which is all that is
 // required for [`Repr`].
 unsafe impl<T> NewRef<T> for Standard<T>
@@ -1767,6 +1795,22 @@ mod tests {
         }
     }
 
+    #[test]
+    fn test_standard_new_owned_with_init() {
+        let mut counter: i32 = 0;
+        let m = Mat::new(
+            Standard::<i32>::new(2, 3).unwrap(),
+            Init(|| {
+                let v = counter;
+                counter += 1;
+                v
+            }),
+        )
+        .unwrap();
+
+        assert_eq!(m.as_slice(), &[0, 1, 2, 3, 4, 5]);
+    }
+
     #[test]
     fn matref_new_slice_length_error() {
         let repr = Standard::<u32>::new(3, 4).unwrap();
diff --git a/diskann-quantization/src/multi_vector/mod.rs b/diskann-quantization/src/multi_vector/mod.rs
index 3670b1aaf..1d765bacc 100644
--- a/diskann-quantization/src/multi_vector/mod.rs
+++ b/diskann-quantization/src/multi_vector/mod.rs
@@ -74,6 +74,6 @@ pub(crate) mod matrix;
 pub use block_transposed::{BlockTransposed, BlockTransposedMut, BlockTransposedRef};
 pub use distance::{Chamfer, MaxSim, MaxSimError, QueryComputer, QueryMatRef};
 pub use matrix::{
-    Defaulted, LayoutError, Mat, MatMut, MatRef, NewCloned, NewMut, NewOwned, NewRef, Overflow,
-    Repr, ReprMut, ReprOwned, SliceError, Standard,
+    Defaulted, Init, LayoutError, Mat, MatMut, MatRef, NewCloned, NewMut, NewOwned, NewRef,
+    Overflow, Repr, ReprMut, ReprOwned, SliceError, Standard,
 };

From a64279e7172bbaaf3dd3be6cca8b7a05746fb2ae Mon Sep 17 00:00:00 2001
From: Suryansh Gupta <suryansh.gupta2000@gmail.com>
Date: Thu, 14 May 2026 20:36:54 +0530
Subject: [PATCH 09/13] Revamp the benchmark to be also kernel-research seam

---
 diskann-benchmark/Cargo.toml                  |   2 +-
 .../example/multi-vector-test.json            |  47 -
 diskann-benchmark/example/multi-vector.json   |  92 +-
 .../perf_test_inputs/multi-vector.json        | 149 ++++
 diskann-benchmark/src/backend/multi_vector.rs | 806 ------------------
 .../src/backend/multi_vector/driver.rs        | 279 ++++++
 .../backend/multi_vector/experimental/mod.rs  |  23 +
 .../multi_vector/experimental/template.rs     | 254 ++++++
 .../backend/multi_vector/library_kernels.rs   | 510 +++++++++++
 .../src/backend/multi_vector/mod.rs           | 233 +++++
 diskann-benchmark/src/inputs/multi_vector.rs  |  63 +-
 diskann-benchmark/src/main.rs                 |   4 +-
 .../src/multi_vector/distance/kernels/f16.rs  |   2 +-
 .../multi_vector/distance/kernels/f32/mod.rs  |   6 +-
 .../distance/kernels/f32/scalar.rs            |   2 +-
 .../multi_vector/distance/kernels/f32/v3.rs   |   2 +-
 .../multi_vector/distance/kernels/layouts.rs  |  37 +-
 .../src/multi_vector/distance/kernels/mod.rs  |  57 +-
 .../distance/kernels/tiled_reduce.rs          |  12 +-
 .../src/multi_vector/distance/mod.rs          |   4 +-
 .../distance/query_computer/f16.rs            |  15 +-
 .../distance/query_computer/f32.rs            |  17 +-
 .../distance/query_computer/mod.rs            |  65 +-
 .../src/multi_vector/matrix.rs                |  60 +-
 diskann-quantization/src/multi_vector/mod.rs  |   4 +-
 25 files changed, 1692 insertions(+), 1053 deletions(-)
 delete mode 100644 diskann-benchmark/example/multi-vector-test.json
 create mode 100644 diskann-benchmark/perf_test_inputs/multi-vector.json
 delete mode 100644 diskann-benchmark/src/backend/multi_vector.rs
 create mode 100644 diskann-benchmark/src/backend/multi_vector/driver.rs
 create mode 100644 diskann-benchmark/src/backend/multi_vector/experimental/mod.rs
 create mode 100644 diskann-benchmark/src/backend/multi_vector/experimental/template.rs
 create mode 100644 diskann-benchmark/src/backend/multi_vector/library_kernels.rs
 create mode 100644 diskann-benchmark/src/backend/multi_vector/mod.rs

diff --git a/diskann-benchmark/Cargo.toml b/diskann-benchmark/Cargo.toml
index efd058ffb..ecc3a53dd 100644
--- a/diskann-benchmark/Cargo.toml
+++ b/diskann-benchmark/Cargo.toml
@@ -63,7 +63,7 @@ scalar-quantization = []
 # Enable minmax-quantization based algorithms
 minmax-quantization = []
 
-# Enable multi-vector distance benchmarks (Chamfer / MaxSim)
+# Enable multi-vector MaxSim distance benchmarks
 multi-vector = []
 
 # Enable Disk Index benchmarks
diff --git a/diskann-benchmark/example/multi-vector-test.json b/diskann-benchmark/example/multi-vector-test.json
deleted file mode 100644
index 28e9b9d64..000000000
--- a/diskann-benchmark/example/multi-vector-test.json
+++ /dev/null
@@ -1,47 +0,0 @@
-{
-  "search_directories": [],
-  "jobs": [
-    {
-      "type": "multi-vector-op",
-      "content": {
-        "element_type": "float32",
-        "implementation": "optimized",
-        "runs": [
-          { "operation": "chamfer", "num_query_vectors": 8,  "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 2, "num_measurements": 1 },
-          { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 2, "num_measurements": 1 }
-        ]
-      }
-    },
-    {
-      "type": "multi-vector-op",
-      "content": {
-        "element_type": "float16",
-        "implementation": "optimized",
-        "runs": [
-          { "operation": "chamfer", "num_query_vectors": 8,  "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 2, "num_measurements": 1 }
-        ]
-      }
-    },
-    {
-      "type": "multi-vector-op",
-      "content": {
-        "element_type": "float32",
-        "implementation": "reference",
-        "runs": [
-          { "operation": "chamfer", "num_query_vectors": 8,  "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 2, "num_measurements": 1 },
-          { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 2, "num_measurements": 1 }
-        ]
-      }
-    },
-    {
-      "type": "multi-vector-op",
-      "content": {
-        "element_type": "float16",
-        "implementation": "reference",
-        "runs": [
-          { "operation": "max_sim", "num_query_vectors": 8,  "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 2, "num_measurements": 1 }
-        ]
-      }
-    }
-  ]
-}
diff --git a/diskann-benchmark/example/multi-vector.json b/diskann-benchmark/example/multi-vector.json
index 553a6a9d8..7a4e59539 100644
--- a/diskann-benchmark/example/multi-vector.json
+++ b/diskann-benchmark/example/multi-vector.json
@@ -5,55 +5,20 @@
       "type": "multi-vector-op",
       "content": {
         "element_type": "float32",
-        "implementation": "optimized",
+        "arch": "auto",
         "runs": [
-          { "operation": "chamfer", "num_query_vectors": 8,  "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 },
-          { "operation": "chamfer", "num_query_vectors": 16, "num_doc_vectors": 64,   "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 },
-          { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 128,  "dim": 384, "loops_per_measurement": 20,  "num_measurements": 50 },
-          { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 16,   "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 },
-          { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 264, "loops_per_measurement": 50,  "num_measurements": 50 },
-          { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 },
-          { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2,   "num_measurements": 50 },
-          { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 },
-          { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 32,   "dim": 512, "loops_per_measurement": 50,  "num_measurements": 50 },
-
-          { "operation": "max_sim", "num_query_vectors": 8,  "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 },
-          { "operation": "max_sim", "num_query_vectors": 16, "num_doc_vectors": 64,   "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 },
-          { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 128,  "dim": 384, "loops_per_measurement": 20,  "num_measurements": 50 },
-          { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 16,   "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 },
-          { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 264, "loops_per_measurement": 50,  "num_measurements": 50 },
-          { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 },
-          { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2,   "num_measurements": 50 },
-          { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 },
-          { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 32,   "dim": 512, "loops_per_measurement": 50,  "num_measurements": 50 }
+          { "num_query_vectors": 8,  "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 2, "num_measurements": 1 },
+          { "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 2, "num_measurements": 1 }
         ]
       }
     },
     {
       "type": "multi-vector-op",
       "content": {
-        "element_type": "float16",
-        "implementation": "optimized",
+        "element_type": "float32",
+        "arch": "scalar",
         "runs": [
-          { "operation": "chamfer", "num_query_vectors": 8,  "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 },
-          { "operation": "chamfer", "num_query_vectors": 16, "num_doc_vectors": 64,   "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 },
-          { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 128,  "dim": 384, "loops_per_measurement": 20,  "num_measurements": 50 },
-          { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 16,   "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 },
-          { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 264, "loops_per_measurement": 50,  "num_measurements": 50 },
-          { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 },
-          { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2,   "num_measurements": 50 },
-          { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 },
-          { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 32,   "dim": 512, "loops_per_measurement": 50,  "num_measurements": 50 },
-
-          { "operation": "max_sim", "num_query_vectors": 8,  "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 },
-          { "operation": "max_sim", "num_query_vectors": 16, "num_doc_vectors": 64,   "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 },
-          { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 128,  "dim": 384, "loops_per_measurement": 20,  "num_measurements": 50 },
-          { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 16,   "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 },
-          { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 264, "loops_per_measurement": 50,  "num_measurements": 50 },
-          { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 },
-          { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2,   "num_measurements": 50 },
-          { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 },
-          { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 32,   "dim": 512, "loops_per_measurement": 50,  "num_measurements": 50 }
+          { "num_query_vectors": 8,  "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 2, "num_measurements": 1 }
         ]
       }
     },
@@ -61,27 +26,10 @@
       "type": "multi-vector-op",
       "content": {
         "element_type": "float32",
-        "implementation": "reference",
+        "arch": "reference",
         "runs": [
-          { "operation": "chamfer", "num_query_vectors": 8,  "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 },
-          { "operation": "chamfer", "num_query_vectors": 16, "num_doc_vectors": 64,   "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 },
-          { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 128,  "dim": 384, "loops_per_measurement": 20,  "num_measurements": 50 },
-          { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 16,   "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 },
-          { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 264, "loops_per_measurement": 50,  "num_measurements": 50 },
-          { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 },
-          { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2,   "num_measurements": 50 },
-          { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 },
-          { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 32,   "dim": 512, "loops_per_measurement": 50,  "num_measurements": 50 },
-
-          { "operation": "max_sim", "num_query_vectors": 8,  "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 },
-          { "operation": "max_sim", "num_query_vectors": 16, "num_doc_vectors": 64,   "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 },
-          { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 128,  "dim": 384, "loops_per_measurement": 20,  "num_measurements": 50 },
-          { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 16,   "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 },
-          { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 264, "loops_per_measurement": 50,  "num_measurements": 50 },
-          { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 },
-          { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2,   "num_measurements": 50 },
-          { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 },
-          { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 32,   "dim": 512, "loops_per_measurement": 50,  "num_measurements": 50 }
+          { "num_query_vectors": 8,  "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 2, "num_measurements": 1 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 2, "num_measurements": 1 }
         ]
       }
     },
@@ -89,27 +37,9 @@
       "type": "multi-vector-op",
       "content": {
         "element_type": "float16",
-        "implementation": "reference",
+        "arch": "auto",
         "runs": [
-          { "operation": "chamfer", "num_query_vectors": 8,  "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 },
-          { "operation": "chamfer", "num_query_vectors": 16, "num_doc_vectors": 64,   "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 },
-          { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 128,  "dim": 384, "loops_per_measurement": 20,  "num_measurements": 50 },
-          { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 16,   "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 },
-          { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 264, "loops_per_measurement": 50,  "num_measurements": 50 },
-          { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 },
-          { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2,   "num_measurements": 50 },
-          { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 },
-          { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 32,   "dim": 512, "loops_per_measurement": 50,  "num_measurements": 50 },
-
-          { "operation": "max_sim", "num_query_vectors": 8,  "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 },
-          { "operation": "max_sim", "num_query_vectors": 16, "num_doc_vectors": 64,   "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 },
-          { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 128,  "dim": 384, "loops_per_measurement": 20,  "num_measurements": 50 },
-          { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 16,   "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 },
-          { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 264, "loops_per_measurement": 50,  "num_measurements": 50 },
-          { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 },
-          { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2,   "num_measurements": 50 },
-          { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 },
-          { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 32,   "dim": 512, "loops_per_measurement": 50,  "num_measurements": 50 }
+          { "num_query_vectors": 8,  "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 2, "num_measurements": 1 }
         ]
       }
     }
diff --git a/diskann-benchmark/perf_test_inputs/multi-vector.json b/diskann-benchmark/perf_test_inputs/multi-vector.json
new file mode 100644
index 000000000..57922fe10
--- /dev/null
+++ b/diskann-benchmark/perf_test_inputs/multi-vector.json
@@ -0,0 +1,149 @@
+{
+  "search_directories": [],
+  "jobs": [
+    {
+      "type": "multi-vector-op",
+      "content": {
+        "element_type": "float32",
+        "arch": "auto",
+        "runs": [
+          { "num_query_vectors": 8,  "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 },
+          { "num_query_vectors": 16, "num_doc_vectors": 64,   "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 128,  "dim": 384, "loops_per_measurement": 20,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 16,   "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 264, "loops_per_measurement": 50,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2,   "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 32,   "dim": 512, "loops_per_measurement": 50,  "num_measurements": 50 }
+        ]
+      }
+    },
+    {
+      "type": "multi-vector-op",
+      "content": {
+        "element_type": "float32",
+        "arch": "scalar",
+        "runs": [
+          { "num_query_vectors": 8,  "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 },
+          { "num_query_vectors": 16, "num_doc_vectors": 64,   "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 128,  "dim": 384, "loops_per_measurement": 20,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 16,   "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 264, "loops_per_measurement": 50,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2,   "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 32,   "dim": 512, "loops_per_measurement": 50,  "num_measurements": 50 }
+        ]
+      }
+    },
+    {
+      "type": "multi-vector-op",
+      "content": {
+        "element_type": "float32",
+        "arch": "x86-64-v3",
+        "runs": [
+          { "num_query_vectors": 8,  "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 },
+          { "num_query_vectors": 16, "num_doc_vectors": 64,   "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 128,  "dim": 384, "loops_per_measurement": 20,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 16,   "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 264, "loops_per_measurement": 50,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2,   "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 32,   "dim": 512, "loops_per_measurement": 50,  "num_measurements": 50 }
+        ]
+      }
+    },
+    {
+      "type": "multi-vector-op",
+      "content": {
+        "element_type": "float32",
+        "arch": "x86-64-v4",
+        "runs": [
+          { "num_query_vectors": 8,  "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 },
+          { "num_query_vectors": 16, "num_doc_vectors": 64,   "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 128,  "dim": 384, "loops_per_measurement": 20,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 16,   "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 264, "loops_per_measurement": 50,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2,   "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 32,   "dim": 512, "loops_per_measurement": 50,  "num_measurements": 50 }
+        ]
+      }
+    },
+    {
+      "type": "multi-vector-op",
+      "content": {
+        "element_type": "float32",
+        "arch": "reference",
+        "runs": [
+          { "num_query_vectors": 8,  "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 },
+          { "num_query_vectors": 16, "num_doc_vectors": 64,   "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 128,  "dim": 384, "loops_per_measurement": 20,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 16,   "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 264, "loops_per_measurement": 50,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2,   "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 32,   "dim": 512, "loops_per_measurement": 50,  "num_measurements": 50 }
+        ]
+      }
+    },
+    {
+      "type": "multi-vector-op",
+      "content": {
+        "element_type": "float16",
+        "arch": "x86-64-v3",
+        "runs": [
+          { "num_query_vectors": 8,  "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 },
+          { "num_query_vectors": 16, "num_doc_vectors": 64,   "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 128,  "dim": 384, "loops_per_measurement": 20,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 16,   "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 264, "loops_per_measurement": 50,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2,   "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 32,   "dim": 512, "loops_per_measurement": 50,  "num_measurements": 50 }
+        ]
+      }
+    },
+    {
+      "type": "multi-vector-op",
+      "content": {
+        "element_type": "float16",
+        "arch": "x86-64-v4",
+        "runs": [
+          { "num_query_vectors": 8,  "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 },
+          { "num_query_vectors": 16, "num_doc_vectors": 64,   "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 128,  "dim": 384, "loops_per_measurement": 20,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 16,   "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 264, "loops_per_measurement": 50,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2,   "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 32,   "dim": 512, "loops_per_measurement": 50,  "num_measurements": 50 }
+        ]
+      }
+    },
+    {
+      "type": "multi-vector-op",
+      "content": {
+        "element_type": "float16",
+        "arch": "reference",
+        "runs": [
+          { "num_query_vectors": 8,  "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 },
+          { "num_query_vectors": 16, "num_doc_vectors": 64,   "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 128,  "dim": 384, "loops_per_measurement": 20,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 16,   "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 264, "loops_per_measurement": 50,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2,   "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 32,   "dim": 512, "loops_per_measurement": 50,  "num_measurements": 50 }
+        ]
+      }
+    }
+  ]
+}
diff --git a/diskann-benchmark/src/backend/multi_vector.rs b/diskann-benchmark/src/backend/multi_vector.rs
deleted file mode 100644
index cfdb77f33..000000000
--- a/diskann-benchmark/src/backend/multi_vector.rs
+++ /dev/null
@@ -1,806 +0,0 @@
-/*
- * Copyright (c) Microsoft Corporation.
- * Licensed under the MIT license.
- */
-
-//! Multi-vector distance benchmarks (Chamfer / MaxSim) with regression detection.
-
-use diskann_benchmark_runner::registry::Benchmarks;
-
-// Create a stub-module if the "multi-vector" feature is disabled.
-crate::utils::stub_impl!("multi-vector", inputs::multi_vector::MultiVectorOp);
-
-pub(super) fn register_benchmarks(benchmarks: &mut Benchmarks) {
-    #[cfg(feature = "multi-vector")]
-    {
-        use half::f16;
-
-        // Optimized (architecture-dispatched QueryComputer).
-        benchmarks.register_regression(
-            "multi-vector-op-f32-optimized",
-            imp::Kernel::<imp::Optimized, f32>::new(),
-        );
-        benchmarks.register_regression(
-            "multi-vector-op-f16-optimized",
-            imp::Kernel::<imp::Optimized, f16>::new(),
-        );
-
-        // Reference (Chamfer / MaxSim fallback path).
-        benchmarks.register_regression(
-            "multi-vector-op-f32-reference",
-            imp::Kernel::<imp::Reference, f32>::new(),
-        );
-        benchmarks.register_regression(
-            "multi-vector-op-f16-reference",
-            imp::Kernel::<imp::Reference, f16>::new(),
-        );
-    }
-
-    // Stub implementation
-    #[cfg(not(feature = "multi-vector"))]
-    imp::register("multi-vector-op", benchmarks);
-}
-
-#[cfg(feature = "multi-vector")]
-mod imp {
-    use std::io::Write;
-
-    use diskann_benchmark_runner::{
-        benchmark::{PassFail, Regression},
-        dispatcher::{DispatchRule, FailureScore, MatchScore},
-        utils::{datatype, num::relative_change, percentiles, MicroSeconds},
-        Benchmark,
-    };
-    use diskann_quantization::multi_vector::{
-        Chamfer, Init, Mat, MatRef, MaxSim, QueryComputer, Standard,
-    };
-    use diskann_vector::distance::InnerProduct;
-    use diskann_vector::{DistanceFunctionMut, PureDistanceFunction};
-    use half::f16;
-    use rand::{
-        distr::{Distribution, StandardUniform},
-        rngs::StdRng,
-        SeedableRng,
-    };
-    use serde::{Deserialize, Serialize};
-
-    use crate::inputs::multi_vector::{
-        Implementation, MultiVectorOp, MultiVectorTolerance, Operation, Run,
-    };
-
-    ///////////
-    // Utils //
-    ///////////
-
-    #[derive(Debug, Clone, Copy)]
-    pub(super) struct DisplayWrapper<'a, T: ?Sized>(pub(super) &'a T);
-
-    impl<T: ?Sized> std::ops::Deref for DisplayWrapper<'_, T> {
-        type Target = T;
-        fn deref(&self) -> &T {
-            self.0
-        }
-    }
-
-    //////////////
-    // Dispatch //
-    //////////////
-
-    /// Dispatch marker for the [`QueryComputer`] implementation.
-    #[derive(Debug)]
-    pub(super) struct Optimized;
-
-    /// Dispatch marker for the [`Chamfer`] / [`MaxSim`] fallback.
-    #[derive(Debug)]
-    pub(super) struct Reference;
-
-    /// A multi-vector benchmark.
-    pub(super) struct Kernel<I, T> {
-        _type: std::marker::PhantomData<(I, T)>,
-    }
-
-    impl<I, T> Kernel<I, T> {
-        pub(super) fn new() -> Self {
-            Self {
-                _type: std::marker::PhantomData,
-            }
-        }
-    }
-
-    /// Pairs the standard `TryFrom<Implementation>` conversion with the static
-    /// description info needed for friendly diagnostics in `Benchmark::description`.
-    pub(super) trait ImplementationMatcher:
-        TryFrom<Implementation, Error = FailureScore> + 'static
-    {
-        /// Human-readable description of which implementation this marker handles.
-        const DESCRIPTION: &'static str;
-        /// The implementation variant this marker expects (for mismatch diagnostics).
-        const EXPECTED: Implementation;
-    }
-
-    impl TryFrom<Implementation> for Optimized {
-        type Error = FailureScore;
-        fn try_from(i: Implementation) -> Result<Self, Self::Error> {
-            match i {
-                Implementation::Optimized => Ok(Self),
-                _ => Err(FailureScore(1)),
-            }
-        }
-    }
-
-    impl ImplementationMatcher for Optimized {
-        const DESCRIPTION: &'static str = "QueryComputer (architecture-dispatched)";
-        const EXPECTED: Implementation = Implementation::Optimized;
-    }
-
-    impl TryFrom<Implementation> for Reference {
-        type Error = FailureScore;
-        fn try_from(i: Implementation) -> Result<Self, Self::Error> {
-            match i {
-                Implementation::Reference => Ok(Self),
-                _ => Err(FailureScore(1)),
-            }
-        }
-    }
-
-    impl ImplementationMatcher for Reference {
-        const DESCRIPTION: &'static str = "Chamfer / MaxSim fallback";
-        const EXPECTED: Implementation = Implementation::Reference;
-    }
-
-    impl<I, T> Benchmark for Kernel<I, T>
-    where
-        datatype::Type<T>: DispatchRule<datatype::DataType>,
-        I: ImplementationMatcher,
-        Kernel<I, T>: RunBenchmark<I>,
-        T: 'static,
-    {
-        type Input = MultiVectorOp;
-        type Output = Vec<RunResult>;
-
-        fn try_match(&self, from: &MultiVectorOp) -> Result<MatchScore, FailureScore> {
-            let mut failscore: Option<u32> = None;
-            if datatype::Type::<T>::try_match(&from.element_type).is_err() {
-                *failscore.get_or_insert(0) += 10;
-            }
-            if let Err(FailureScore(score)) = I::try_from(from.implementation) {
-                *failscore.get_or_insert(0) += 2 + score;
-            }
-
-            match failscore {
-                None => Ok(MatchScore(0)),
-                Some(score) => Err(FailureScore(score)),
-            }
-        }
-
-        fn run(
-            &self,
-            input: &MultiVectorOp,
-            _: diskann_benchmark_runner::Checkpoint<'_>,
-            mut output: &mut dyn diskann_benchmark_runner::Output,
-        ) -> anyhow::Result<Self::Output> {
-            // The dispatcher only invokes `run` after `try_match` has already accepted
-            // the input, so a failure here would indicate a dispatcher bug.
-            I::try_from(input.implementation).expect("try_match accepted the input");
-            writeln!(output, "{}", input)?;
-            let results = self.run_benchmark(input)?;
-            writeln!(output, "\n\n{}", DisplayWrapper(&*results))?;
-            Ok(results)
-        }
-
-        fn description(
-            &self,
-            f: &mut std::fmt::Formatter<'_>,
-            input: Option<&MultiVectorOp>,
-        ) -> std::fmt::Result {
-            match input {
-                None => {
-                    writeln!(
-                        f,
-                        "- Element Type: {}",
-                        diskann_benchmark_runner::dispatcher::Description::<
-                            datatype::DataType,
-                            datatype::Type<T>,
-                        >::new()
-                    )?;
-                    writeln!(f, "- Implementation: {}", I::DESCRIPTION)?;
-                }
-                Some(input) => {
-                    if let Err(err) = datatype::Type::<T>::try_match_verbose(&input.element_type) {
-                        writeln!(f, "\n    - Mismatched element type: {}", err)?;
-                    }
-                    if I::try_from(input.implementation).is_err() {
-                        writeln!(
-                            f,
-                            "\n    - Mismatched implementation: expected {}, got {}",
-                            I::EXPECTED,
-                            input.implementation
-                        )?;
-                    }
-                }
-            }
-            Ok(())
-        }
-    }
-
-    impl<I, T> Regression for Kernel<I, T>
-    where
-        datatype::Type<T>: DispatchRule<datatype::DataType>,
-        I: ImplementationMatcher,
-        Kernel<I, T>: RunBenchmark<I>,
-        T: 'static,
-    {
-        type Tolerances = MultiVectorTolerance;
-        type Pass = CheckResult;
-        type Fail = CheckResult;
-
-        fn check(
-            &self,
-            tolerance: &MultiVectorTolerance,
-            _input: &MultiVectorOp,
-            before: &Vec<RunResult>,
-            after: &Vec<RunResult>,
-        ) -> anyhow::Result<PassFail<CheckResult, CheckResult>> {
-            anyhow::ensure!(
-                before.len() == after.len(),
-                "before has {} runs but after has {}",
-                before.len(),
-                after.len(),
-            );
-
-            let mut passed = true;
-            let checks: Vec<Comparison> = std::iter::zip(before.iter(), after.iter())
-                .enumerate()
-                .map(|(i, (b, a))| {
-                    anyhow::ensure!(b.run == a.run, "run {i} mismatched");
-
-                    let computations_per_latency = b.computations_per_latency() as f64;
-
-                    let before_min =
-                        b.percentiles.minimum.as_f64() * 1000.0 / computations_per_latency;
-                    let after_min =
-                        a.percentiles.minimum.as_f64() * 1000.0 / computations_per_latency;
-
-                    let comparison = Comparison {
-                        run: b.run.clone(),
-                        tolerance: *tolerance,
-                        before_min,
-                        after_min,
-                    };
-
-                    match relative_change(before_min, after_min) {
-                        Ok(change) => {
-                            if change > tolerance.min_time_regression.get() {
-                                passed = false;
-                            }
-                        }
-                        Err(_) => passed = false,
-                    };
-
-                    Ok(comparison)
-                })
-                .collect::<anyhow::Result<Vec<Comparison>>>()?;
-
-            let check = CheckResult { checks };
-
-            if passed {
-                Ok(PassFail::Pass(check))
-            } else {
-                Ok(PassFail::Fail(check))
-            }
-        }
-    }
-
-    //////////////////////
-    // Regression Check //
-    //////////////////////
-
-    /// Per-run comparison result showing before/after percentile differences.
-    #[derive(Debug, Serialize)]
-    pub(super) struct Comparison {
-        run: Run,
-        tolerance: MultiVectorTolerance,
-        before_min: f64,
-        after_min: f64,
-    }
-
-    /// Aggregated result of the regression check across all runs.
-    #[derive(Debug, Serialize)]
-    pub(super) struct CheckResult {
-        checks: Vec<Comparison>,
-    }
-
-    impl std::fmt::Display for CheckResult {
-        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-            let header = [
-                "Operation",
-                "Q",
-                "D",
-                "Dim",
-                "Min Before (ns/IP @ Dim)",
-                "Min After (ns/IP @ Dim)",
-                "Change (%)",
-                "Remark",
-            ];
-
-            let mut table =
-                diskann_benchmark_runner::utils::fmt::Table::new(header, self.checks.len());
-
-            for (i, c) in self.checks.iter().enumerate() {
-                let mut row = table.row(i);
-                let change = relative_change(c.before_min, c.after_min);
-
-                row.insert(c.run.operation, 0);
-                row.insert(c.run.num_query_vectors, 1);
-                row.insert(c.run.num_doc_vectors, 2);
-                row.insert(c.run.dim, 3);
-                row.insert(format!("{:.3}", c.before_min), 4);
-                row.insert(format!("{:.3}", c.after_min), 5);
-                match change {
-                    Ok(change) => {
-                        row.insert(format!("{:.3} %", change * 100.0), 6);
-                        if change > c.tolerance.min_time_regression.get() {
-                            row.insert("FAIL", 7);
-                        }
-                    }
-                    Err(err) => {
-                        row.insert("invalid", 6);
-                        row.insert(err, 7);
-                    }
-                }
-            }
-
-            table.fmt(f)
-        }
-    }
-
-    ///////////////
-    // Benchmark //
-    ///////////////
-
-    pub(super) trait RunBenchmark<I> {
-        fn run_benchmark(&self, input: &MultiVectorOp) -> Result<Vec<RunResult>, anyhow::Error>;
-    }
-
-    #[derive(Debug, Serialize, Deserialize)]
-    pub(super) struct RunResult {
-        /// The configuration for this run.
-        run: Run,
-        /// Per-measurement latencies (over `loops_per_measurement` calls).
-        latencies: Vec<MicroSeconds>,
-        /// Latency percentiles.
-        percentiles: percentiles::Percentiles<MicroSeconds>,
-    }
-
-    impl RunResult {
-        fn computations_per_latency(&self) -> usize {
-            self.run.num_query_vectors.get()
-                * self.run.num_doc_vectors.get()
-                * self.run.loops_per_measurement.get()
-        }
-    }
-
-    impl std::fmt::Display for DisplayWrapper<'_, [RunResult]> {
-        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-            if self.is_empty() {
-                return Ok(());
-            }
-
-            // ns/IP is normalized as `min_latency_us * 1000 / (Q * D * loops)` and is
-            // approximately linear in `dim`. Compare across rows with the same `Dim`;
-            // divide further by `Dim` to recover ns per scalar multiply.
-            writeln!(
-                f,
-                "ns/IP = time per (query, doc) inner-product call (~ linear in Dim)"
-            )?;
-
-            let header = [
-                "Operation",
-                "Q",
-                "D",
-                "Dim",
-                "Min Time (ns/IP @ Dim)",
-                "Mean Time (ns/IP @ Dim)",
-                "Loops",
-                "Measurements",
-            ];
-
-            let mut table = diskann_benchmark_runner::utils::fmt::Table::new(header, self.len());
-
-            self.iter().enumerate().for_each(|(row, r)| {
-                let mut row = table.row(row);
-
-                let min_latency = r
-                    .latencies
-                    .iter()
-                    .min()
-                    .copied()
-                    .unwrap_or(MicroSeconds::new(u64::MAX));
-                let mean_latency = r.percentiles.mean;
-
-                let computations_per_latency = r.computations_per_latency() as f64;
-
-                // Convert time from micro-seconds to nano-seconds per inner-product call
-                // (one (query, doc) pair, ~ linear in dim).
-                let min_time = min_latency.as_f64() / computations_per_latency * 1000.0;
-                let mean_time = mean_latency / computations_per_latency * 1000.0;
-
-                row.insert(r.run.operation, 0);
-                row.insert(r.run.num_query_vectors, 1);
-                row.insert(r.run.num_doc_vectors, 2);
-                row.insert(r.run.dim, 3);
-                row.insert(format!("{:.3}", min_time), 4);
-                row.insert(format!("{:.3}", mean_time), 5);
-                row.insert(r.run.loops_per_measurement, 6);
-                row.insert(r.run.num_measurements, 7);
-            });
-
-            table.fmt(f)
-        }
-    }
-
-    fn run_loops<F>(run: &Run, mut body: F) -> RunResult
-    where
-        F: FnMut(),
-    {
-        let mut latencies = Vec::with_capacity(run.num_measurements.get());
-
-        for _ in 0..run.num_measurements.get() {
-            let start = std::time::Instant::now();
-            for _ in 0..run.loops_per_measurement.get() {
-                body();
-            }
-            latencies.push(start.elapsed().into());
-        }
-
-        let percentiles = percentiles::compute_percentiles(&mut latencies).unwrap();
-        RunResult {
-            run: run.clone(),
-            latencies,
-            percentiles,
-        }
-    }
-
-    ///////////////////
-    // Data fixtures //
-    ///////////////////
-
-    const RNG_SEED: u64 = 0x12345;
-
-    struct Data<T: Copy> {
-        queries: Mat<Standard<T>>,
-        docs: Mat<Standard<T>>,
-    }
-
-    impl<T: Copy> Data<T>
-    where
-        StandardUniform: Distribution<T>,
-    {
-        fn new(run: &Run) -> Self {
-            let mut rng = StdRng::seed_from_u64(RNG_SEED);
-            let queries = Mat::new(
-                Standard::new(run.num_query_vectors.get(), run.dim.get()).unwrap(),
-                Init(|| StandardUniform.sample(&mut rng)),
-            )
-            .unwrap();
-            let docs = Mat::new(
-                Standard::new(run.num_doc_vectors.get(), run.dim.get()).unwrap(),
-                Init(|| StandardUniform.sample(&mut rng)),
-            )
-            .unwrap();
-            Self { queries, docs }
-        }
-    }
-
-    //////////////////////
-    // Distance kernels //
-    //////////////////////
-
-    /// Object-safe abstraction over a per-shape distance executor.
-    ///
-    /// The two implementations ([`OptimizedDistance`] and [`ReferenceDistance`]) share the
-    /// same hot-loop nest in [`run_with_distance`]; dispatching through `&dyn Distance<T>`
-    /// keeps `run_loops` from being monomorphised over the implementation axis.
-    trait Distance<T: Copy> {
-        fn chamfer(&self, doc: MatRef<'_, Standard<T>>) -> f32;
-        fn max_sim(&self, doc: MatRef<'_, Standard<T>>, scores: &mut [f32]);
-    }
-
-    /// Distance executor that drives [`QueryComputer`] (architecture-dispatched SIMD).
-    struct OptimizedDistance<T: Copy>(QueryComputer<T>);
-
-    impl<T: Copy> Distance<T> for OptimizedDistance<T> {
-        fn chamfer(&self, doc: MatRef<'_, Standard<T>>) -> f32 {
-            self.0.chamfer(doc)
-        }
-        fn max_sim(&self, doc: MatRef<'_, Standard<T>>, scores: &mut [f32]) {
-            self.0.max_sim(doc, scores);
-        }
-    }
-
-    /// Distance executor that drives the [`Chamfer`] / [`MaxSim`] fallback path.
-    struct ReferenceDistance<'a, T: Copy>(
-        diskann_quantization::multi_vector::distance::QueryMatRef<'a, Standard<T>>,
-    );
-
-    impl<T: Copy> Distance<T> for ReferenceDistance<'_, T>
-    where
-        InnerProduct: for<'q, 'd> PureDistanceFunction<&'q [T], &'d [T], f32>,
-    {
-        fn chamfer(&self, doc: MatRef<'_, Standard<T>>) -> f32 {
-            Chamfer::evaluate(self.0, doc)
-        }
-        fn max_sim(&self, doc: MatRef<'_, Standard<T>>, scores: &mut [f32]) {
-            // `MaxSim::new` is a non-empty check + pointer wrap, so constructing it per
-            // iteration is free — no need to hoist it out of the loop.
-            let mut max_sim = MaxSim::new(scores).unwrap();
-            let _ = max_sim.evaluate(self.0, doc);
-        }
-    }
-
-    /////////////////////
-    // Implementations //
-    /////////////////////
-
-    /// Shared loop nest. The trait-object dispatch happens once per outer iteration of
-    /// `run_loops`; the work inside each `chamfer` / `max_sim` call is O(Q*D*dim), so the
-    /// vtable hop is in the noise.
-    fn run_with_distance<T: Copy>(
-        run: &Run,
-        doc: MatRef<'_, Standard<T>>,
-        dist: &dyn Distance<T>,
-    ) -> RunResult {
-        match run.operation {
-            Operation::Chamfer => run_loops(run, || {
-                let v = dist.chamfer(doc);
-                std::hint::black_box(v);
-            }),
-            Operation::MaxSim => {
-                let mut scores = vec![0.0f32; run.num_query_vectors.get()];
-                run_loops(run, || {
-                    dist.max_sim(doc, &mut scores);
-                    std::hint::black_box(&mut scores);
-                })
-            }
-        }
-    }
-
-    fn run_optimized<T>(input: &MultiVectorOp) -> anyhow::Result<Vec<RunResult>>
-    where
-        T: Copy,
-        StandardUniform: Distribution<T>,
-        QueryComputer<T>: NewFromMatRef<T>,
-        OptimizedDistance<T>: Distance<T>,
-    {
-        let mut results = Vec::with_capacity(input.runs.len());
-        for run in input.runs.iter() {
-            let data = Data::<T>::new(run);
-            // `QueryComputer` performs query-side precomputation that is intentionally
-            // amortized across many `chamfer` / `max_sim` calls; construct it once per
-            // shape, outside the timed loop.
-            let dist = OptimizedDistance(<QueryComputer<T> as NewFromMatRef<T>>::new_from(
-                data.queries.as_view(),
-            ));
-            results.push(run_with_distance(run, data.docs.as_view(), &dist));
-        }
-        Ok(results)
-    }
-
-    /// Drive the [`Chamfer`] / [`MaxSim`] fallback path.
-    fn run_reference<T>(input: &MultiVectorOp) -> anyhow::Result<Vec<RunResult>>
-    where
-        T: Copy,
-        StandardUniform: Distribution<T>,
-        InnerProduct: for<'a, 'b> PureDistanceFunction<&'a [T], &'b [T], f32>,
-        for<'a> ReferenceDistance<'a, T>: Distance<T>,
-    {
-        let mut results = Vec::with_capacity(input.runs.len());
-        for run in input.runs.iter() {
-            let data = Data::<T>::new(run);
-            let dist = ReferenceDistance(data.queries.as_view().into());
-            results.push(run_with_distance(run, data.docs.as_view(), &dist));
-        }
-        Ok(results)
-    }
-
-    /// Element-type-erasing constructor for [`QueryComputer`].
-    ///
-    /// `QueryComputer::<T>::new` is defined as an inherent method on the concrete
-    /// `QueryComputer<f32>` / `QueryComputer<half::f16>` types (not a generic), so we need
-    /// this shim trait to let generic code (e.g. `run_optimized<T>`) call it.
-    trait NewFromMatRef<T: Copy> {
-        fn new_from(query: MatRef<'_, Standard<T>>) -> QueryComputer<T>;
-    }
-
-    impl NewFromMatRef<f32> for QueryComputer<f32> {
-        fn new_from(query: MatRef<'_, Standard<f32>>) -> QueryComputer<f32> {
-            QueryComputer::<f32>::new(query)
-        }
-    }
-
-    impl NewFromMatRef<f16> for QueryComputer<f16> {
-        fn new_from(query: MatRef<'_, Standard<f16>>) -> QueryComputer<f16> {
-            QueryComputer::<f16>::new(query)
-        }
-    }
-
-    impl<T> RunBenchmark<Optimized> for Kernel<Optimized, T>
-    where
-        T: Copy + 'static,
-        StandardUniform: Distribution<T>,
-        QueryComputer<T>: NewFromMatRef<T>,
-        OptimizedDistance<T>: Distance<T>,
-    {
-        fn run_benchmark(&self, input: &MultiVectorOp) -> anyhow::Result<Vec<RunResult>> {
-            run_optimized::<T>(input)
-        }
-    }
-
-    impl<T> RunBenchmark<Reference> for Kernel<Reference, T>
-    where
-        T: Copy + 'static,
-        StandardUniform: Distribution<T>,
-        InnerProduct: for<'a, 'b> PureDistanceFunction<&'a [T], &'b [T], f32>,
-        for<'a> ReferenceDistance<'a, T>: Distance<T>,
-    {
-        fn run_benchmark(&self, input: &MultiVectorOp) -> anyhow::Result<Vec<RunResult>> {
-            run_reference::<T>(input)
-        }
-    }
-
-    ///////////
-    // Tests //
-    ///////////
-
-    #[cfg(test)]
-    mod tests {
-        use std::num::NonZeroUsize;
-
-        use diskann_benchmark_runner::{
-            benchmark::{PassFail, Regression},
-            utils::{datatype::DataType, num::NonNegativeFinite, percentiles::compute_percentiles},
-        };
-
-        use super::*;
-
-        fn tiny_run(operation: Operation) -> Run {
-            Run {
-                operation,
-                num_query_vectors: NonZeroUsize::new(2).unwrap(),
-                num_doc_vectors: NonZeroUsize::new(2).unwrap(),
-                dim: NonZeroUsize::new(4).unwrap(),
-                loops_per_measurement: NonZeroUsize::new(1).unwrap(),
-                num_measurements: NonZeroUsize::new(1).unwrap(),
-            }
-        }
-
-        fn tiny_op() -> MultiVectorOp {
-            MultiVectorOp {
-                element_type: DataType::Float32,
-                implementation: Implementation::Optimized,
-                runs: vec![tiny_run(Operation::Chamfer)],
-            }
-        }
-
-        fn tiny_result(operation: Operation, minimum: u64) -> RunResult {
-            let run = tiny_run(operation);
-            let minimum = MicroSeconds::new(minimum);
-            let mut latencies = vec![minimum];
-            let percentiles = compute_percentiles(&mut latencies).unwrap();
-            RunResult {
-                run,
-                latencies,
-                percentiles,
-            }
-        }
-
-        fn tolerance(limit: f64) -> MultiVectorTolerance {
-            MultiVectorTolerance {
-                min_time_regression: NonNegativeFinite::new(limit).unwrap(),
-            }
-        }
-
-        #[test]
-        fn check_rejects_mismatched_runs() {
-            let kernel = Kernel::<Optimized, f32>::new();
-
-            let err = kernel
-                .check(
-                    &tolerance(0.0),
-                    &tiny_op(),
-                    &vec![tiny_result(Operation::Chamfer, 100)],
-                    &vec![tiny_result(Operation::MaxSim, 100)],
-                )
-                .unwrap_err();
-
-            assert_eq!(err.to_string(), "run 0 mismatched");
-        }
-
-        #[test]
-        fn check_allows_negative_relative_change() {
-            let kernel = Kernel::<Optimized, f32>::new();
-
-            let result = kernel
-                .check(
-                    &tolerance(0.0),
-                    &tiny_op(),
-                    &vec![tiny_result(Operation::Chamfer, 100)],
-                    &vec![tiny_result(Operation::Chamfer, 95)],
-                )
-                .unwrap();
-
-            assert!(matches!(result, PassFail::Pass(_)));
-        }
-
-        #[test]
-        fn check_passes_on_tolerance_boundary() {
-            let kernel = Kernel::<Optimized, f32>::new();
-
-            let result = kernel
-                .check(
-                    &tolerance(0.05),
-                    &tiny_op(),
-                    &vec![tiny_result(Operation::Chamfer, 100)],
-                    &vec![tiny_result(Operation::Chamfer, 105)],
-                )
-                .unwrap();
-
-            assert!(matches!(result, PassFail::Pass(_)));
-        }
-
-        #[test]
-        fn check_fails_above_tolerance_boundary() {
-            let kernel = Kernel::<Optimized, f32>::new();
-
-            let result = kernel
-                .check(
-                    &tolerance(0.05),
-                    &tiny_op(),
-                    &vec![tiny_result(Operation::Chamfer, 100)],
-                    &vec![tiny_result(Operation::Chamfer, 106)],
-                )
-                .unwrap();
-
-            assert!(matches!(result, PassFail::Fail(_)));
-        }
-
-        #[test]
-        fn check_result_display_includes_failure_details() {
-            let check = CheckResult {
-                checks: vec![Comparison {
-                    run: tiny_run(Operation::Chamfer),
-                    tolerance: tolerance(0.05),
-                    before_min: 100.0,
-                    after_min: 106.0,
-                }],
-            };
-
-            let rendered = check.to_string();
-            assert!(rendered.contains("Operation"), "rendered = {rendered}");
-            assert!(rendered.contains("chamfer"), "rendered = {rendered}");
-            assert!(rendered.contains("100.000"), "rendered = {rendered}");
-            assert!(rendered.contains("106.000"), "rendered = {rendered}");
-            assert!(rendered.contains("6.000 %"), "rendered = {rendered}");
-            assert!(rendered.contains("FAIL"), "rendered = {rendered}");
-        }
-
-        /// A "before" value of 0 means the measurement was too fast to obtain a
-        /// reliable signal, so we *could* be letting a regression through. We
-        /// require at least a non-zero value.
-        #[test]
-        fn zero_values_rejected() {
-            let kernel = Kernel::<Optimized, f32>::new();
-
-            let result = kernel
-                .check(
-                    &tolerance(0.05),
-                    &tiny_op(),
-                    &vec![tiny_result(Operation::Chamfer, 0)],
-                    &vec![tiny_result(Operation::Chamfer, 0)],
-                )
-                .unwrap();
-
-            assert!(matches!(result, PassFail::Fail(_)));
-        }
-    }
-}
diff --git a/diskann-benchmark/src/backend/multi_vector/driver.rs b/diskann-benchmark/src/backend/multi_vector/driver.rs
new file mode 100644
index 000000000..2f83eb22f
--- /dev/null
+++ b/diskann-benchmark/src/backend/multi_vector/driver.rs
@@ -0,0 +1,279 @@
+/*
+ * Copyright (c) Microsoft Corporation.
+ * Licensed under the MIT license.
+ */
+
+//! Shared benchmark infrastructure for multi-vector kernels.
+//!
+//! Houses the timing harness ([`run_loops`]), data fixtures ([`Data`]), result
+//! types ([`RunResult`], [`Comparison`], [`CheckResult`]), and the trait-object
+//! [`Distance<T>`] boundary that both library and experimental kernels go
+//! through. None of the contents are kernel-aware.
+
+use diskann_benchmark_runner::utils::{
+    fmt::Table, num::relative_change, percentiles, MicroSeconds,
+};
+use diskann_quantization::multi_vector::distance::QueryMatRef;
+use diskann_quantization::multi_vector::{Mat, MatRef, MaxSim, QueryComputer, Standard};
+use diskann_vector::distance::InnerProduct;
+use diskann_vector::{DistanceFunctionMut, PureDistanceFunction};
+use rand::{
+    distr::{Distribution, StandardUniform},
+    rngs::StdRng,
+    SeedableRng,
+};
+use serde::{Deserialize, Serialize};
+
+use crate::inputs::multi_vector::{MultiVectorTolerance, Run};
+
+///////////////////
+// Data fixtures //
+///////////////////
+
+/// Random query / doc fixture for a single benchmark run.
+pub(super) struct Data<T: Copy> {
+    pub(super) queries: Mat<Standard<T>>,
+    pub(super) docs: Mat<Standard<T>>,
+}
+
+impl<T: Copy> Data<T>
+where
+    StandardUniform: Distribution<T>,
+{
+    /// Generates the query and document matrices for `run` from a fixed seed,
+    /// sampling every element from `StandardUniform`; queries are drawn first.
+    pub(super) fn new(run: &Run) -> Self {
+        let dim = run.dim.get();
+        let mut rng = StdRng::seed_from_u64(0x12345);
+
+        let query_shape = Standard::new(run.num_query_vectors.get(), dim).unwrap();
+        let queries = Mat::from_fn(query_shape, || StandardUniform.sample(&mut rng));
+        let doc_shape = Standard::new(run.num_doc_vectors.get(), dim).unwrap();
+        let docs = Mat::from_fn(doc_shape, || StandardUniform.sample(&mut rng));
+        Self { queries, docs }
+    }
+}
+
+//////////////////////
+// Distance kernels //
+//////////////////////
+
+/// Object-safe abstraction over a per-shape distance executor.
+///
+/// `OptimizedDistance` wraps any [`QueryComputer<T>`] — library-shipped
+/// arch-pinned ones (via `from_arch`) AND experimental ones (via
+/// `from_dyn`) — so the driver's hot loop dispatches through one vtable
+/// hop regardless of which kernel produced the computer.
+/// `ReferenceDistance` is the only path that doesn't go through
+/// `QueryComputer` (it uses the `MaxSim` fallback directly).
+pub(super) trait Distance<T: Copy> {
+    fn max_sim(&self, doc: MatRef<'_, Standard<T>>, scores: &mut [f32]);
+}
+
+/// Distance executor wrapping a [`QueryComputer<T>`] — covers all arch-pinned,
+/// auto-dispatched, and experimental kernels.
+pub(super) struct OptimizedDistance<T: Copy>(pub(super) QueryComputer<T>);
+
+impl<T: Copy> Distance<T> for OptimizedDistance<T> {
+    fn max_sim(&self, doc: MatRef<'_, Standard<T>>, scores: &mut [f32]) {
+        self.0.max_sim(doc, scores);
+    }
+}
+
+/// Distance executor driving the [`MaxSim`] fallback path.
+pub(super) struct ReferenceDistance<'a, T: Copy>(pub(super) QueryMatRef<'a, Standard<T>>);
+
+impl<T: Copy> Distance<T> for ReferenceDistance<'_, T>
+where
+    InnerProduct: for<'q, 'd> PureDistanceFunction<&'q [T], &'d [T], f32>,
+{
+    fn max_sim(&self, doc: MatRef<'_, Standard<T>>, scores: &mut [f32]) {
+        // `MaxSim::new` is a non-empty check + pointer wrap, free per iteration.
+        let mut max_sim = MaxSim::new(scores).unwrap();
+        let _ = max_sim.evaluate(self.0, doc); // NOTE(review): result discarded; confirm OK
+    }
+}
+
+//////////////////////
+// Timing harness   //
+//////////////////////
+
+fn run_loops<F>(run: &Run, mut body: F) -> RunResult
+where
+    F: FnMut(),
+{
+    let mut latencies = Vec::with_capacity(run.num_measurements.get());
+    // Each sample times `loops_per_measurement` back-to-back calls to `body`.
+    for _ in 0..run.num_measurements.get() {
+        let start = std::time::Instant::now();
+        for _ in 0..run.loops_per_measurement.get() {
+            body();
+        }
+        latencies.push(start.elapsed().into());
+    }
+    // NOTE(review): presumably `unwrap` can only fail on an empty sample set — confirm.
+    let percentiles = percentiles::compute_percentiles(&mut latencies).unwrap();
+    RunResult {
+        run: run.clone(),
+        latencies,
+        percentiles,
+    }
+}
+
+/// Shared loop nest. The trait-object dispatch happens once per `max_sim`
+/// call (i.e. once per inner-loop iteration of `run_loops`); the work inside
+/// each call is O(Q·D·dim), so the vtable hop is in the noise.
+pub(super) fn run_with_distance<T: Copy>(
+    run: &Run,
+    doc: MatRef<'_, Standard<T>>,
+    dist: &dyn Distance<T>,
+) -> RunResult {
+    let mut scores = vec![0.0f32; run.num_query_vectors.get()];
+    run_loops(run, || {
+        dist.max_sim(doc, &mut scores);
+        std::hint::black_box(&mut scores);
+    })
+}
+
+//////////////////////
+// Result types     //
+//////////////////////
+
+#[derive(Debug, Clone, Copy)]
+pub(super) struct DisplayWrapper<'a, T: ?Sized>(pub(super) &'a T);
+
+impl<T: ?Sized> std::ops::Deref for DisplayWrapper<'_, T> {
+    type Target = T;
+    fn deref(&self) -> &T {
+        self.0
+    }
+}
+
+#[derive(Debug, Serialize, Deserialize)]
+pub(super) struct RunResult {
+    /// The configuration for this run.
+    pub(super) run: Run,
+    /// Per-measurement latencies (over `loops_per_measurement` calls).
+    pub(super) latencies: Vec<MicroSeconds>,
+    /// Latency percentiles.
+    pub(super) percentiles: percentiles::Percentiles<MicroSeconds>,
+}
+
+impl RunResult {
+    /// Number of (query vector, doc vector) inner products behind one latency sample.
+    pub(super) fn computations_per_latency(&self) -> usize {
+        let pairs = self.run.num_query_vectors.get() * self.run.num_doc_vectors.get();
+        pairs * self.run.loops_per_measurement.get()
+    }
+}
+
+impl std::fmt::Display for DisplayWrapper<'_, [RunResult]> {
+    /// Renders one table row per run, with min / mean latency normalized to
+    /// nanoseconds per (query vector, doc vector) inner product.
+    ///
+    /// Writes nothing for an empty slice. The minimum is read from
+    /// `percentiles.minimum`, the field the regression check is based on,
+    /// rather than being recomputed from `latencies` with a sentinel fallback.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        if self.is_empty() {
+            return Ok(());
+        }
+
+        writeln!(
+            f,
+            "ns/IP = time per (query, doc) inner-product call (~ linear in Dim)"
+        )?;
+
+        let header = [
+            "Q",
+            "D",
+            "Dim",
+            "Min Time (ns/IP @ Dim)",
+            "Mean Time (ns/IP @ Dim)",
+            "Loops",
+            "Measurements",
+        ];
+
+        let mut table = Table::new(header, self.len());
+
+        for (index, r) in self.iter().enumerate() {
+            let mut row = table.row(index);
+
+            // Latencies are in microseconds: scale to ns, then split across
+            // every inner product timed by one sample.
+            let computations_per_latency = r.computations_per_latency() as f64;
+            let min_time = r.percentiles.minimum.as_f64() / computations_per_latency * 1000.0;
+            let mean_time = r.percentiles.mean / computations_per_latency * 1000.0;
+
+            row.insert(r.run.num_query_vectors, 0);
+            row.insert(r.run.num_doc_vectors, 1);
+            row.insert(r.run.dim, 2);
+            row.insert(format!("{:.3}", min_time), 3);
+            row.insert(format!("{:.3}", mean_time), 4);
+            row.insert(r.run.loops_per_measurement, 5);
+            row.insert(r.run.num_measurements, 6);
+        }
+
+        table.fmt(f)
+    }
+}
+
+//////////////////////
+// Regression Check //
+//////////////////////
+
+/// Per-run comparison of the before/after minimum times, in ns per inner product.
+#[derive(Debug, Serialize)]
+pub(super) struct Comparison {
+    pub(super) run: Run,
+    pub(super) tolerance: MultiVectorTolerance,
+    pub(super) before_min: f64,
+    pub(super) after_min: f64,
+}
+
+/// Aggregated result of the regression check across all runs.
+#[derive(Debug, Serialize)]
+pub(super) struct CheckResult {
+    pub(super) checks: Vec<Comparison>,
+}
+
+impl std::fmt::Display for CheckResult {
+    /// Renders one row per comparison: the run shape, both minimum times, the
+    /// relative change, and a `FAIL` remark when the change exceeds tolerance.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let header = [
+            "Q",
+            "D",
+            "Dim",
+            "Min Before (ns/IP @ Dim)",
+            "Min After (ns/IP @ Dim)",
+            "Change (%)",
+            "Remark",
+        ];
+        let mut table = Table::new(header, self.checks.len());
+
+        for (index, check) in self.checks.iter().enumerate() {
+            let run = &check.run;
+            let mut row = table.row(index);
+            row.insert(run.num_query_vectors, 0);
+            row.insert(run.num_doc_vectors, 1);
+            row.insert(run.dim, 2);
+            row.insert(format!("{:.3}", check.before_min), 3);
+            row.insert(format!("{:.3}", check.after_min), 4);
+            match relative_change(check.before_min, check.after_min) {
+                Err(err) => {
+                    row.insert("invalid", 5);
+                    row.insert(err, 6);
+                }
+                Ok(change) => {
+                    row.insert(format!("{:.3} %", change * 100.0), 5);
+                    if change > check.tolerance.min_time_regression.get() {
+                        row.insert("FAIL", 6);
+                    }
+                }
+            }
+        }
+
+        table.fmt(f)
+    }
+}
diff --git a/diskann-benchmark/src/backend/multi_vector/experimental/mod.rs b/diskann-benchmark/src/backend/multi_vector/experimental/mod.rs
new file mode 100644
index 000000000..b0e106fc4
--- /dev/null
+++ b/diskann-benchmark/src/backend/multi_vector/experimental/mod.rs
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) Microsoft Corporation.
+ * Licensed under the MIT license.
+ */
+
+//! Researcher-authored experimental multi-vector kernels.
+//!
+//! See [`template`] for the full kernel-author workflow (writing a `Kernel<A>`
+//! impl, adapting it via `DynQueryComputer<T>`, wiring up dispatch and
+//! registration, and validating under Miri).
+//!
+//! New experimental kernels live in their own module file in this directory.
+//! Their registration goes in [`register`] below.
+
+use diskann_benchmark_runner::registry::Benchmarks;
+
+mod template;
+
+pub(super) fn register(_benchmarks: &mut Benchmarks) {
+    // No experimental kernels are registered by default. To add one, rename
+    // `_benchmarks` to `benchmarks` and call `benchmarks.register_regression(...)`
+    // here for each new experimental kernel.
+}
diff --git a/diskann-benchmark/src/backend/multi_vector/experimental/template.rs b/diskann-benchmark/src/backend/multi_vector/experimental/template.rs
new file mode 100644
index 000000000..f09f0c74e
--- /dev/null
+++ b/diskann-benchmark/src/backend/multi_vector/experimental/template.rs
@@ -0,0 +1,254 @@
+/*
+ * Copyright (c) Microsoft Corporation.
+ * Licensed under the MIT license.
+ */
+
+//! **Template for an experimental multi-vector kernel.**
+//!
+//! Copy this file (e.g. to `v4_wide.rs`), rename `Template*`, change the
+//! `Kernel<A>` impl to your target ISA, and add an `Arch` variant + a
+//! `register_regression` call to wire it up.
+//!
+//! # The 6-step workflow
+//!
+//! 1. **Add an [`Arch`](crate::inputs::multi_vector::Arch) variant** for your
+//!    experimental kernel (e.g. `X86_64_V4_Wide`). The `#[non_exhaustive]`
+//!    attribute on `Arch` makes this a non-breaking addition.
+//! 2. **Author the micro-kernel.** Implement
+//!    [`Kernel<A>`](diskann_quantization::multi_vector::distance::kernels::Kernel)
+//!    on your struct (`unsafe impl Kernel<V4>` etc.), filling in
+//!    `full_panel` and `partial_panel` with your SIMD intrinsics.
+//! 3. **Author the adapter.** Implement
+//!    [`DynQueryComputer<T>`](diskann_quantization::multi_vector::distance::DynQueryComputer)
+//!    on a struct that owns the prepared query data; in `compute_max_sim`,
+//!    call
+//!    [`tiled_reduce`](diskann_quantization::multi_vector::distance::kernels::tiled_reduce)
+//!    with your kernel.
+//! 4. **Add a marker + `DispatchRule<Arch>`.** Mirror the pattern in
+//!    `library_kernels.rs` (e.g. `match_arch_x86_64!`) for your new variant.
+//! 5. **Add a `RunBenchmark<Marker>` impl + `register_regression` call.** Use
+//!    `Kernel::<Marker, T>::new()` as the registered benchmark entry.
+//! 6. **Validate under Miri.** See the section below.
+//!
+//! # Validating under Miri (REQUIRED)
+//!
+//! Experimental kernels rely on `unsafe fn full_panel` / `partial_panel`
+//! with raw-pointer arithmetic. Pointer provenance, alignment, and
+//! out-of-bounds bugs are easy to introduce and hard to catch by
+//! inspection. **Run your kernel under Miri before assuming it's correct.**
+//!
+//! Rules:
+//!
+//! - Inside your `#[cfg(test)]` module, construct arch tokens via the
+//!   Miri-friendly variants: `Scalar::new()` (always Miri-safe) or
+//!   `V4::new_checked_miri()` (returns a token unconditionally under
+//!   `cfg(miri)` using AVX-512 emulation, so tests run even when Miri
+//!   can't do real CPU detection). `V3` and `Neon` only expose
+//!   `new_checked()` today — if you need them under Miri, follow
+//!   `V4::new_checked_miri()`'s pattern in `diskann-wide`.
+//! - Any SIMD intrinsic Miri doesn't support must have a scalar fallback
+//!   gated by `#[cfg(miri)]`.
+//! - Add at least one small-shape correctness test that runs your kernel
+//!   against a naive reference and is Miri-friendly.
+//! - Run: `cargo +nightly miri test -p diskann-benchmark --features multi-vector
+//!   backend::multi_vector::experimental::<your_kernel>`. Reduce
+//!   test-sweep size under Miri with `if cfg!(miri) { small } else { full }`
+//!   (see this file's test for the pattern).
+//!
+//! Miri won't catch performance bugs, but it'll catch UB — and UB in an
+//! experimental kernel breaks the benchmark binary, not the kernel you're
+//! trying to measure.
+//!
+//! # This template
+//!
+//! This file defines `TemplateKernel: Kernel<Scalar>` (uses `Scalar` so the
+//! template is host-portable + Miri-friendly) and a `TemplateComputer`
+//! adapter that pipes it through `tiled_reduce`. It is **not registered** as
+//! a benchmark entry — see step 5 in the workflow. The included
+//! `#[cfg(test)]` `template_matches_pinned_scalar` test exercises the API
+//! surface end-to-end so this file catches public-API drift even though it
+//! isn't wired into the benchmark dispatcher.
+
+#![allow(dead_code)]
+
+use diskann_quantization::multi_vector::distance::{
+    kernels::{layouts, tiled_reduce, Kernel, TileBudget},
+    DynQueryComputer,
+};
+use diskann_quantization::multi_vector::{BlockTransposed, BlockTransposedRef, MatRef, Standard};
+use diskann_wide::arch::Scalar;
+
+/// Step 2: the micro-kernel struct. Rename and implement for your target arch.
+pub(super) struct TemplateKernel;
+
+// SAFETY: `full_panel` / `partial_panel` only access `A_PANEL * k` /
+// `B_PANEL * k` source elements and read/update `A_PANEL` destination f32s,
+// matching `Kernel<Scalar>`'s safety contract. The simple scalar
+// computation here is Miri-clean.
+unsafe impl Kernel<Scalar> for TemplateKernel {
+    type Left = layouts::BlockTransposedLayout<f32, 8>;
+    type Right = layouts::RowMajor<f32>;
+    const A_PANEL: usize = 8;
+    const B_PANEL: usize = 2;
+
+    unsafe fn full_panel(_arch: Scalar, a: *const f32, b: *const f32, k: usize, r: *mut f32) {
+        // SAFETY: a covers A_PANEL * k contiguous block-transposed f32s,
+        // b covers B_PANEL * k contiguous row-major f32s, r covers A_PANEL f32s.
+        unsafe { panel::<8, 2>(a, b, k, r) }
+    }
+
+    unsafe fn partial_panel(
+        _arch: Scalar,
+        remainder: usize,
+        a: *const f32,
+        b: *const f32,
+        k: usize,
+        r: *mut f32,
+    ) {
+        debug_assert_eq!(remainder, 1, "partial_panel only handles one leftover doc row");
+        // SAFETY: as full_panel, but `b` covers a single row of `k` f32s (remainder == 1).
+        unsafe { panel::<8, 1>(a, b, k, r) }
+    }
+}
+
+/// Replace this with your SIMD intrinsics. The block-transposed A layout
+/// stores `A_ROWS` contiguous f32s per dimension index `i`, so the q-th
+/// query row at dimension `i` lives at `a[i * A_ROWS + q]`. The row-major B
+/// layout stores doc row `d`'s i-th element at `b[d * k + i]` (`k` = number of
+/// dimensions). `r[q]` is the running max IP for query row `q`; it is read and
+/// updated here, and `TemplateComputer::compute_max_sim` negates it afterwards.
+///
+/// # Safety
+/// - `a` covers `A_ROWS * k` block-transposed f32s.
+/// - `b` covers `B_ROWS * k` row-major f32s.
+/// - `r` covers `A_ROWS` readable and writable f32s.
+unsafe fn panel<const A_ROWS: usize, const B_ROWS: usize>(
+    a: *const f32,
+    b: *const f32,
+    k: usize,
+    r: *mut f32,
+) {
+    for q in 0..A_ROWS {
+        // SAFETY: q < A_ROWS.
+        let mut best = unsafe { *r.add(q) };
+
+        for d in 0..B_ROWS {
+            let mut ip: f32 = 0.0;
+            for i in 0..k {
+                // SAFETY: i < k, q < A_ROWS.
+                let a_val = unsafe { *a.add(i * A_ROWS + q) };
+                // SAFETY: d < B_ROWS, b covers B_ROWS rows of k f32s each.
+                let b_val = unsafe { *b.add(d * k + i) };
+                ip += a_val * b_val;
+            }
+            best = best.max(ip);
+        }
+
+        // SAFETY: q < A_ROWS.
+        unsafe { *r.add(q) = best };
+    }
+}
+
+/// Step 3: the `DynQueryComputer<T>` adapter. Owns the prepared query data
+/// and routes `compute_max_sim` through `tiled_reduce` with the kernel.
+#[derive(Debug)]
+pub(super) struct TemplateComputer {
+    arch: Scalar,
+    prepared: BlockTransposed<f32, 8>,
+}
+
+impl TemplateComputer {
+    /// Converts `query` into the block-transposed layout once up front, so the
+    /// conversion is not repeated on every `compute_max_sim` call.
+    pub(super) fn new(query: MatRef<'_, Standard<f32>>) -> Self {
+        let prepared = BlockTransposed::<f32, 8>::from_matrix_view(query.as_matrix_view());
+        let arch = Scalar::new();
+        Self { arch, prepared }
+    }
+}
+
+impl DynQueryComputer<f32> for TemplateComputer {
+    fn nrows(&self) -> usize {
+        self.prepared.nrows()
+    }
+
+    fn compute_max_sim(&self, doc: MatRef<'_, Standard<f32>>, scores: &mut [f32]) {
+        let padded_nrows = self.prepared.padded_nrows();
+        let mut scratch = vec![f32::MIN; padded_nrows];
+        let view: BlockTransposedRef<'_, f32, 8> = self.prepared.as_view();
+        let ca = <BlockTransposedRef<'_, f32, 8> as layouts::DescribeLayout>::layout(&view);
+        let cb = <MatRef<'_, Standard<f32>> as layouts::DescribeLayout>::layout(&doc);
+        // SAFETY: prepared.as_ptr() covers padded_nrows * ncols block-transposed
+        // f32s; doc.as_slice() covers num_vectors * vector_dim row-major f32s;
+        // scratch length == padded_nrows; padded_nrows is a multiple of
+        // A_PANEL=8 by BlockTransposed construction.
+        unsafe {
+            tiled_reduce::<Scalar, TemplateKernel, _, _>(
+                self.arch,
+                &ca,
+                &cb,
+                self.prepared.as_ptr(),
+                padded_nrows,
+                doc.as_slice().as_ptr(),
+                doc.num_vectors(),
+                doc.vector_dim(),
+                &mut scratch,
+                TileBudget::default(),
+            );
+        }
+        // Flip the sign while copying out, dropping padding rows past `nrows()`.
+        for (score, &best) in scores.iter_mut().zip(&scratch[..self.nrows()]) {
+            *score = -best;
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    //! Miri-friendly correctness test for the template kernel.
+    //!
+    //! Checks that the template adapter produces the same per-row scores as
+    //! the `Scalar`-pinned `QueryComputer::from_arch`. Inputs are small integers
+    //! stored as f32, so the tight `1e-10` tolerance effectively demands
+    //! equality. Under Miri only one small shape runs, to keep runtime short.
+    use super::*;
+    use diskann_quantization::multi_vector::QueryComputer;
+
+    fn make_data(nrows: usize, ncols: usize, shift: usize) -> Vec<f32> {
+        (0..nrows * ncols)
+            .map(|v| ((v + shift) % ncols) as f32)
+            .collect()
+    }
+
+    #[test]
+    fn template_matches_pinned_scalar() {
+        let cases: &[(usize, usize, usize)] = if cfg!(miri) {
+            // Single small case under Miri to keep runtime reasonable.
+            &[(3, 4, 8)]
+        } else {
+            &[(1, 1, 4), (3, 5, 8), (8, 4, 16), (10, 6, 32)]
+        };
+
+        for &(nq, nd, dim) in cases {
+            let qd = make_data(nq, dim, dim / 2);
+            let dd = make_data(nd, dim, dim);
+            let query = MatRef::new(Standard::<f32>::new(nq, dim).unwrap(), &qd).unwrap();
+            let doc = MatRef::new(Standard::<f32>::new(nd, dim).unwrap(), &dd).unwrap();
+
+            let pinned = QueryComputer::<f32>::from_arch(query, Scalar::new());
+            let template = QueryComputer::<f32>::from_dyn(Box::new(TemplateComputer::new(query)));
+
+            let mut pinned_scores = vec![0.0f32; nq];
+            let mut template_scores = vec![0.0f32; nq];
+            pinned.max_sim(doc, &mut pinned_scores);
+            template.max_sim(doc, &mut template_scores);
+
+            for (i, (p, t)) in pinned_scores.iter().zip(template_scores.iter()).enumerate() {
+                assert!(
+                    (p - t).abs() < 1e-10,
+                    "shape ({nq},{nd},{dim}) row {i}: pinned={p} template={t}",
+                );
+            }
+        }
+    }
+}
diff --git a/diskann-benchmark/src/backend/multi_vector/library_kernels.rs b/diskann-benchmark/src/backend/multi_vector/library_kernels.rs
new file mode 100644
index 000000000..56d6c0db1
--- /dev/null
+++ b/diskann-benchmark/src/backend/multi_vector/library_kernels.rs
@@ -0,0 +1,510 @@
+/*
+ * Copyright (c) Microsoft Corporation.
+ * Licensed under the MIT license.
+ */
+
+//! Library kernel registrations and arch-dispatch machinery.
+//!
+//! Mirrors the structure of `diskann-benchmark-simd`: a `Kernel<A, T>`
+//! PhantomData carrier carries the (arch × element type) pair through the
+//! benchmark registry, [`DispatchRule<Arch>`] maps the JSON-facing `Arch`
+//! enum to a concrete arch token, and the `stamp!` / `match_arch!` macros
+//! generate the repetitive `RunBenchmark<A>` / `DispatchRule` impls.
+//!
+//! Library kernels registered here:
+//! - `multi-vector-op-{f32,f16}-auto` — `QueryComputer::new` (auto-dispatch)
+//! - `multi-vector-op-{f32,f16}-scalar` — `from_arch(Scalar)`
+//! - `multi-vector-op-{f32,f16}-x86_64_V3` — `from_arch(V3)` (x86_64 only)
+//! - `multi-vector-op-{f32,f16}-x86_64_V4` — `from_arch(V4)` (x86_64 only)
+//! - `multi-vector-op-{f32,f16}-aarch64_neon` — `from_arch(Neon)` (aarch64 only)
+//! - `multi-vector-op-{f32,f16}-reference` — `MaxSim` fallback
+
+use std::io::Write;
+use std::marker::PhantomData;
+
+use diskann_benchmark_runner::{
+    benchmark::{PassFail, Regression},
+    dispatcher::{Description, DispatchRule, FailureScore, MatchScore},
+    utils::{datatype, num::relative_change},
+    Benchmark, Checkpoint, Output,
+};
+use diskann_quantization::multi_vector::{MatRef, QueryComputer, Standard};
+use diskann_vector::distance::InnerProduct;
+use diskann_vector::PureDistanceFunction;
+#[cfg(target_arch = "aarch64")]
+use diskann_wide::arch::aarch64::Neon;
+#[cfg(target_arch = "x86_64")]
+use diskann_wide::arch::x86_64::{V3, V4};
+use diskann_wide::arch::Scalar;
+use diskann_wide::Architecture;
+use rand::distr::{Distribution, StandardUniform};
+
+use super::driver::{
+    run_with_distance, CheckResult, Comparison, Data, DisplayWrapper, OptimizedDistance,
+    ReferenceDistance, RunResult,
+};
+use crate::inputs::multi_vector::{Arch, MultiVectorOp, MultiVectorTolerance};
+
+/// PhantomData carrier for one (arch, element-type) entry in the benchmark
+/// registry. The arch parameter `A` is either a real arch token (`Scalar`,
+/// `V3`, `V4`, `Neon`) or one of the marker types [`Auto`] / [`Reference`].
+pub(super) struct Kernel<A, T> {
+    _type: PhantomData<(A, T)>,
+}
+
+impl<A, T> Kernel<A, T> {
+    pub(super) fn new() -> Self {
+        Self { _type: PhantomData }
+    }
+}
+
+/// Marker for the auto-dispatched (CPU-detected) kernel — `QueryComputer::new`.
+#[derive(Debug, Clone, Copy)]
+pub(super) struct Auto;
+
+/// Marker for the reference (`MaxSim` fallback) kernel.
+#[derive(Debug, Clone, Copy)]
+pub(super) struct Reference;
+
+/// Wrapper around an arch token (real or marker) that implements
+/// [`DispatchRule<Arch>`] for the JSON-facing [`Arch`] enum.
+pub(super) struct Identity<A>(pub(super) A);
+
+/// Returned by `Identity::<A>::convert` when the host CPU doesn't support the
+/// requested ISA. The dispatcher converts this into a friendly error message.
+#[derive(Debug, Clone, Copy)]
+pub(super) struct ArchNotSupported(pub(super) Arch);
+
+impl std::fmt::Display for ArchNotSupported {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{} not supported on this CPU", self.0)
+    }
+}
+
+impl std::error::Error for ArchNotSupported {}
+
+//////////////////////
+// Dispatch rules   //
+//////////////////////
+
+/// Generates a [`DispatchRule<Arch>`] for a real arch token. `try_match` returns:
+/// - `Ok(MatchScore(0))` when the input names this arch AND the host CPU supports it
+/// - `Err(FailureScore(0))` when the input names this arch but the CPU doesn't support it
+///   (this surfaces in the dispatcher's near-miss diagnostic)
+/// - `Err(FailureScore(1))` when the input names a different arch
+macro_rules! match_arch_x86_64 {
+    ($arch:path, $enum:ident) => {
+        #[cfg(target_arch = "x86_64")]
+        impl DispatchRule<Arch> for Identity<$arch> {
+            type Error = ArchNotSupported;
+            fn try_match(from: &Arch) -> Result<MatchScore, FailureScore> {
+                if *from != Arch::$enum {
+                    return Err(FailureScore(1));
+                }
+                if <$arch>::new_checked().is_some() {
+                    Ok(MatchScore(0))
+                } else {
+                    Err(FailureScore(0))
+                }
+            }
+            fn convert(from: Arch) -> Result<Self, Self::Error> {
+                <$arch>::new_checked()
+                    .ok_or(ArchNotSupported(from))
+                    .map(Identity)
+            }
+        }
+    };
+}
+
+match_arch_x86_64!(V3, X86_64_V3);
+match_arch_x86_64!(V4, X86_64_V4);
+
+#[cfg(target_arch = "aarch64")]
+impl DispatchRule<Arch> for Identity<Neon> {
+    type Error = ArchNotSupported;
+    fn try_match(from: &Arch) -> Result<MatchScore, FailureScore> {
+        if *from != Arch::Neon {
+            return Err(FailureScore(1));
+        }
+        if Neon::new_checked().is_some() {
+            Ok(MatchScore(0))
+        } else {
+            Err(FailureScore(0))
+        }
+    }
+    fn convert(from: Arch) -> Result<Self, Self::Error> {
+        Neon::new_checked()
+            .ok_or(ArchNotSupported(from))
+            .map(Identity)
+    }
+}
+
+// Scalar is always available; no CPU check needed.
+impl DispatchRule<Arch> for Identity<Scalar> {
+    type Error = ArchNotSupported;
+    fn try_match(from: &Arch) -> Result<MatchScore, FailureScore> {
+        if *from == Arch::Scalar {
+            Ok(MatchScore(0))
+        } else {
+            Err(FailureScore(1))
+        }
+    }
+    fn convert(_from: Arch) -> Result<Self, Self::Error> {
+        Ok(Identity(Scalar::new()))
+    }
+}
+
+impl DispatchRule<Arch> for Identity<Auto> {
+    type Error = ArchNotSupported;
+    fn try_match(from: &Arch) -> Result<MatchScore, FailureScore> {
+        if *from == Arch::Auto {
+            Ok(MatchScore(0))
+        } else {
+            Err(FailureScore(1))
+        }
+    }
+    fn convert(_from: Arch) -> Result<Self, Self::Error> {
+        Ok(Identity(Auto))
+    }
+}
+
+impl DispatchRule<Arch> for Identity<Reference> {
+    type Error = ArchNotSupported;
+    fn try_match(from: &Arch) -> Result<MatchScore, FailureScore> {
+        if *from == Arch::Reference {
+            Ok(MatchScore(0))
+        } else {
+            Err(FailureScore(1))
+        }
+    }
+    fn convert(_from: Arch) -> Result<Self, Self::Error> {
+        Ok(Identity(Reference))
+    }
+}
+
+//////////////////////
+// Benchmark trait  //
+//////////////////////
+
+/// Per-arch run trait. The `stamp!` macro generates impls for real arch tokens;
+/// `Auto` and `Reference` get hand-written impls.
+pub(super) trait RunBenchmark<A> {
+    fn run_benchmark(&self, input: &MultiVectorOp) -> anyhow::Result<Vec<RunResult>>;
+}
+
+impl<A, T> Benchmark for Kernel<A, T>
+where
+    datatype::Type<T>: DispatchRule<datatype::DataType>,
+    Identity<A>: DispatchRule<Arch, Error = ArchNotSupported>,
+    Kernel<A, T>: RunBenchmark<A>,
+    A: 'static,
+    T: 'static,
+{
+    type Input = MultiVectorOp;
+    type Output = Vec<RunResult>;
+
+    /// Matches only when both the element type and the arch rule accept `from`.
+    /// On a miss, the failure score is 10 for a wrong element type plus the
+    /// arch rule's own failure score.
+    fn try_match(&self, from: &MultiVectorOp) -> Result<MatchScore, FailureScore> {
+        let type_penalty = datatype::Type::<T>::try_match(&from.element_type)
+            .is_err()
+            .then_some(10u32);
+        let arch_penalty = Identity::<A>::try_match(&from.arch)
+            .err()
+            .map(|FailureScore(score)| score);
+
+        match (type_penalty, arch_penalty) {
+            (None, None) => Ok(MatchScore(0)),
+            (ty, arch) => Err(FailureScore(ty.unwrap_or(0) + arch.unwrap_or(0))),
+        }
+    }
+
+    /// Echoes the input, runs the kernel, and prints the result table.
+    fn run(
+        &self,
+        input: &MultiVectorOp,
+        _: Checkpoint<'_>,
+        mut output: &mut dyn Output,
+    ) -> anyhow::Result<Self::Output> {
+        writeln!(output, "{}", input)?;
+        let results = self.run_benchmark(input)?;
+        writeln!(output, "\n\n{}", DisplayWrapper(results.as_slice()))?;
+        Ok(results)
+    }
+
+    /// With no input, lists the element type and arch this kernel accepts;
+    /// with an input, lists each reason that input fails to match.
+    fn description(
+        &self,
+        f: &mut std::fmt::Formatter<'_>,
+        input: Option<&MultiVectorOp>,
+    ) -> std::fmt::Result {
+        let Some(input) = input else {
+            writeln!(
+                f,
+                "- Element Type: {}",
+                Description::<datatype::DataType, datatype::Type<T>>::new()
+            )?;
+            return writeln!(f, "- Arch: {}", Description::<Arch, Identity<A>>::new());
+        };
+
+        if let Err(err) = datatype::Type::<T>::try_match_verbose(&input.element_type) {
+            writeln!(f, "\n    - Mismatched element type: {}", err)?;
+        }
+        if Identity::<A>::try_match(&input.arch).is_err() {
+            writeln!(f, "\n    - Wrong or unsupported arch: {}", input.arch)?;
+        }
+        Ok(())
+    }
+}
+
+impl<A, T> Regression for Kernel<A, T>
+where
+    datatype::Type<T>: DispatchRule<datatype::DataType>,
+    Identity<A>: DispatchRule<Arch, Error = ArchNotSupported>,
+    Kernel<A, T>: RunBenchmark<A>,
+    A: 'static,
+    T: 'static,
+{
+    type Tolerances = MultiVectorTolerance;
+    type Pass = CheckResult;
+    type Fail = CheckResult;
+
+    /// Compares each run's minimum time (ns per inner product) pairwise.
+    /// Errors if the result sets differ in length or describe different runs;
+    /// fails when any relative change exceeds `min_time_regression` or cannot
+    /// be computed.
+    fn check(
+        &self,
+        tolerance: &MultiVectorTolerance,
+        _input: &MultiVectorOp,
+        before: &Vec<RunResult>,
+        after: &Vec<RunResult>,
+    ) -> anyhow::Result<PassFail<CheckResult, CheckResult>> {
+        anyhow::ensure!(
+            before.len() == after.len(),
+            "before has {} runs but after has {}",
+            before.len(),
+            after.len(),
+        );
+
+        let limit = tolerance.min_time_regression.get();
+        let mut regressed = false;
+        let mut checks = Vec::with_capacity(before.len());
+
+        for (i, (b, a)) in std::iter::zip(before, after).enumerate() {
+            anyhow::ensure!(b.run == a.run, "run {i} mismatched");
+
+            let per_sample = b.computations_per_latency() as f64;
+            let before_min = b.percentiles.minimum.as_f64() * 1000.0 / per_sample;
+            let after_min = a.percentiles.minimum.as_f64() * 1000.0 / per_sample;
+
+            // An uncomputable change counts as a regression, not a pass.
+            match relative_change(before_min, after_min) {
+                Ok(change) if change > limit => regressed = true,
+                Ok(_) => {}
+                Err(_) => regressed = true,
+            }
+
+            checks.push(Comparison {
+                run: b.run.clone(),
+                tolerance: *tolerance,
+                before_min,
+                after_min,
+            });
+        }
+
+        let check = CheckResult { checks };
+        Ok(if regressed {
+            PassFail::Fail(check)
+        } else {
+            PassFail::Pass(check)
+        })
+    }
+}
+
+//////////////////////
+// RunBenchmark impls
+//////////////////////
+
+/// Element-type-generic constructor for [`QueryComputer<T>`]. `QueryComputer`'s
+/// `new` / `from_arch` are inherent methods on the concrete `QueryComputer<f32>`
+/// and `QueryComputer<half::f16>` types, so generic code needs this shim.
+pub(super) trait BuildArchQc<T: Copy> {
+    /// Build a `QueryComputer<T>` pinned to the host's auto-dispatched arch.
+    fn build_auto(query: MatRef<'_, Standard<T>>) -> QueryComputer<T>;
+}
+
+impl BuildArchQc<f32> for f32 {
+    fn build_auto(query: MatRef<'_, Standard<f32>>) -> QueryComputer<f32> {
+        QueryComputer::<f32>::new(query)
+    }
+}
+
+impl BuildArchQc<half::f16> for half::f16 {
+    fn build_auto(query: MatRef<'_, Standard<half::f16>>) -> QueryComputer<half::f16> {
+        QueryComputer::<half::f16>::new(query)
+    }
+}
+
+/// Per-(arch, T) constructor for `QueryComputer::from_arch`. Same idea as
+/// [`BuildArchQc::build_auto`] but pinned to a specific arch token.
+pub(super) trait BuildPinnedQc<A: Architecture, T: Copy> {
+    fn build_pinned(query: MatRef<'_, Standard<T>>, arch: A) -> QueryComputer<T>;
+}
+
+macro_rules! impl_build_pinned {
+    ($arch:path, $T:ty) => {
+        impl BuildPinnedQc<$arch, $T> for $T {
+            fn build_pinned(query: MatRef<'_, Standard<$T>>, arch: $arch) -> QueryComputer<$T> {
+                QueryComputer::<$T>::from_arch(query, arch)
+            }
+        }
+    };
+}
+
+impl_build_pinned!(Scalar, f32);
+impl_build_pinned!(Scalar, half::f16);
+#[cfg(target_arch = "x86_64")]
+impl_build_pinned!(V3, f32);
+#[cfg(target_arch = "x86_64")]
+impl_build_pinned!(V3, half::f16);
+#[cfg(target_arch = "x86_64")]
+impl_build_pinned!(V4, f32);
+#[cfg(target_arch = "x86_64")]
+impl_build_pinned!(V4, half::f16);
+#[cfg(target_arch = "aarch64")]
+impl_build_pinned!(Neon, f32);
+#[cfg(target_arch = "aarch64")]
+impl_build_pinned!(Neon, half::f16);
+
+/// Stamp out `RunBenchmark<$arch>` for `Kernel<$arch, $T>` via [`BuildPinnedQc`]
+/// (`QueryComputer::from_arch`); the 3-argument form gates it on `target_arch`.
+macro_rules! stamp {
+    ($arch:path, $T:ty) => {
+        impl RunBenchmark<$arch> for Kernel<$arch, $T>
+        where
+            StandardUniform: Distribution<$T>,
+            $T: BuildPinnedQc<$arch, $T>,
+        {
+            fn run_benchmark(&self, input: &MultiVectorOp) -> anyhow::Result<Vec<RunResult>> {
+                let arch = Identity::<$arch>::convert(input.arch)?.0;
+                let mut results = Vec::with_capacity(input.runs.len());
+                for run in input.runs.iter() {
+                    let data = Data::<$T>::new(run);
+                    // `QueryComputer` performs query-side precomputation that is
+                    // intentionally amortized across many `max_sim` calls;
+                    // construct it once per shape, outside the timed loop.
+                    let qc = <$T as BuildPinnedQc<$arch, $T>>::build_pinned(
+                        data.queries.as_view(),
+                        arch,
+                    );
+                    let dist = OptimizedDistance(qc);
+                    results.push(run_with_distance(run, data.docs.as_view(), &dist));
+                }
+                Ok(results)
+            }
+        }
+    };
+    ($target_arch:literal, $arch:path, $T:ty) => {
+        #[cfg(target_arch = $target_arch)]
+        stamp!($arch, $T);
+    };
+}
+
+stamp!(Scalar, f32);
+stamp!(Scalar, half::f16);
+stamp!("x86_64", V3, f32);
+stamp!("x86_64", V3, half::f16);
+stamp!("x86_64", V4, f32);
+stamp!("x86_64", V4, half::f16);
+stamp!("aarch64", Neon, f32);
+stamp!("aarch64", Neon, half::f16);
+
+// Auto and Reference get hand-written impls (different construction paths).
+
+impl<T> RunBenchmark<Auto> for Kernel<Auto, T>
+where
+    T: Copy + 'static + BuildArchQc<T>,
+    StandardUniform: Distribution<T>,
+{
+    fn run_benchmark(&self, input: &MultiVectorOp) -> anyhow::Result<Vec<RunResult>> {
+        let mut results = Vec::with_capacity(input.runs.len());
+        for run in input.runs.iter() {
+            let data = Data::<T>::new(run);
+            let qc = <T as BuildArchQc<T>>::build_auto(data.queries.as_view());
+            let dist = OptimizedDistance(qc);
+            results.push(run_with_distance(run, data.docs.as_view(), &dist));
+        }
+        Ok(results)
+    }
+}
+
+impl<T> RunBenchmark<Reference> for Kernel<Reference, T>
+where
+    T: Copy + 'static,
+    StandardUniform: Distribution<T>,
+    InnerProduct: for<'a, 'b> PureDistanceFunction<&'a [T], &'b [T], f32>,
+    for<'a> ReferenceDistance<'a, T>: super::driver::Distance<T>,
+{
+    fn run_benchmark(&self, input: &MultiVectorOp) -> anyhow::Result<Vec<RunResult>> {
+        let mut results = Vec::with_capacity(input.runs.len());
+        for run in input.runs.iter() {
+            let data = Data::<T>::new(run);
+            let dist = ReferenceDistance(data.queries.as_view().into());
+            results.push(run_with_distance(run, data.docs.as_view(), &dist));
+        }
+        Ok(results)
+    }
+}
+
+//////////////////////
+// Registration     //
+//////////////////////
+
+pub(super) fn register(benchmarks: &mut diskann_benchmark_runner::registry::Benchmarks) {
+    benchmarks.register_regression("multi-vector-op-f32-auto", Kernel::<Auto, f32>::new());
+    benchmarks.register_regression("multi-vector-op-f16-auto", Kernel::<Auto, half::f16>::new());
+
+    benchmarks.register_regression("multi-vector-op-f32-scalar", Kernel::<Scalar, f32>::new());
+    benchmarks.register_regression(
+        "multi-vector-op-f16-scalar",
+        Kernel::<Scalar, half::f16>::new(),
+    );
+
+    benchmarks.register_regression(
+        "multi-vector-op-f32-reference",
+        Kernel::<Reference, f32>::new(),
+    );
+    benchmarks.register_regression(
+        "multi-vector-op-f16-reference",
+        Kernel::<Reference, half::f16>::new(),
+    );
+
+    #[cfg(target_arch = "x86_64")]
+    {
+        benchmarks.register_regression("multi-vector-op-f32-x86_64_V3", Kernel::<V3, f32>::new());
+        benchmarks.register_regression(
+            "multi-vector-op-f16-x86_64_V3",
+            Kernel::<V3, half::f16>::new(),
+        );
+        benchmarks.register_regression("multi-vector-op-f32-x86_64_V4", Kernel::<V4, f32>::new());
+        benchmarks.register_regression(
+            "multi-vector-op-f16-x86_64_V4",
+            Kernel::<V4, half::f16>::new(),
+        );
+    }
+
+    #[cfg(target_arch = "aarch64")]
+    {
+        benchmarks.register_regression(
+            "multi-vector-op-f32-aarch64_neon",
+            Kernel::<Neon, f32>::new(),
+        );
+        benchmarks.register_regression(
+            "multi-vector-op-f16-aarch64_neon",
+            Kernel::<Neon, half::f16>::new(),
+        );
+    }
+}
diff --git a/diskann-benchmark/src/backend/multi_vector/mod.rs b/diskann-benchmark/src/backend/multi_vector/mod.rs
new file mode 100644
index 000000000..90426e571
--- /dev/null
+++ b/diskann-benchmark/src/backend/multi_vector/mod.rs
@@ -0,0 +1,233 @@
+/*
+ * Copyright (c) Microsoft Corporation.
+ * Licensed under the MIT license.
+ */
+
+//! Multi-vector MaxSim distance benchmarks with regression detection.
+//!
+//! This module is a **kernel-research substrate**, not just a benchmark. It
+//! supports two distinct use cases:
+//!
+//! 1. **Head-to-head ISA (instruction set architecture) comparison.** Library
+//!    kernels are registered per arch (`scalar`, `x86-64-v3`, `x86-64-v4`,
+//!    `neon`) plus `auto` (CPU-detected) and `reference` (fallback).
+//!    Pinning to a specific ISA lets you compare e.g. AVX2 vs AVX512 on the
+//!    same AVX512 host.
+//!
+//! 2. **Experimental kernel authoring.** External crates and the
+//!    `experimental/` submodule can author new SIMD micro-kernels by
+//!    implementing the public `Kernel<A>` trait in
+//!    `diskann_quantization::multi_vector::distance::kernels`, plug them
+//!    into the existing cache-aware tile orchestrator (`tiled_reduce`),
+//!    and slot them into the benchmark via
+//!    `QueryComputer::from_dyn(Box::new(...))`.
+//!
+//! # Adding a new experimental kernel
+//!
+//! See `experimental/template.rs` for the full step-by-step workflow with
+//! a worked example. Summary:
+//!
+//! 1. Add a variant to [`crate::inputs::multi_vector::Arch`].
+//! 2. Implement `Kernel<A>` for your micro-kernel.
+//! 3. Implement `DynQueryComputer<T>` for your adapter, calling
+//!    `tiled_reduce` with your kernel.
+//! 4. Add a marker type + `DispatchRule<Arch>` impl so the new variant
+//!    routes to your kernel.
+//! 5. Add a `RunBenchmark<Marker>` impl + `register_regression(...)` call
+//!    in `experimental::register`.
+//!
+//! **Validate experimental kernels under Miri:**
+//! - Construct arch tokens via `Scalar::new()` (Miri-safe) or
+//!   `V4::new_checked_miri()` (Miri-safe AVX-512 emulation). `V3::new_checked()`
+//!   and `Neon::new_checked()` don't have `_miri` variants today; if you need
+//!   them under Miri, follow `V4::new_checked_miri()`'s pattern.
+//! - Gate Miri-unsupported intrinsics with `#[cfg(not(miri))]`.
+//! - Reduce test-sweep size under `cfg(miri)` to keep runtimes reasonable.
+
+use diskann_benchmark_runner::registry::Benchmarks;
+
+cfg_if::cfg_if! {
+    if #[cfg(feature = "multi-vector")] {
+        mod driver;
+        mod experimental;
+        mod library_kernels;
+
+        pub(super) fn register_benchmarks(benchmarks: &mut Benchmarks) {
+            library_kernels::register(benchmarks);
+            experimental::register(benchmarks);
+        }
+    } else {
+        crate::utils::stub_impl!("multi-vector", inputs::multi_vector::MultiVectorOp);
+
+        pub(super) fn register_benchmarks(benchmarks: &mut Benchmarks) {
+            imp::register("multi-vector-op", benchmarks);
+        }
+    }
+}
+
+#[cfg(all(test, feature = "multi-vector"))]
+mod tests {
+    use std::num::NonZeroUsize;
+
+    use diskann_benchmark_runner::{
+        benchmark::{PassFail, Regression},
+        utils::{
+            datatype::DataType, num::NonNegativeFinite, percentiles::compute_percentiles,
+            MicroSeconds,
+        },
+    };
+
+    use super::driver::{CheckResult, Comparison, RunResult};
+    use super::library_kernels::{Auto, Kernel};
+    use crate::inputs::multi_vector::{Arch, MultiVectorOp, MultiVectorTolerance, Run};
+
+    fn tiny_run() -> Run {
+        Run {
+            num_query_vectors: NonZeroUsize::new(2).unwrap(),
+            num_doc_vectors: NonZeroUsize::new(2).unwrap(),
+            dim: NonZeroUsize::new(4).unwrap(),
+            loops_per_measurement: NonZeroUsize::new(1).unwrap(),
+            num_measurements: NonZeroUsize::new(1).unwrap(),
+        }
+    }
+
+    fn tiny_op() -> MultiVectorOp {
+        MultiVectorOp {
+            element_type: DataType::Float32,
+            arch: Arch::Auto,
+            runs: vec![tiny_run()],
+        }
+    }
+
+    fn tiny_result(minimum: u64) -> RunResult {
+        let mut latencies = vec![MicroSeconds::new(minimum)];
+        let percentiles = compute_percentiles(&mut latencies).unwrap();
+        RunResult {
+            run: tiny_run(),
+            latencies,
+            percentiles,
+        }
+    }
+
+    fn tolerance(limit: f64) -> MultiVectorTolerance {
+        MultiVectorTolerance {
+            min_time_regression: NonNegativeFinite::new(limit).unwrap(),
+        }
+    }
+
+    #[test]
+    fn check_rejects_mismatched_runs() {
+        let kernel = Kernel::<Auto, f32>::new();
+
+        // Build a result whose `run` diverges from `tiny_run()` so the
+        // regression check's `b.run == a.run` invariant fires.
+        let mut latencies = vec![MicroSeconds::new(100)];
+        let percentiles = compute_percentiles(&mut latencies).unwrap();
+        let mismatched_result = RunResult {
+            run: Run {
+                num_query_vectors: NonZeroUsize::new(4).unwrap(),
+                ..tiny_run()
+            },
+            latencies,
+            percentiles,
+        };
+
+        let err = kernel
+            .check(
+                &tolerance(0.0),
+                &tiny_op(),
+                &vec![tiny_result(100)],
+                &vec![mismatched_result],
+            )
+            .unwrap_err();
+
+        assert_eq!(err.to_string(), "run 0 mismatched");
+    }
+
+    #[test]
+    fn check_allows_negative_relative_change() {
+        let kernel = Kernel::<Auto, f32>::new();
+
+        let result = kernel
+            .check(
+                &tolerance(0.0),
+                &tiny_op(),
+                &vec![tiny_result(100)],
+                &vec![tiny_result(95)],
+            )
+            .unwrap();
+
+        assert!(matches!(result, PassFail::Pass(_)));
+    }
+
+    #[test]
+    fn check_passes_on_tolerance_boundary() {
+        let kernel = Kernel::<Auto, f32>::new();
+
+        let result = kernel
+            .check(
+                &tolerance(0.05),
+                &tiny_op(),
+                &vec![tiny_result(100)],
+                &vec![tiny_result(105)],
+            )
+            .unwrap();
+
+        assert!(matches!(result, PassFail::Pass(_)));
+    }
+
+    #[test]
+    fn check_fails_above_tolerance_boundary() {
+        let kernel = Kernel::<Auto, f32>::new();
+
+        let result = kernel
+            .check(
+                &tolerance(0.05),
+                &tiny_op(),
+                &vec![tiny_result(100)],
+                &vec![tiny_result(106)],
+            )
+            .unwrap();
+
+        assert!(matches!(result, PassFail::Fail(_)));
+    }
+
+    #[test]
+    fn check_result_display_includes_failure_details() {
+        let check = CheckResult {
+            checks: vec![Comparison {
+                run: tiny_run(),
+                tolerance: tolerance(0.05),
+                before_min: 100.0,
+                after_min: 106.0,
+            }],
+        };
+
+        let rendered = check.to_string();
+        assert!(rendered.contains("Q"), "rendered = {rendered}");
+        assert!(rendered.contains("Dim"), "rendered = {rendered}");
+        assert!(rendered.contains("100.000"), "rendered = {rendered}");
+        assert!(rendered.contains("106.000"), "rendered = {rendered}");
+        assert!(rendered.contains("6.000 %"), "rendered = {rendered}");
+        assert!(rendered.contains("FAIL"), "rendered = {rendered}");
+    }
+
+    /// A "before" value of 0 means the measurement was too fast to obtain a
+    /// reliable signal, so we *could* be letting a regression through. We
+    /// require at least a non-zero value.
+    #[test]
+    fn zero_values_rejected() {
+        let kernel = Kernel::<Auto, f32>::new();
+
+        let result = kernel
+            .check(
+                &tolerance(0.05),
+                &tiny_op(),
+                &vec![tiny_result(0)],
+                &vec![tiny_result(0)],
+            )
+            .unwrap();
+
+        assert!(matches!(result, PassFail::Fail(_)));
+    }
+}
diff --git a/diskann-benchmark/src/inputs/multi_vector.rs b/diskann-benchmark/src/inputs/multi_vector.rs
index 8010162d6..150d72a8e 100644
--- a/diskann-benchmark/src/inputs/multi_vector.rs
+++ b/diskann-benchmark/src/inputs/multi_vector.rs
@@ -32,46 +32,46 @@ pub(super) fn register_inputs(
 // Enum types //
 ////////////////
 
-/// The two distance operations exposed by `QueryComputer`.
-#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
-#[serde(rename_all = "snake_case")]
-pub(crate) enum Operation {
-    Chamfer,
-    MaxSim,
-}
-
-impl std::fmt::Display for Operation {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        let st = match self {
-            Self::Chamfer => "chamfer",
-            Self::MaxSim => "max_sim",
-        };
-        write!(f, "{}", st)
-    }
-}
-
-/// Which implementation tier to benchmark.
+/// Which kernel to benchmark.
+///
+/// Mirrors `diskann-benchmark-simd`'s `Arch` enum: kebab-case serialization,
+/// one variant per supported ISA plus `Reference` (fallback) and `Auto`
+/// (host-portable). Marked `#[non_exhaustive]` to signal that experimental
+/// kernels may add variants; the attribute has no effect inside this crate.
 #[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
 #[serde(rename_all = "kebab-case")]
-pub(crate) enum Implementation {
-    Optimized,
+#[non_exhaustive]
+pub(crate) enum Arch {
+    #[serde(rename = "x86-64-v4")]
+    #[allow(non_camel_case_types)]
+    X86_64_V4,
+    #[serde(rename = "x86-64-v3")]
+    #[allow(non_camel_case_types)]
+    X86_64_V3,
+    Neon,
+    Scalar,
     Reference,
+    /// Auto-dispatch to the host's best supported arch (calls `QueryComputer::new`).
+    Auto,
 }
 
-impl std::fmt::Display for Implementation {
+impl std::fmt::Display for Arch {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         let st = match self {
-            Self::Optimized => "optimized",
+            Self::X86_64_V4 => "x86-64-v4",
+            Self::X86_64_V3 => "x86-64-v3",
+            Self::Neon => "neon",
+            Self::Scalar => "scalar",
             Self::Reference => "reference",
+            Self::Auto => "auto",
         };
         write!(f, "{}", st)
     }
 }
 
-/// One benchmark configuration: a single (operation, shape) measurement.
+/// One benchmark configuration: a single shape measurement.
 #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub(crate) struct Run {
-    pub(crate) operation: Operation,
     pub(crate) num_query_vectors: NonZeroUsize,
     pub(crate) num_doc_vectors: NonZeroUsize,
     pub(crate) dim: NonZeroUsize,
@@ -87,7 +87,7 @@ pub(crate) struct Run {
 #[derive(Debug, Serialize, Deserialize)]
 pub(crate) struct MultiVectorOp {
     pub(crate) element_type: DataType,
-    pub(crate) implementation: Implementation,
+    pub(crate) arch: Arch,
     pub(crate) runs: Vec<Run>,
 }
 
@@ -105,7 +105,6 @@ impl CheckDeserialization for MultiVectorOp {
 
 impl Example for MultiVectorOp {
     fn example() -> Self {
-        const NUM_QUERY_VECTORS: NonZeroUsize = NonZeroUsize::new(32).unwrap();
         const NUM_DOC_VECTORS: NonZeroUsize = NonZeroUsize::new(64).unwrap();
         const DIM: NonZeroUsize = NonZeroUsize::new(128).unwrap();
         const LOOPS_PER_MEASUREMENT: NonZeroUsize = NonZeroUsize::new(200).unwrap();
@@ -113,16 +112,14 @@ impl Example for MultiVectorOp {
 
         let runs = vec![
             Run {
-                operation: Operation::Chamfer,
-                num_query_vectors: NUM_QUERY_VECTORS,
+                num_query_vectors: NonZeroUsize::new(32).unwrap(),
                 num_doc_vectors: NUM_DOC_VECTORS,
                 dim: DIM,
                 loops_per_measurement: LOOPS_PER_MEASUREMENT,
                 num_measurements: NUM_MEASUREMENTS,
             },
             Run {
-                operation: Operation::MaxSim,
-                num_query_vectors: NUM_QUERY_VECTORS,
+                num_query_vectors: NonZeroUsize::new(64).unwrap(),
                 num_doc_vectors: NUM_DOC_VECTORS,
                 dim: DIM,
                 loops_per_measurement: LOOPS_PER_MEASUREMENT,
@@ -132,7 +129,7 @@ impl Example for MultiVectorOp {
 
         Self {
             element_type: DataType::Float32,
-            implementation: Implementation::Optimized,
+            arch: Arch::Auto,
             runs,
         }
     }
@@ -149,7 +146,7 @@ impl std::fmt::Display for MultiVectorOp {
         writeln!(f, "Multi-Vector Operation\n")?;
         write_field!(f, "tag", Self::tag())?;
         write_field!(f, "element type", self.element_type)?;
-        write_field!(f, "implementation", self.implementation)?;
+        write_field!(f, "arch", self.arch)?;
         write_field!(f, "number of runs", self.runs.len())?;
         Ok(())
     }
diff --git a/diskann-benchmark/src/main.rs b/diskann-benchmark/src/main.rs
index 9dba42609..5f641dd9f 100644
--- a/diskann-benchmark/src/main.rs
+++ b/diskann-benchmark/src/main.rs
@@ -782,7 +782,7 @@ mod tests {
 
     #[test]
     fn multi_vector_integration() {
-        let path = example_directory().join("multi-vector-test.json");
+        let path = example_directory().join("multi-vector.json");
         let tempdir = tempfile::tempdir().unwrap();
         let output_path = tempdir.path().join("output.json");
         assert!(!output_path.exists());
@@ -843,7 +843,7 @@ mod tests {
     #[test]
     #[cfg(feature = "multi-vector")]
     fn multi_vector_check_verify() {
-        let input_path = example_directory().join("multi-vector-test.json");
+        let input_path = example_directory().join("multi-vector.json");
         let tolerance_path = project_directory()
             .join("perf_test_inputs")
             .join("multi-vector-tolerance.json");
diff --git a/diskann-quantization/src/multi_vector/distance/kernels/f16.rs b/diskann-quantization/src/multi_vector/distance/kernels/f16.rs
index a535c68dc..e6dc8a772 100644
--- a/diskann-quantization/src/multi_vector/distance/kernels/f16.rs
+++ b/diskann-quantization/src/multi_vector/distance/kernels/f16.rs
@@ -34,7 +34,7 @@ impl<A, const GROUP: usize>
 where
     A: Architecture,
     F32Kernel<GROUP>: Kernel<A>,
-    layouts::BlockTransposed<half::f16, GROUP>: layouts::ConvertTo<A, <F32Kernel<GROUP> as Kernel<A>>::Left>
+    layouts::BlockTransposedLayout<half::f16, GROUP>: layouts::ConvertTo<A, <F32Kernel<GROUP> as Kernel<A>>::Left>
         + layouts::Layout<Element = half::f16>,
     layouts::RowMajor<half::f16>: layouts::ConvertTo<A, <F32Kernel<GROUP> as Kernel<A>>::Right>
         + layouts::Layout<Element = half::f16>,
diff --git a/diskann-quantization/src/multi_vector/distance/kernels/f32/mod.rs b/diskann-quantization/src/multi_vector/distance/kernels/f32/mod.rs
index a900ea356..602da6324 100644
--- a/diskann-quantization/src/multi_vector/distance/kernels/f32/mod.rs
+++ b/diskann-quantization/src/multi_vector/distance/kernels/f32/mod.rs
@@ -30,7 +30,7 @@ mod scalar;
 mod v3;
 
 /// Zero-sized kernel type for f32 micro-kernels with block size `GROUP`.
-pub(crate) struct F32Kernel<const GROUP: usize>;
+pub struct F32Kernel<const GROUP: usize>;
 
 #[inline(never)]
 #[cold]
@@ -66,7 +66,7 @@ pub(super) fn max_ip_kernel<A: Architecture, T: Copy, const GROUP: usize>(
     budget: TileBudget,
 ) where
     F32Kernel<GROUP>: Kernel<A>,
-    layouts::BlockTransposed<T, GROUP>:
+    layouts::BlockTransposedLayout<T, GROUP>:
         layouts::ConvertTo<A, <F32Kernel<GROUP> as Kernel<A>>::Left> + layouts::Layout<Element = T>,
     layouts::RowMajor<T>: layouts::ConvertTo<A, <F32Kernel<GROUP> as Kernel<A>>::Right>
         + layouts::Layout<Element = T>,
@@ -117,7 +117,7 @@ impl<A, const GROUP: usize>
 where
     A: Architecture,
     Self: Kernel<A>,
-    layouts::BlockTransposed<f32, GROUP>:
+    layouts::BlockTransposedLayout<f32, GROUP>:
         layouts::ConvertTo<A, <Self as Kernel<A>>::Left> + layouts::Layout<Element = f32>,
     layouts::RowMajor<f32>:
         layouts::ConvertTo<A, <Self as Kernel<A>>::Right> + layouts::Layout<Element = f32>,
diff --git a/diskann-quantization/src/multi_vector/distance/kernels/f32/scalar.rs b/diskann-quantization/src/multi_vector/distance/kernels/f32/scalar.rs
index bd8fb1c4a..2a230ca62 100644
--- a/diskann-quantization/src/multi_vector/distance/kernels/f32/scalar.rs
+++ b/diskann-quantization/src/multi_vector/distance/kernels/f32/scalar.rs
@@ -27,7 +27,7 @@ diskann_wide::alias!(f32s = <Scalar>::f32x8);
 // A_PANEL(8) * k A elements, UNROLL * k B elements, and A_PANEL(8)
 // scratch elements — all within the bounds guaranteed by `tiled_reduce`.
 unsafe impl Kernel<Scalar> for F32Kernel<8> {
-    type Left = layouts::BlockTransposed<f32, 8>;
+    type Left = layouts::BlockTransposedLayout<f32, 8>;
     type Right = layouts::RowMajor<f32>;
     const A_PANEL: usize = 8;
     const B_PANEL: usize = 2;
diff --git a/diskann-quantization/src/multi_vector/distance/kernels/f32/v3.rs b/diskann-quantization/src/multi_vector/distance/kernels/f32/v3.rs
index b05195b1e..319cf1cda 100644
--- a/diskann-quantization/src/multi_vector/distance/kernels/f32/v3.rs
+++ b/diskann-quantization/src/multi_vector/distance/kernels/f32/v3.rs
@@ -17,7 +17,7 @@ diskann_wide::alias!(f32s = <V3>::f32x8);
 // A_PANEL(16) * k A elements, UNROLL * k B elements, and A_PANEL(16)
 // scratch elements — all within the bounds guaranteed by `tiled_reduce`.
 unsafe impl Kernel<V3> for F32Kernel<16> {
-    type Left = layouts::BlockTransposed<f32, 16>;
+    type Left = layouts::BlockTransposedLayout<f32, 16>;
     type Right = layouts::RowMajor<f32>;
     const A_PANEL: usize = 16;
     const B_PANEL: usize = 4;
diff --git a/diskann-quantization/src/multi_vector/distance/kernels/layouts.rs b/diskann-quantization/src/multi_vector/distance/kernels/layouts.rs
index e1ec8dd36..54962acaa 100644
--- a/diskann-quantization/src/multi_vector/distance/kernels/layouts.rs
+++ b/diskann-quantization/src/multi_vector/distance/kernels/layouts.rs
@@ -4,7 +4,7 @@
 //! Layout markers and tile-level conversion traits.
 //!
 //! - [`Layout`] — marker trait: memory layout + element type.
-//! - [`BlockTransposed`] / [`RowMajor`] — zero-sized layout markers.
+//! - [`BlockTransposedLayout`] / [`RowMajor`] — zero-sized layout markers.
 //! - [`DescribeLayout`] — bridges matrix types to layout markers.
 //! - [`ConvertTo`] — tile-level conversion (blanket identity + f16→f32).
 
@@ -17,7 +17,7 @@ use diskann_wide::arch::Target2;
 // ── Layout trait ─────────────────────────────────────
 
 /// Memory layout and element type marker for tile data.
-pub(super) trait Layout {
+pub trait Layout {
     type Element: Copy;
 }
 
@@ -25,28 +25,36 @@ pub(super) trait Layout {
 
 /// Block-transposed tile layout: `GROUP` rows per block, `PACK` columns
 /// interleaved. Matches [`BlockTransposedRef`](crate::multi_vector::BlockTransposedRef).
-pub(super) struct BlockTransposed<T, const GROUP: usize, const PACK: usize = 1>(PhantomData<T>);
-
-impl<T, const GROUP: usize, const PACK: usize> BlockTransposed<T, GROUP, PACK> {
+///
+/// This is the zero-sized **layout marker** used in [`Kernel<A>::Left`] /
+/// [`Kernel<A>::Right`](super::Kernel) associated types. It is distinct
+/// from the owning storage type [`BlockTransposed`](crate::multi_vector::BlockTransposed)
+/// — the marker carries layout information at the type level; the owning
+/// type holds actual data.
+pub struct BlockTransposedLayout<T, const GROUP: usize, const PACK: usize = 1>(PhantomData<T>);
+
+impl<T, const GROUP: usize, const PACK: usize> BlockTransposedLayout<T, GROUP, PACK> {
     pub(super) fn new() -> Self {
         Self(PhantomData)
     }
 }
 
-impl<T, const GROUP: usize, const PACK: usize> Copy for BlockTransposed<T, GROUP, PACK> {}
+impl<T, const GROUP: usize, const PACK: usize> Copy for BlockTransposedLayout<T, GROUP, PACK> {}
 
-impl<T, const GROUP: usize, const PACK: usize> Clone for BlockTransposed<T, GROUP, PACK> {
+impl<T, const GROUP: usize, const PACK: usize> Clone for BlockTransposedLayout<T, GROUP, PACK> {
     fn clone(&self) -> Self {
         *self
     }
 }
 
-impl<T: Copy, const GROUP: usize, const PACK: usize> Layout for BlockTransposed<T, GROUP, PACK> {
+impl<T: Copy, const GROUP: usize, const PACK: usize> Layout
+    for BlockTransposedLayout<T, GROUP, PACK>
+{
     type Element = T;
 }
 
 /// Dense row-major tile layout. Matches [`MatRef<Standard<T>>`](crate::multi_vector::MatRef).
-pub(super) struct RowMajor<T>(PhantomData<T>);
+pub struct RowMajor<T>(PhantomData<T>);
 
 impl<T> RowMajor<T> {
     pub(super) fn new() -> Self {
@@ -70,7 +78,7 @@ impl<T: Copy> Layout for RowMajor<T> {
 
 /// Bridges a concrete matrix type to its [`Layout`] marker, enabling
 /// type inference of [`ConvertTo`] parameters at call sites.
-pub(super) trait DescribeLayout {
+pub trait DescribeLayout {
     type Layout: Layout;
 
     fn layout(&self) -> Self::Layout;
@@ -79,10 +87,10 @@ pub(super) trait DescribeLayout {
 impl<T: Copy, const GROUP: usize, const PACK: usize> DescribeLayout
     for crate::multi_vector::BlockTransposedRef<'_, T, GROUP, PACK>
 {
-    type Layout = BlockTransposed<T, GROUP, PACK>;
+    type Layout = BlockTransposedLayout<T, GROUP, PACK>;
 
     fn layout(&self) -> Self::Layout {
-        BlockTransposed::new()
+        BlockTransposedLayout::new()
     }
 }
 
@@ -108,7 +116,7 @@ impl<T: Copy> DescribeLayout for crate::multi_vector::MatRef<'_, crate::multi_ve
 /// - `convert` reads at most `rows * k` source elements.
 /// - `convert` writes only within `buf`.
 /// - The returned pointer is valid until the next `&mut` access to `buf`.
-pub(super) unsafe trait ConvertTo<A: Architecture, To: Layout>: Layout {
+pub unsafe trait ConvertTo<A: Architecture, To: Layout>: Layout {
     /// Staging buffer for converted tile data (`()` for identity conversions).
     type Buffer;
 
@@ -162,7 +170,8 @@ unsafe impl<A: Architecture, L: Layout> ConvertTo<A, L> for L {
 // into `rows * k` f32 values in `buf`. The returned pointer is
 // `buf.as_ptr()`, valid until the next `&mut` access to `buf`.
 unsafe impl<A, const GROUP: usize, const PACK: usize>
-    ConvertTo<A, BlockTransposed<f32, GROUP, PACK>> for BlockTransposed<half::f16, GROUP, PACK>
+    ConvertTo<A, BlockTransposedLayout<f32, GROUP, PACK>>
+    for BlockTransposedLayout<half::f16, GROUP, PACK>
 where
     A: Architecture,
     SliceCast<f32, half::f16>: for<'a> Target2<A, (), &'a mut [f32], &'a [half::f16]>,
diff --git a/diskann-quantization/src/multi_vector/distance/kernels/mod.rs b/diskann-quantization/src/multi_vector/distance/kernels/mod.rs
index bd9121a24..e7fbd16ed 100644
--- a/diskann-quantization/src/multi_vector/distance/kernels/mod.rs
+++ b/diskann-quantization/src/multi_vector/distance/kernels/mod.rs
@@ -12,27 +12,45 @@
 //! - **Query**: Block-transposed (`GROUP` vectors per block, dimensions contiguous
 //!   within each block). The block size is determined by the kernel's `A_PANEL`.
 //! - **Document**: Row-major (standard [`MatRef`](crate::multi_vector::MatRef) format).
+//!
+//! The [`Kernel<A>`] trait + [`tiled_reduce`] + [`layouts`] are public so
+//! external crates can plug new micro-kernels into the existing orchestrator.
 
 pub(super) mod f16;
 pub(super) mod f32;
-mod layouts;
+pub mod layouts;
 mod reduce;
 mod tiled_reduce;
 
+pub use tiled_reduce::tiled_reduce;
+
 // ── Tile budget ──────────────────────────────────────────────────
 
 /// Cache budgets fed to the tile planner.
 ///
 /// `Default` returns the production budgets derived from hardcoded L1/L2
-/// cache-size estimates and fixed fractions.
+/// cache-size estimates and fixed fractions. Researchers benchmarking with
+/// non-default cache assumptions can construct a custom [`TileBudget`] via
+/// [`TileBudget::new`] and pass it to [`tiled_reduce`].
 #[derive(Debug, Clone, Copy)]
-struct TileBudget {
+pub struct TileBudget {
     /// L2 budget in bytes reserved for A tiles.
     l2_a: usize,
     /// L1 budget in bytes reserved for B tiles (before A-panel subtraction).
     l1_b: usize,
 }
 
+impl TileBudget {
+    /// Construct a [`TileBudget`] with explicit L2 and L1 byte budgets.
+    ///
+    /// `l2_a` is the budget the tile planner uses to size A tiles; `l1_b`
+    /// is the budget for B tiles (one A micro-panel is subtracted at
+    /// runtime since both must coexist in L1 during the inner loop).
+    pub fn new(l2_a: usize, l1_b: usize) -> Self {
+        Self { l2_a, l1_b }
+    }
+}
+
 impl Default for TileBudget {
     // TODO: Replace hardcoded fallbacks with detected cache sizes
     // (e.g. via `diskann_platform`, env-var override, or runtime query).
@@ -51,7 +69,7 @@ impl Default for TileBudget {
 
 // ── Kernel trait ─────────────────────────────────────────────────
 
-/// SIMD micro-kernel for the [`tiled_reduce`](tiled_reduce::tiled_reduce) loop.
+/// SIMD micro-kernel for the [`tiled_reduce`] loop.
 ///
 /// The kernel only sees already-converted data: storage-layout to
 /// kernel-layout conversion is handled at tile boundaries by
@@ -59,17 +77,32 @@ impl Default for TileBudget {
 /// pointers reference `<Self::Left as Layout>::Element` /
 /// `<Self::Right as Layout>::Element` directly.
 ///
+/// # Invariant
+///
+/// When pairing this kernel with the owning storage type
+/// [`BlockTransposed<T, GROUP>`](crate::multi_vector::BlockTransposed) via
+/// [`tiled_reduce`], the storage's `GROUP` const must equal this kernel's
+/// [`A_PANEL`](Self::A_PANEL). The library's own f32 / f16 paths enforce
+/// this with a `const { assert!(...) }` in `max_ip_kernel`; external
+/// implementors must uphold it manually.
+///
 /// # Safety
 ///
 /// Implementors must respect the per-method `# Safety` contracts on
 /// [`full_panel`](Self::full_panel) and [`partial_panel`](Self::partial_panel).
-unsafe trait Kernel<A: diskann_wide::Architecture> {
+/// Implementations should be validated under Miri: construct arch tokens
+/// via `*::new_checked_miri()` in tests and gate Miri-unsupported
+/// intrinsics with `#[cfg(not(miri))]`.
+pub unsafe trait Kernel<A: diskann_wide::Architecture> {
     /// Layout consumed by the A (left / query) side of the micro-kernel.
     type Left: layouts::Layout;
     /// Layout consumed by the B (right / document) side of the micro-kernel.
     type Right: layouts::Layout;
 
     /// Number of A rows processed per micro-kernel invocation.
+    ///
+    /// Callers of [`tiled_reduce`] must guarantee
+    /// `a_padded_nrows % A_PANEL == 0`.
     const A_PANEL: usize;
     /// Number of B rows processed per micro-kernel invocation.
     const B_PANEL: usize;
@@ -79,10 +112,13 @@ unsafe trait Kernel<A: diskann_wide::Architecture> {
     /// # Safety
     ///
     /// * `a` must point to `A_PANEL * k` contiguous elements of
-    ///   `<Self::Left as Layout>::Element`.
+    ///   `<Self::Left as Layout>::Element`, properly aligned for that layout.
     /// * `b` must point to `B_PANEL * k` contiguous elements of
-    ///   `<Self::Right as Layout>::Element`.
+    ///   `<Self::Right as Layout>::Element`, properly aligned for that layout.
     /// * `r` must point to at least `A_PANEL` writable `f32` values.
+    /// * `k > 0`.
+    /// * The caller must invoke this from within an `arch.run3` (or
+    ///   equivalent) so that `target_feature` is active for the entire body.
     unsafe fn full_panel(
         arch: A,
         a: *const <Self::Left as layouts::Layout>::Element,
@@ -95,11 +131,8 @@ unsafe trait Kernel<A: diskann_wide::Architecture> {
     ///
     /// # Safety
     ///
-    /// * `a` must point to `A_PANEL * k` contiguous elements of
-    ///   `<Self::Left as Layout>::Element`.
-    /// * `b` must point to `remainder * k` contiguous elements of
-    ///   `<Self::Right as Layout>::Element`.
-    /// * `r` must point to at least `A_PANEL` writable `f32` values.
+    /// Same as [`full_panel`](Self::full_panel) except `b` points to
+    /// `remainder * k` contiguous elements and `1 <= remainder < B_PANEL`.
     unsafe fn partial_panel(
         arch: A,
         remainder: usize,
diff --git a/diskann-quantization/src/multi_vector/distance/kernels/tiled_reduce.rs b/diskann-quantization/src/multi_vector/distance/kernels/tiled_reduce.rs
index ff873c01f..285d823b6 100644
--- a/diskann-quantization/src/multi_vector/distance/kernels/tiled_reduce.rs
+++ b/diskann-quantization/src/multi_vector/distance/kernels/tiled_reduce.rs
@@ -89,7 +89,7 @@ impl FullReduce {
 /// * `b_ptr` must be valid for `b_nrows * k` elements of `BElem`.
 /// * `scratch` must have length ≥ `a_padded_nrows` and be initialized by caller.
 #[allow(clippy::too_many_arguments)]
-pub(super) unsafe fn tiled_reduce<A, K, LA, LB>(
+pub unsafe fn tiled_reduce<A, K, LA, LB>(
     arch: A,
     ca: &LA,
     cb: &LB,
@@ -343,7 +343,7 @@ mod tests {
         let b = vec![0.0f32; 2 * k];
         let mut scratch = vec![f32::MIN; 16];
 
-        let ca = layouts::BlockTransposed::<f32, 8>::new();
+        let ca = layouts::BlockTransposedLayout::<f32, 8>::new();
         let cb = layouts::RowMajor::<f32>::new();
 
         // SAFETY: pointers and scratch are correctly sized; we expect a panic.
@@ -373,7 +373,7 @@ mod tests {
         let b = Vec::<f32>::new();
         let mut scratch = vec![f32::MIN; a_rows];
 
-        let ca = layouts::BlockTransposed::<f32, 8>::new();
+        let ca = layouts::BlockTransposedLayout::<f32, 8>::new();
         let cb = layouts::RowMajor::<f32>::new();
 
         // SAFETY: k == 0 so no elements are read; pointers are never dereferenced.
@@ -402,7 +402,7 @@ mod tests {
         let a_rows = 8;
         let mut scratch = vec![f32::MIN; a_rows];
 
-        let ca = layouts::BlockTransposed::<f32, 8>::new();
+        let ca = layouts::BlockTransposedLayout::<f32, 8>::new();
         let cb = layouts::RowMajor::<f32>::new();
 
         // SAFETY: k == 0, b_nrows == 0; no elements read.
@@ -516,7 +516,7 @@ mod tests {
         A: Architecture,
         T: Copy + Default,
         F32Kernel<GROUP>: Kernel<A>,
-        layouts::BlockTransposed<T, GROUP>:
+        layouts::BlockTransposedLayout<T, GROUP>:
             ConvertTo<A, <F32Kernel<GROUP> as Kernel<A>>::Left> + Layout<Element = T>,
         layouts::RowMajor<T>:
             ConvertTo<A, <F32Kernel<GROUP> as Kernel<A>>::Right> + Layout<Element = T>,
@@ -698,7 +698,7 @@ mod tests {
         A: Architecture,
         T: Copy + Default,
         F32Kernel<GROUP>: Kernel<A>,
-        layouts::BlockTransposed<T, GROUP>:
+        layouts::BlockTransposedLayout<T, GROUP>:
             ConvertTo<A, <F32Kernel<GROUP> as Kernel<A>>::Left> + Layout<Element = T>,
         layouts::RowMajor<T>:
             ConvertTo<A, <F32Kernel<GROUP> as Kernel<A>>::Right> + Layout<Element = T>,
diff --git a/diskann-quantization/src/multi_vector/distance/mod.rs b/diskann-quantization/src/multi_vector/distance/mod.rs
index 853f60753..354a47afe 100644
--- a/diskann-quantization/src/multi_vector/distance/mod.rs
+++ b/diskann-quantization/src/multi_vector/distance/mod.rs
@@ -50,10 +50,10 @@
 //! ```
 
 mod fallback;
-mod kernels;
+pub mod kernels;
 mod max_sim;
 mod query_computer;
 
 pub use fallback::QueryMatRef;
 pub use max_sim::{Chamfer, MaxSim, MaxSimError};
-pub use query_computer::QueryComputer;
+pub use query_computer::{DynQueryComputer, QueryComputer};
diff --git a/diskann-quantization/src/multi_vector/distance/query_computer/f16.rs b/diskann-quantization/src/multi_vector/distance/query_computer/f16.rs
index 9bb348a6a..cb575c09f 100644
--- a/diskann-quantization/src/multi_vector/distance/query_computer/f16.rs
+++ b/diskann-quantization/src/multi_vector/distance/query_computer/f16.rs
@@ -19,6 +19,18 @@ impl QueryComputer<half::f16> {
     pub fn new(query: MatRef<'_, Standard<half::f16>>) -> Self {
         diskann_wide::arch::dispatch1_no_features(BuildComputer, query)
     }
+
+    /// Build an f16 query computer pinned to a specific architecture token.
+    ///
+    /// See [`QueryComputer::<f32>::from_arch`] for the rationale and usage.
+    pub fn from_arch<A>(query: MatRef<'_, Standard<half::f16>>, arch: A) -> Self
+    where
+        A: Architecture,
+        BuildComputer:
+            for<'a> diskann_wide::arch::Target1<A, Self, MatRef<'a, Standard<half::f16>>>,
+    {
+        arch.run1(BuildComputer, query)
+    }
 }
 
 impl<A, const GROUP: usize> DynQueryComputer<half::f16>
@@ -51,8 +63,9 @@ where
     }
 }
 
+/// Architecture-dispatch target for `QueryComputer::<half::f16>` construction.
 #[derive(Debug, Clone, Copy)]
-pub(super) struct BuildComputer;
+pub struct BuildComputer;
 
 impl diskann_wide::arch::Target1<Scalar, QueryComputer<half::f16>, MatRef<'_, Standard<half::f16>>>
     for BuildComputer
diff --git a/diskann-quantization/src/multi_vector/distance/query_computer/f32.rs b/diskann-quantization/src/multi_vector/distance/query_computer/f32.rs
index 9ff16b8b4..f8193ad7e 100644
--- a/diskann-quantization/src/multi_vector/distance/query_computer/f32.rs
+++ b/diskann-quantization/src/multi_vector/distance/query_computer/f32.rs
@@ -19,6 +19,20 @@ impl QueryComputer<f32> {
     pub fn new(query: MatRef<'_, Standard<f32>>) -> Self {
         diskann_wide::arch::dispatch1_no_features(BuildComputer, query)
     }
+
+    /// Build an f32 query computer pinned to a specific architecture token.
+    ///
+    /// The caller obtains the token via `Scalar::new()` (always available)
+    /// or `V3::new_checked()` / `V4::new_checked()` / `Neon::new_checked()`
+    /// (which check CPU support). Use this constructor to A/B compare kernels
+    /// across ISAs on the same machine.
+    pub fn from_arch<A>(query: MatRef<'_, Standard<f32>>, arch: A) -> Self
+    where
+        A: Architecture,
+        BuildComputer: for<'a> diskann_wide::arch::Target1<A, Self, MatRef<'a, Standard<f32>>>,
+    {
+        arch.run1(BuildComputer, query)
+    }
 }
 
 impl<A, const GROUP: usize> DynQueryComputer<f32> for Prepared<A, BlockTransposed<f32, GROUP>>
@@ -50,8 +64,9 @@ where
     }
 }
 
+/// Architecture-dispatch target for `QueryComputer::<f32>` construction.
 #[derive(Debug, Clone, Copy)]
-pub(super) struct BuildComputer;
+pub struct BuildComputer;
 
 impl diskann_wide::arch::Target1<Scalar, QueryComputer<f32>, MatRef<'_, Standard<f32>>>
     for BuildComputer
diff --git a/diskann-quantization/src/multi_vector/distance/query_computer/mod.rs b/diskann-quantization/src/multi_vector/distance/query_computer/mod.rs
index fbe84fcd3..efd552076 100644
--- a/diskann-quantization/src/multi_vector/distance/query_computer/mod.rs
+++ b/diskann-quantization/src/multi_vector/distance/query_computer/mod.rs
@@ -43,6 +43,16 @@ pub struct QueryComputer<T: Copy> {
 }
 
 impl<T: Copy> QueryComputer<T> {
+    /// Wrap any [`DynQueryComputer<T>`] implementation as a `QueryComputer<T>`.
+    ///
+    /// This is the public seam for experimental kernels: implement
+    /// [`DynQueryComputer<T>`] on a custom struct, then wrap it here so the
+    /// existing [`QueryComputer::max_sim`] / [`QueryComputer::chamfer`] veneer
+    /// works against it.
+    pub fn from_dyn(inner: Box<dyn DynQueryComputer<T>>) -> Self {
+        Self { inner }
+    }
+
     /// Number of logical (non-padded) query vectors.
     #[inline]
     pub fn nrows(&self) -> usize {
@@ -88,8 +98,27 @@ impl<T: Copy> QueryComputer<T> {
     }
 }
 
-trait DynQueryComputer<T: Copy>: std::fmt::Debug + Send + Sync {
+/// Object-safe interface for "anything that can compute MaxSim for a [`QueryComputer<T>`]".
+///
+/// The library's own architecture-dispatched path implements this on the
+/// internal `Prepared<A, Q>` carriers. External crates implement it on their
+/// own structs and wrap via [`QueryComputer::from_dyn`].
+///
+/// # Contract
+///
+/// - [`compute_max_sim`](Self::compute_max_sim) is only invoked by
+///   [`QueryComputer::max_sim`], which has already asserted
+///   `scores.len() == self.nrows()` and short-circuited the zero-doc case.
+///   Implementations may rely on `scores.len() == self.nrows()` and
+///   `doc.num_vectors() > 0`.
+/// - Implementations must populate all `nrows()` entries of `scores`.
+///   [`QueryComputer::chamfer`] sums every entry, so leaving any trailing
+///   slot unwritten would silently corrupt the result.
+pub trait DynQueryComputer<T: Copy>: std::fmt::Debug + Send + Sync {
+    /// Compute MaxSim into `scores`. See trait-level docs for the contract.
     fn compute_max_sim(&self, doc: MatRef<'_, Standard<T>>, scores: &mut [f32]);
+
+    /// Number of query rows.
     fn nrows(&self) -> usize;
 }
 
@@ -287,4 +316,38 @@ mod tests {
 
     test_matches_fallback!(f32, f32, 1e-10, "f32 ");
     test_matches_fallback!(f16, half::f16, 1e-10, "f16 ");
+
+    // ============================================================
+    // from_dyn: wrap a trivial custom DynQueryComputer.
+    // ============================================================
+    #[derive(Debug)]
+    struct ConstantComputer {
+        nrows: usize,
+        value: f32,
+    }
+
+    impl DynQueryComputer<f32> for ConstantComputer {
+        fn compute_max_sim(&self, _doc: MatRef<'_, Standard<f32>>, scores: &mut [f32]) {
+            for s in scores.iter_mut() {
+                *s = self.value;
+            }
+        }
+        fn nrows(&self) -> usize {
+            self.nrows
+        }
+    }
+
+    #[test]
+    fn from_dyn_wraps_custom_impl() {
+        let computer = QueryComputer::<f32>::from_dyn(Box::new(ConstantComputer {
+            nrows: 3,
+            value: -1.5,
+        }));
+        assert_eq!(computer.nrows(), 3);
+
+        let doc = make_mat(&[1.0f32, 0.0, 0.0, 1.0], 2, 2);
+        let mut scores = vec![0.0f32; 3];
+        computer.max_sim(doc, &mut scores);
+        assert_eq!(scores, vec![-1.5, -1.5, -1.5]);
+    }
 }
diff --git a/diskann-quantization/src/multi_vector/matrix.rs b/diskann-quantization/src/multi_vector/matrix.rs
index bcbafaaa3..734cb7247 100644
--- a/diskann-quantization/src/multi_vector/matrix.rs
+++ b/diskann-quantization/src/multi_vector/matrix.rs
@@ -244,18 +244,6 @@ pub unsafe trait NewOwned<T>: ReprOwned {
 #[derive(Debug, Clone, Copy)]
 pub struct Defaulted;
 
-/// An initializer argument to [`NewOwned`] that invokes the wrapped closure for each
-/// element.
-///
-/// # Example
-/// ```
-/// use diskann_quantization::multi_vector::{Init, Mat, Standard};
-/// let mut n = 0;
-/// let mat = Mat::new(Standard::<i32>::new(1, 4).unwrap(), Init(|| { n += 1; n })).unwrap();
-/// assert_eq!(mat.as_slice(), &[1, 2, 3, 4]);
-/// ```
-pub struct Init<F>(pub F);
-
 /// Create a new [`Mat`] cloned from a view.
 pub trait NewCloned: ReprOwned {
     /// Clone the contents behind `v`, returning a new owning [`Mat`].
@@ -526,22 +514,6 @@ where
     }
 }
 
-// SAFETY: The implementation uses guarantees from `Box` to ensure that the pointer
-// initialized by it is non-null and properly aligned to the underlying type.
-unsafe impl<T, F> NewOwned<Init<F>> for Standard<T>
-where
-    T: Copy,
-    F: FnMut() -> T,
-{
-    type Error = crate::error::Infallible;
-    fn new_owned(self, mut init: Init<F>) -> Result<Mat<Self>, Self::Error> {
-        let b: Box<[T]> = (0..self.num_elements()).map(|_| (init.0)()).collect();
-
-        // SAFETY: By construction, `b` has length `self.num_elements()`.
-        Ok(unsafe { self.box_to_mat(b) })
-    }
-}
-
 // SAFETY: This checks that the slice has the correct length, which is all that is
 // required for [`Repr`].
 unsafe impl<T> NewRef<T> for Standard<T>
@@ -740,6 +712,22 @@ impl<T: NewCloned> Clone for Mat<T> {
 }
 
 impl<T: Copy> Mat<Standard<T>> {
+    /// Create a new matrix by invoking `f` once per element in row-major order.
+    ///
+    /// # Example
+    /// ```
+    /// use diskann_quantization::multi_vector::{Mat, Standard};
+    /// let mut n = 0;
+    /// let mat = Mat::from_fn(Standard::<i32>::new(1, 4).unwrap(), || { n += 1; n });
+    /// assert_eq!(mat.as_slice(), &[1, 2, 3, 4]);
+    /// ```
+    pub fn from_fn<F: FnMut() -> T>(repr: Standard<T>, mut f: F) -> Self {
+        let b: Box<[T]> = (0..repr.num_elements()).map(|_| f()).collect();
+
+        // SAFETY: By construction, `b` has length `repr.num_elements()`.
+        unsafe { repr.box_to_mat(b) }
+    }
+
     /// Returns the raw dimension (columns) of the vectors in the matrix.
     #[inline]
     pub fn vector_dim(&self) -> usize {
@@ -1796,17 +1784,13 @@ mod tests {
     }
 
     #[test]
-    fn test_standard_new_owned_with_init() {
+    fn test_standard_from_fn() {
         let mut counter: i32 = 0;
-        let m = Mat::new(
-            Standard::<i32>::new(2, 3).unwrap(),
-            Init(|| {
-                let v = counter;
-                counter += 1;
-                v
-            }),
-        )
-        .unwrap();
+        let m = Mat::from_fn(Standard::<i32>::new(2, 3).unwrap(), || {
+            let v = counter;
+            counter += 1;
+            v
+        });
 
         assert_eq!(m.as_slice(), &[0, 1, 2, 3, 4, 5]);
     }
diff --git a/diskann-quantization/src/multi_vector/mod.rs b/diskann-quantization/src/multi_vector/mod.rs
index 1d765bacc..3670b1aaf 100644
--- a/diskann-quantization/src/multi_vector/mod.rs
+++ b/diskann-quantization/src/multi_vector/mod.rs
@@ -74,6 +74,6 @@ pub(crate) mod matrix;
 pub use block_transposed::{BlockTransposed, BlockTransposedMut, BlockTransposedRef};
 pub use distance::{Chamfer, MaxSim, MaxSimError, QueryComputer, QueryMatRef};
 pub use matrix::{
-    Defaulted, Init, LayoutError, Mat, MatMut, MatRef, NewCloned, NewMut, NewOwned, NewRef,
-    Overflow, Repr, ReprMut, ReprOwned, SliceError, Standard,
+    Defaulted, LayoutError, Mat, MatMut, MatRef, NewCloned, NewMut, NewOwned, NewRef, Overflow,
+    Repr, ReprMut, ReprOwned, SliceError, Standard,
 };

From 94fd8de24a937750e0b1c0fa3b52fc2392e26587 Mon Sep 17 00:00:00 2001
From: Suryansh Gupta <suryangupta@microsoft.com>
Date: Thu, 14 May 2026 21:23:30 +0530
Subject: [PATCH 10/13] Minor doc fix

---
 .../src/backend/multi_vector/experimental/template.rs       | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/diskann-benchmark/src/backend/multi_vector/experimental/template.rs b/diskann-benchmark/src/backend/multi_vector/experimental/template.rs
index f09f0c74e..64eeb3e00 100644
--- a/diskann-benchmark/src/backend/multi_vector/experimental/template.rs
+++ b/diskann-benchmark/src/backend/multi_vector/experimental/template.rs
@@ -9,7 +9,7 @@
 //! `Kernel<A>` impl to your target ISA, and add an `Arch` variant + a
 //! `register_regression` call to wire it up.
 //!
-//! # The 6-step workflow
+//! # The 5-step workflow
 //!
 //! 1. **Add an [`Arch`](crate::inputs::multi_vector::Arch) variant** for your
 //!    experimental kernel (e.g. `X86_64_V4_Wide`). The `#[non_exhaustive]`
@@ -28,7 +28,9 @@
 //!    `library_kernels.rs` (e.g. `match_arch_x86_64!`) for your new variant.
 //! 5. **Add a `RunBenchmark<Marker>` impl + `register_regression` call.** Use
 //!    `Kernel::<Marker, T>::new()` as the registered benchmark entry.
-//! 6. **Validate under Miri.** See the section below.
+//!
+//! Then validate under Miri before treating the kernel as correct — see the
+//! section below.
 //!
 //! # Validating under Miri (REQUIRED)
 //!

From f51bc2a2b0ceb635be46c0773fdfdc90ab018169 Mon Sep 17 00:00:00 2001
From: Suryansh Gupta <suryansh.gupta2000@gmail.com>
Date: Tue, 19 May 2026 03:07:22 +0530
Subject: [PATCH 11/13] MaxSim benchmark with BYOTE factory

---
 diskann-benchmark/Cargo.toml                  |   3 +
 diskann-benchmark/example/multi-vector.json   |  47 +++
 .../multi-vector-tolerance.json               |  16 +
 .../perf_test_inputs/multi-vector.json        | 149 ++++++++
 diskann-benchmark/src/backend/mod.rs          |   2 +
 .../src/backend/multi_vector/README.md        |  43 +++
 .../src/backend/multi_vector/driver.rs        | 318 ++++++++++++++++
 .../src/backend/multi_vector/kernels.rs       | 220 +++++++++++
 .../src/backend/multi_vector/mod.rs           | 202 ++++++++++
 diskann-benchmark/src/inputs/mod.rs           |   1 +
 diskann-benchmark/src/inputs/multi_vector.rs  | 156 ++++++++
 diskann-benchmark/src/main.rs                 |  86 +++++
 .../src/multi_vector/distance/factory.rs      | 360 ++++++++++++++++++
 .../src/multi_vector/distance/isa.rs          |  62 +++
 .../src/multi_vector/distance/kernel.rs       |  53 +++
 .../src/multi_vector/distance/kernels/mod.rs  |   5 +-
 .../src/multi_vector/distance/mod.rs          |  22 +-
 .../distance/query_computer/f16.rs            | 100 -----
 .../distance/query_computer/f32.rs            | 101 -----
 .../distance/query_computer/mod.rs            | 290 --------------
 .../src/multi_vector/matrix.rs                |   7 +
 diskann-quantization/src/multi_vector/mod.rs  |   9 +-
 22 files changed, 1747 insertions(+), 505 deletions(-)
 create mode 100644 diskann-benchmark/example/multi-vector.json
 create mode 100644 diskann-benchmark/perf_test_inputs/multi-vector-tolerance.json
 create mode 100644 diskann-benchmark/perf_test_inputs/multi-vector.json
 create mode 100644 diskann-benchmark/src/backend/multi_vector/README.md
 create mode 100644 diskann-benchmark/src/backend/multi_vector/driver.rs
 create mode 100644 diskann-benchmark/src/backend/multi_vector/kernels.rs
 create mode 100644 diskann-benchmark/src/backend/multi_vector/mod.rs
 create mode 100644 diskann-benchmark/src/inputs/multi_vector.rs
 create mode 100644 diskann-quantization/src/multi_vector/distance/factory.rs
 create mode 100644 diskann-quantization/src/multi_vector/distance/isa.rs
 create mode 100644 diskann-quantization/src/multi_vector/distance/kernel.rs
 delete mode 100644 diskann-quantization/src/multi_vector/distance/query_computer/f16.rs
 delete mode 100644 diskann-quantization/src/multi_vector/distance/query_computer/f32.rs
 delete mode 100644 diskann-quantization/src/multi_vector/distance/query_computer/mod.rs

diff --git a/diskann-benchmark/Cargo.toml b/diskann-benchmark/Cargo.toml
index bebaf4b8e..ecc3a53dd 100644
--- a/diskann-benchmark/Cargo.toml
+++ b/diskann-benchmark/Cargo.toml
@@ -63,6 +63,9 @@ scalar-quantization = []
 # Enable minmax-quantization based algorithms
 minmax-quantization = []
 
+# Enable multi-vector MaxSim distance benchmarks
+multi-vector = []
+
 # Enable Disk Index benchmarks
 disk-index = [
     "diskann-disk/perf_test",
diff --git a/diskann-benchmark/example/multi-vector.json b/diskann-benchmark/example/multi-vector.json
new file mode 100644
index 000000000..af66a886d
--- /dev/null
+++ b/diskann-benchmark/example/multi-vector.json
@@ -0,0 +1,47 @@
+{
+  "search_directories": [],
+  "jobs": [
+    {
+      "type": "multi-vector-op",
+      "content": {
+        "element_type": "float32",
+        "isa": "auto",
+        "runs": [
+          { "num_query_vectors": 8,  "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 2, "num_measurements": 1 },
+          { "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 2, "num_measurements": 1 }
+        ]
+      }
+    },
+    {
+      "type": "multi-vector-op",
+      "content": {
+        "element_type": "float32",
+        "isa": "scalar",
+        "runs": [
+          { "num_query_vectors": 8,  "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 2, "num_measurements": 1 }
+        ]
+      }
+    },
+    {
+      "type": "multi-vector-op",
+      "content": {
+        "element_type": "float32",
+        "isa": "reference",
+        "runs": [
+          { "num_query_vectors": 8,  "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 2, "num_measurements": 1 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 2, "num_measurements": 1 }
+        ]
+      }
+    },
+    {
+      "type": "multi-vector-op",
+      "content": {
+        "element_type": "float16",
+        "isa": "auto",
+        "runs": [
+          { "num_query_vectors": 8,  "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 2, "num_measurements": 1 }
+        ]
+      }
+    }
+  ]
+}
diff --git a/diskann-benchmark/perf_test_inputs/multi-vector-tolerance.json b/diskann-benchmark/perf_test_inputs/multi-vector-tolerance.json
new file mode 100644
index 000000000..8d5997199
--- /dev/null
+++ b/diskann-benchmark/perf_test_inputs/multi-vector-tolerance.json
@@ -0,0 +1,16 @@
+{
+  "checks": [
+    {
+      "input": {
+        "type": "multi-vector-op",
+        "content": {}
+      },
+      "tolerance": {
+        "type": "multi-vector-tolerance",
+        "content": {
+          "min_time_regression": 0.05
+        }
+      }
+    }
+  ]
+}
diff --git a/diskann-benchmark/perf_test_inputs/multi-vector.json b/diskann-benchmark/perf_test_inputs/multi-vector.json
new file mode 100644
index 000000000..c4ce9bb8b
--- /dev/null
+++ b/diskann-benchmark/perf_test_inputs/multi-vector.json
@@ -0,0 +1,149 @@
+{
+  "search_directories": [],
+  "jobs": [
+    {
+      "type": "multi-vector-op",
+      "content": {
+        "element_type": "float32",
+        "isa": "auto",
+        "runs": [
+          { "num_query_vectors": 8,  "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 },
+          { "num_query_vectors": 16, "num_doc_vectors": 64,   "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 128,  "dim": 384, "loops_per_measurement": 20,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 16,   "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 264, "loops_per_measurement": 50,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2,   "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 32,   "dim": 512, "loops_per_measurement": 50,  "num_measurements": 50 }
+        ]
+      }
+    },
+    {
+      "type": "multi-vector-op",
+      "content": {
+        "element_type": "float32",
+        "isa": "scalar",
+        "runs": [
+          { "num_query_vectors": 8,  "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 },
+          { "num_query_vectors": 16, "num_doc_vectors": 64,   "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 128,  "dim": 384, "loops_per_measurement": 20,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 16,   "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 264, "loops_per_measurement": 50,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2,   "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 32,   "dim": 512, "loops_per_measurement": 50,  "num_measurements": 50 }
+        ]
+      }
+    },
+    {
+      "type": "multi-vector-op",
+      "content": {
+        "element_type": "float32",
+        "isa": "x86-64-v3",
+        "runs": [
+          { "num_query_vectors": 8,  "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 },
+          { "num_query_vectors": 16, "num_doc_vectors": 64,   "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 128,  "dim": 384, "loops_per_measurement": 20,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 16,   "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 264, "loops_per_measurement": 50,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2,   "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 32,   "dim": 512, "loops_per_measurement": 50,  "num_measurements": 50 }
+        ]
+      }
+    },
+    {
+      "type": "multi-vector-op",
+      "content": {
+        "element_type": "float32",
+        "isa": "x86-64-v4",
+        "runs": [
+          { "num_query_vectors": 8,  "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 },
+          { "num_query_vectors": 16, "num_doc_vectors": 64,   "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 128,  "dim": 384, "loops_per_measurement": 20,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 16,   "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 264, "loops_per_measurement": 50,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2,   "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 32,   "dim": 512, "loops_per_measurement": 50,  "num_measurements": 50 }
+        ]
+      }
+    },
+    {
+      "type": "multi-vector-op",
+      "content": {
+        "element_type": "float32",
+        "isa": "reference",
+        "runs": [
+          { "num_query_vectors": 8,  "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 },
+          { "num_query_vectors": 16, "num_doc_vectors": 64,   "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 128,  "dim": 384, "loops_per_measurement": 20,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 16,   "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 264, "loops_per_measurement": 50,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2,   "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 32,   "dim": 512, "loops_per_measurement": 50,  "num_measurements": 50 }
+        ]
+      }
+    },
+    {
+      "type": "multi-vector-op",
+      "content": {
+        "element_type": "float16",
+        "isa": "x86-64-v3",
+        "runs": [
+          { "num_query_vectors": 8,  "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 },
+          { "num_query_vectors": 16, "num_doc_vectors": 64,   "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 128,  "dim": 384, "loops_per_measurement": 20,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 16,   "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 264, "loops_per_measurement": 50,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2,   "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 32,   "dim": 512, "loops_per_measurement": 50,  "num_measurements": 50 }
+        ]
+      }
+    },
+    {
+      "type": "multi-vector-op",
+      "content": {
+        "element_type": "float16",
+        "isa": "x86-64-v4",
+        "runs": [
+          { "num_query_vectors": 8,  "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 },
+          { "num_query_vectors": 16, "num_doc_vectors": 64,   "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 128,  "dim": 384, "loops_per_measurement": 20,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 16,   "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 264, "loops_per_measurement": 50,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2,   "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 32,   "dim": 512, "loops_per_measurement": 50,  "num_measurements": 50 }
+        ]
+      }
+    },
+    {
+      "type": "multi-vector-op",
+      "content": {
+        "element_type": "float16",
+        "isa": "reference",
+        "runs": [
+          { "num_query_vectors": 8,  "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 },
+          { "num_query_vectors": 16, "num_doc_vectors": 64,   "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 128,  "dim": 384, "loops_per_measurement": 20,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 16,   "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 264, "loops_per_measurement": 50,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2,   "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 32,   "dim": 512, "loops_per_measurement": 50,  "num_measurements": 50 }
+        ]
+      }
+    }
+  ]
+}
diff --git a/diskann-benchmark/src/backend/mod.rs b/diskann-benchmark/src/backend/mod.rs
index 8396577e8..d04bae158 100644
--- a/diskann-benchmark/src/backend/mod.rs
+++ b/diskann-benchmark/src/backend/mod.rs
@@ -9,11 +9,13 @@ mod disk_index;
 mod exhaustive;
 mod filters;
 mod index;
+mod multi_vector;
 
 pub(crate) fn register_benchmarks(registry: &mut Registry) -> anyhow::Result<()> {
     exhaustive::register_benchmarks(registry)?;
     disk_index::register_benchmarks(registry)?;
     index::register_benchmarks(registry)?;
     filters::register_benchmarks(registry)?;
+    multi_vector::register_benchmarks(registry)?;
     Ok(())
 }
diff --git a/diskann-benchmark/src/backend/multi_vector/README.md b/diskann-benchmark/src/backend/multi_vector/README.md
new file mode 100644
index 000000000..f24d38ff7
--- /dev/null
+++ b/diskann-benchmark/src/backend/multi_vector/README.md
@@ -0,0 +1,43 @@
+# Multi-vector benchmark — kernel-author workflow
+
+The multi-vector benchmark dispatches through `diskann-quantization`'s
+`build_max_sim_f32` / `build_max_sim_f16` factory. Selection is driven by a
+non-exhaustive `MaxSimIsa` enum. To add a new in-tree experimental kernel,
+extend the enum + factory + the benchmark's shadow enum.
+
+## Steps
+
+1. **Library: variant + factory arm.** In
+   `diskann-quantization::multi_vector::distance`:
+   - Add a new variant to `MaxSimIsa` (in `isa.rs`).
+   - Implement `MaxSimKernel<T>` for your kernel struct (in `factory.rs`,
+     next to `Prepared` and `ReferenceKernel`).
+   - Add a matching arm to `build_max_sim_f32` and/or `build_max_sim_f16`
+     that constructs your kernel and hands it to `erase.erase(...)`.
+
+2. **Benchmark: matching shadow variant.** In
+   `diskann-benchmark::inputs::multi_vector`:
+   - Add the same variant to `BenchIsa`.
+   - Add the matching arm to `From<BenchIsa> for MaxSimIsa` and to the `Display` impl for `BenchIsa`.
+
+3. **Run.** Set `"isa": "your-variant"` in the JSON job; the existing
+   `KernelF32` / `KernelF16` benchmark entries handle the rest. No new
+   `Benchmark` registration required.
+
+## Why two enums?
+
+`MaxSimIsa` (library) and `BenchIsa` (benchmark) are kept separate so the
+library doesn't pin its public API on a serde version or a particular JSON
+shape. The benchmark owns its kebab-case JSON layout; the library is
+serde-agnostic. Mirroring variant-for-variant is intentional — a small price
+to pay for keeping the library boundary clean.
+
+## Background
+
+The factory follows the BYOTE ("Bring your own type erasure") pattern
+described in [RFC #1068]. If you want your kernel packaged as something
+other than `Box<dyn MaxSimKernel<T>>` (e.g. composed with chamfer summing,
+or wrapped in a custom thin trait), implement your own `Erase<T>` and pass
+it to the factory in place of `BoxErase`.
+
+[RFC #1068]: https://github.com/microsoft/DiskANN/pull/1068
diff --git a/diskann-benchmark/src/backend/multi_vector/driver.rs b/diskann-benchmark/src/backend/multi_vector/driver.rs
new file mode 100644
index 000000000..c9ac8b488
--- /dev/null
+++ b/diskann-benchmark/src/backend/multi_vector/driver.rs
@@ -0,0 +1,318 @@
+/*
+ * Copyright (c) Microsoft Corporation.
+ * Licensed under the MIT license.
+ */
+
+//! Shared benchmark infrastructure for multi-vector kernels.
+//!
+//! Houses the timing harness ([`run_loops`]), data fixtures ([`Data`]), result
+//! types ([`RunResult`], [`Comparison`], [`CheckResult`]), and the trait-object
+//! [`Distance<T>`] boundary the driver dispatches through. None of the
+//! contents are kernel-aware.
+
+use diskann_benchmark_runner::{
+    utils::{
+        fmt::Table,
+        num::{relative_change, NonNegativeFinite},
+        percentiles, MicroSeconds,
+    },
+    Any, CheckDeserialization, Checker, Input,
+};
+use diskann_quantization::multi_vector::{Mat, MatRef, MaxSimKernel, Standard};
+use rand::{
+    distr::{Distribution, StandardUniform},
+    rngs::StdRng,
+    SeedableRng,
+};
+use serde::{Deserialize, Serialize};
+
+use crate::inputs::multi_vector::Run;
+
+//////////////////////
+// Tolerance        //
+//////////////////////
+
+/// Tolerance thresholds for multi-vector benchmark regression detection.
+///
+/// Each field specifies the maximum allowed relative increase in the corresponding metric.
+/// For example, a value of `0.05` means a 5% increase is tolerated.
+#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
+pub(super) struct MultiVectorTolerance {
+    pub(super) min_time_regression: NonNegativeFinite,
+}
+
+impl CheckDeserialization for MultiVectorTolerance {
+    fn check_deserialization(&mut self, _checker: &mut Checker) -> Result<(), anyhow::Error> {
+        Ok(())
+    }
+}
+
+impl Input for MultiVectorTolerance {
+    fn tag() -> &'static str {
+        "multi-vector-tolerance"
+    }
+
+    fn try_deserialize(
+        serialized: &serde_json::Value,
+        checker: &mut Checker,
+    ) -> anyhow::Result<Any> {
+        checker.any(Self::deserialize(serialized)?)
+    }
+
+    fn example() -> anyhow::Result<serde_json::Value> {
+        const EXAMPLE: NonNegativeFinite = match NonNegativeFinite::new(0.05) {
+            Ok(v) => v,
+            Err(_) => panic!("use a non-negative finite please"),
+        };
+
+        Ok(serde_json::to_value(MultiVectorTolerance {
+            min_time_regression: EXAMPLE,
+        })?)
+    }
+}
+
+///////////////////
+// Data fixtures //
+///////////////////
+
+/// Random query / doc fixture for a single benchmark run.
+pub(super) struct Data<T: Copy> {
+    pub(super) queries: Mat<Standard<T>>,
+    pub(super) docs: Mat<Standard<T>>,
+}
+
+impl<T: Copy> Data<T>
+where
+    StandardUniform: Distribution<T>,
+{
+    pub(super) fn new(run: &Run) -> Self {
+        let mut rng = StdRng::seed_from_u64(0x12345);
+        let queries = Mat::from_fn(
+            Standard::new(run.num_query_vectors.get(), run.dim.get()).unwrap(),
+            || StandardUniform.sample(&mut rng),
+        );
+        let docs = Mat::from_fn(
+            Standard::new(run.num_doc_vectors.get(), run.dim.get()).unwrap(),
+            || StandardUniform.sample(&mut rng),
+        );
+        Self { queries, docs }
+    }
+}
+
+//////////////////////
+// Distance trait   //
+//////////////////////
+
+/// Object-safe distance executor. The library factory's `Erase` visitor
+/// already produces a `Box<dyn MaxSimKernel<T>>`, but the driver wants its
+/// own narrow trait so the kernel + its assertions are tucked inside one
+/// vtable boundary. Simpler than threading `Box<dyn MaxSimKernel<T>>`
+/// generically through the timing harness.
+pub(super) trait Distance<T: Copy> {
+    fn max_sim(&self, doc: MatRef<'_, Standard<T>>, scores: &mut [f32]);
+}
+
+/// Distance executor wrapping a boxed `MaxSimKernel<T>` from the library
+/// factory. One vtable hop in the hot loop.
+pub(super) struct BoxedKernel<T: Copy>(pub(super) Box<dyn MaxSimKernel<T>>);
+
+impl<T: Copy> Distance<T> for BoxedKernel<T> {
+    fn max_sim(&self, doc: MatRef<'_, Standard<T>>, scores: &mut [f32]) {
+        let nq = self.0.nrows();
+        assert_eq!(
+            scores.len(),
+            nq,
+            "scores buffer not right size: {} != {}",
+            scores.len(),
+            nq
+        );
+        if doc.num_vectors() == 0 {
+            return;
+        }
+        self.0.compute_max_sim(doc, scores);
+    }
+}
+
+//////////////////////
+// Timing harness   //
+//////////////////////
+
+fn run_loops<F>(run: &Run, mut body: F) -> RunResult
+where
+    F: FnMut(),
+{
+    let mut latencies = Vec::with_capacity(run.num_measurements.get());
+
+    for _ in 0..run.num_measurements.get() {
+        let start = std::time::Instant::now();
+        for _ in 0..run.loops_per_measurement.get() {
+            body();
+        }
+        latencies.push(start.elapsed().into());
+    }
+
+    let percentiles = percentiles::compute_percentiles(&mut latencies).unwrap();
+    RunResult {
+        run: run.clone(),
+        latencies,
+        percentiles,
+    }
+}
+
+/// Shared loop nest. The trait-object dispatch happens once per `max_sim` call
+/// (i.e. once per inner-loop iteration of `run_loops`); the work inside each
+/// call is O(Q·D·dim), so the vtable hop is in the noise.
+pub(super) fn run_with_distance<T: Copy>(
+    run: &Run,
+    doc: MatRef<'_, Standard<T>>,
+    dist: &dyn Distance<T>,
+) -> RunResult {
+    let mut scores = vec![0.0f32; run.num_query_vectors.get()];
+    run_loops(run, || {
+        dist.max_sim(doc, &mut scores);
+        std::hint::black_box(&mut scores);
+    })
+}
+
+//////////////////////
+// Result types     //
+//////////////////////
+
+#[derive(Debug, Clone, Copy)]
+pub(super) struct DisplayWrapper<'a, T: ?Sized>(pub(super) &'a T);
+
+impl<T: ?Sized> std::ops::Deref for DisplayWrapper<'_, T> {
+    type Target = T;
+    fn deref(&self) -> &T {
+        self.0
+    }
+}
+
+#[derive(Debug, Serialize, Deserialize)]
+pub(super) struct RunResult {
+    /// The configuration for this run.
+    pub(super) run: Run,
+    /// Per-measurement latencies (over `loops_per_measurement` calls).
+    pub(super) latencies: Vec<MicroSeconds>,
+    /// Latency percentiles.
+    pub(super) percentiles: percentiles::Percentiles<MicroSeconds>,
+}
+
+impl RunResult {
+    pub(super) fn computations_per_latency(&self) -> usize {
+        self.run.num_query_vectors.get()
+            * self.run.num_doc_vectors.get()
+            * self.run.loops_per_measurement.get()
+    }
+}
+
+impl std::fmt::Display for DisplayWrapper<'_, [RunResult]> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        if self.is_empty() {
+            return Ok(());
+        }
+
+        writeln!(
+            f,
+            "ns/IP = time per (query, doc) inner-product call (~ linear in Dim)"
+        )?;
+
+        let header = [
+            "Q",
+            "D",
+            "Dim",
+            "Min Time (ns/IP @ Dim)",
+            "Mean Time (ns/IP @ Dim)",
+            "Loops",
+            "Measurements",
+        ];
+
+        let mut table = Table::new(header, self.len());
+
+        self.iter().enumerate().for_each(|(row, r)| {
+            let mut row = table.row(row);
+
+            let min_latency = r
+                .latencies
+                .iter()
+                .min()
+                .copied()
+                .unwrap_or(MicroSeconds::new(u64::MAX));
+            let mean_latency = r.percentiles.mean;
+
+            let computations_per_latency = r.computations_per_latency() as f64;
+            let min_time = min_latency.as_f64() / computations_per_latency * 1000.0;
+            let mean_time = mean_latency / computations_per_latency * 1000.0;
+
+            row.insert(r.run.num_query_vectors, 0);
+            row.insert(r.run.num_doc_vectors, 1);
+            row.insert(r.run.dim, 2);
+            row.insert(format!("{:.3}", min_time), 3);
+            row.insert(format!("{:.3}", mean_time), 4);
+            row.insert(r.run.loops_per_measurement, 5);
+            row.insert(r.run.num_measurements, 6);
+        });
+
+        table.fmt(f)
+    }
+}
+
+//////////////////////
+// Regression Check //
+//////////////////////
+
+/// Per-run comparison of the before/after minimum time (ns per inner product).
+#[derive(Debug, Serialize)]
+pub(super) struct Comparison {
+    pub(super) run: Run,
+    pub(super) tolerance: MultiVectorTolerance,
+    pub(super) before_min: f64,
+    pub(super) after_min: f64,
+}
+
+/// Aggregated result of the regression check across all runs.
+#[derive(Debug, Serialize)]
+pub(super) struct CheckResult {
+    pub(super) checks: Vec<Comparison>,
+}
+
+impl std::fmt::Display for CheckResult {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let header = [
+            "Q",
+            "D",
+            "Dim",
+            "Min Before (ns/IP @ Dim)",
+            "Min After (ns/IP @ Dim)",
+            "Change (%)",
+            "Remark",
+        ];
+
+        let mut table = Table::new(header, self.checks.len());
+
+        for (i, c) in self.checks.iter().enumerate() {
+            let mut row = table.row(i);
+            let change = relative_change(c.before_min, c.after_min);
+
+            row.insert(c.run.num_query_vectors, 0);
+            row.insert(c.run.num_doc_vectors, 1);
+            row.insert(c.run.dim, 2);
+            row.insert(format!("{:.3}", c.before_min), 3);
+            row.insert(format!("{:.3}", c.after_min), 4);
+            match change {
+                Ok(change) => {
+                    row.insert(format!("{:.3} %", change * 100.0), 5);
+                    if change > c.tolerance.min_time_regression.get() {
+                        row.insert("FAIL", 6);
+                    }
+                }
+                Err(err) => {
+                    row.insert("invalid", 5);
+                    row.insert(err, 6);
+                }
+            }
+        }
+
+        table.fmt(f)
+    }
+}
diff --git a/diskann-benchmark/src/backend/multi_vector/kernels.rs b/diskann-benchmark/src/backend/multi_vector/kernels.rs
new file mode 100644
index 000000000..c9359b705
--- /dev/null
+++ b/diskann-benchmark/src/backend/multi_vector/kernels.rs
@@ -0,0 +1,220 @@
+/*
+ * Copyright (c) Microsoft Corporation.
+ * Licensed under the MIT license.
+ */
+
+//! `Benchmark` impls for the multi-vector MaxSim factory.
+//!
+//! One entry per element type. Each `try_match` checks `element_type` only;
+//! the `isa` field is passed to the library factory at run time. ISA
+//! unavailability surfaces as `NotSupported`, which becomes a job-level
+//! error.
+
+use std::io::Write;
+
+use diskann_benchmark_runner::{
+    benchmark::{FailureScore, MatchScore, PassFail, Regression},
+    utils::{datatype::AsDataType, num::relative_change},
+    Benchmark, Checkpoint, Output, Registry,
+};
+use diskann_quantization::multi_vector::{
+    build_max_sim_f16, build_max_sim_f32, BoxErase, MaxSimKernel,
+};
+use rand::distr::{Distribution, StandardUniform};
+
+use super::driver::{
+    run_with_distance, BoxedKernel, CheckResult, Comparison, Data, DisplayWrapper,
+    MultiVectorTolerance, RunResult,
+};
+use crate::inputs::multi_vector::MultiVectorOp;
+
+// ─────────────────────────────────────────────────────────────────────────
+//  Per-element-type `Benchmark` carriers.
+// ─────────────────────────────────────────────────────────────────────────
+
+#[derive(Debug)]
+pub(super) struct KernelF32;
+
+#[derive(Debug)]
+pub(super) struct KernelF16;
+
+/// Per-element-type bridge: benchmark registry entry name + factory call.
+///
+/// Data-type matching (`DATA_TYPE`, `is_match`, `describe`) comes from the
+/// framework's [`AsDataType`] trait, which is already implemented for `f32`,
+/// `half::f16`, etc.
+trait ElementType: AsDataType + Copy {
+    const ENTRY_NAME: &'static str;
+    fn build(
+        isa: diskann_quantization::multi_vector::MaxSimIsa,
+        query: diskann_quantization::multi_vector::MatRef<
+            '_,
+            diskann_quantization::multi_vector::Standard<Self>,
+        >,
+    ) -> Result<Box<dyn MaxSimKernel<Self>>, diskann_quantization::multi_vector::NotSupported>;
+}
+
+impl ElementType for f32 {
+    const ENTRY_NAME: &'static str = "multi-vector-op-f32";
+    fn build(
+        isa: diskann_quantization::multi_vector::MaxSimIsa,
+        query: diskann_quantization::multi_vector::MatRef<
+            '_,
+            diskann_quantization::multi_vector::Standard<f32>,
+        >,
+    ) -> Result<Box<dyn MaxSimKernel<f32>>, diskann_quantization::multi_vector::NotSupported> {
+        build_max_sim_f32(isa, query, BoxErase)
+    }
+}
+
+impl ElementType for half::f16 {
+    const ENTRY_NAME: &'static str = "multi-vector-op-f16";
+    fn build(
+        isa: diskann_quantization::multi_vector::MaxSimIsa,
+        query: diskann_quantization::multi_vector::MatRef<
+            '_,
+            diskann_quantization::multi_vector::Standard<half::f16>,
+        >,
+    ) -> Result<Box<dyn MaxSimKernel<half::f16>>, diskann_quantization::multi_vector::NotSupported>
+    {
+        build_max_sim_f16(isa, query, BoxErase)
+    }
+}
+
+fn run_benchmark<T: ElementType>(input: &MultiVectorOp) -> anyhow::Result<Vec<RunResult>>
+where
+    StandardUniform: Distribution<T>,
+{
+    let mut results = Vec::with_capacity(input.runs.len());
+    for run in input.runs.iter() {
+        let data = Data::<T>::new(run);
+        let kernel = T::build(input.isa.into(), data.queries.as_view())?;
+        let dist = BoxedKernel(kernel);
+        results.push(run_with_distance(run, data.docs.as_view(), &dist));
+    }
+    Ok(results)
+}
+
+// ─────────────────────────────────────────────────────────────────────────
+//  Benchmark + Regression impls.
+// ─────────────────────────────────────────────────────────────────────────
+
+macro_rules! impl_benchmark {
+    ($ty:ident, $T:ty) => {
+        impl Benchmark for $ty
+        where
+            StandardUniform: Distribution<$T>,
+        {
+            type Input = MultiVectorOp;
+            type Output = Vec<RunResult>;
+
+            fn try_match(&self, from: &MultiVectorOp) -> Result<MatchScore, FailureScore> {
+                crate::utils::match_data_type::<$T>(from.element_type)
+            }
+
+            fn run(
+                &self,
+                input: &MultiVectorOp,
+                _: Checkpoint<'_>,
+                mut output: &mut dyn Output,
+            ) -> anyhow::Result<Self::Output> {
+                writeln!(output, "{}", input)?;
+                let results = run_benchmark::<$T>(input)?;
+                writeln!(output, "\n\n{}", DisplayWrapper(&*results))?;
+                Ok(results)
+            }
+
+            fn description(
+                &self,
+                f: &mut std::fmt::Formatter<'_>,
+                input: Option<&MultiVectorOp>,
+            ) -> std::fmt::Result {
+                match input {
+                    None => writeln!(f, "- Element Type: {}", <$T as AsDataType>::DATA_TYPE)?,
+                    Some(input) => {
+                        let desc = <$T as AsDataType>::describe(input.element_type);
+                        if !desc.is_match() {
+                            writeln!(f, "\n    - Mismatched element type: {}", desc)?;
+                        }
+                    }
+                }
+                Ok(())
+            }
+        }
+
+        impl Regression for $ty
+        where
+            StandardUniform: Distribution<$T>,
+        {
+            type Tolerances = MultiVectorTolerance;
+            type Pass = CheckResult;
+            type Fail = CheckResult;
+
+            fn check(
+                &self,
+                tolerance: &MultiVectorTolerance,
+                _input: &MultiVectorOp,
+                before: &Vec<RunResult>,
+                after: &Vec<RunResult>,
+            ) -> anyhow::Result<PassFail<CheckResult, CheckResult>> {
+                anyhow::ensure!(
+                    before.len() == after.len(),
+                    "before has {} runs but after has {}",
+                    before.len(),
+                    after.len(),
+                );
+
+                let mut passed = true;
+                let checks: Vec<Comparison> = std::iter::zip(before.iter(), after.iter())
+                    .enumerate()
+                    .map(|(i, (b, a))| {
+                        anyhow::ensure!(b.run == a.run, "run {i} mismatched");
+
+                        let computations_per_latency = b.computations_per_latency() as f64;
+                        let before_min =
+                            b.percentiles.minimum.as_f64() * 1000.0 / computations_per_latency;
+                        let after_min =
+                            a.percentiles.minimum.as_f64() * 1000.0 / computations_per_latency;
+
+                        let comparison = Comparison {
+                            run: b.run.clone(),
+                            tolerance: *tolerance,
+                            before_min,
+                            after_min,
+                        };
+
+                        match relative_change(before_min, after_min) {
+                            Ok(change) => {
+                                if change > tolerance.min_time_regression.get() {
+                                    passed = false;
+                                }
+                            }
+                            Err(_) => passed = false,
+                        };
+
+                        Ok(comparison)
+                    })
+                    .collect::<anyhow::Result<Vec<Comparison>>>()?;
+
+                Ok(if passed {
+                    PassFail::Pass(CheckResult { checks })
+                } else {
+                    PassFail::Fail(CheckResult { checks })
+                })
+            }
+        }
+    };
+}
+
+impl_benchmark!(KernelF32, f32);
+impl_benchmark!(KernelF16, half::f16);
+
+// ─────────────────────────────────────────────────────────────────────────
+//  Registration.
+// ─────────────────────────────────────────────────────────────────────────
+
+pub(super) fn register(registry: &mut Registry) -> anyhow::Result<()> {
+    registry.register_regression(<f32 as ElementType>::ENTRY_NAME, KernelF32)?;
+    registry.register_regression(<half::f16 as ElementType>::ENTRY_NAME, KernelF16)?;
+    Ok(())
+}
diff --git a/diskann-benchmark/src/backend/multi_vector/mod.rs b/diskann-benchmark/src/backend/multi_vector/mod.rs
new file mode 100644
index 000000000..c3ffffeaf
--- /dev/null
+++ b/diskann-benchmark/src/backend/multi_vector/mod.rs
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) Microsoft Corporation.
+ * Licensed under the MIT license.
+ */
+
+//! Multi-vector MaxSim distance benchmarks with regression detection.
+//!
+//! Registers one `Benchmark` entry per supported element type; the JSON `isa`
+//! field selects the kernel at run time via the library's
+//! [`build_max_sim_f32`] / [`build_max_sim_f16`] factories.
+//!
+//! See [`README.md`](./README.md) for the in-tree workflow when authoring a new
+//! experimental kernel.
+//!
+//! [`build_max_sim_f32`]: diskann_quantization::multi_vector::build_max_sim_f32
+//! [`build_max_sim_f16`]: diskann_quantization::multi_vector::build_max_sim_f16
+
+use diskann_benchmark_runner::Registry;
+
+cfg_if::cfg_if! {
+    if #[cfg(feature = "multi-vector")] {
+        mod driver;
+        mod kernels;
+
+        pub(super) fn register_benchmarks(registry: &mut Registry) -> anyhow::Result<()> {
+            kernels::register(registry)
+        }
+    } else {
+        crate::utils::stub_impl!("multi-vector", inputs::multi_vector::MultiVectorOp);
+
+        pub(super) fn register_benchmarks(registry: &mut Registry) -> anyhow::Result<()> {
+            imp::register("multi-vector-op", registry)
+        }
+    }
+}
+
+#[cfg(all(test, feature = "multi-vector"))]
+mod tests {
+    use std::num::NonZeroUsize;
+
+    use diskann_benchmark_runner::{
+        benchmark::{PassFail, Regression},
+        utils::{
+            datatype::DataType, num::NonNegativeFinite, percentiles::compute_percentiles,
+            MicroSeconds,
+        },
+    };
+
+    use super::driver::{CheckResult, Comparison, MultiVectorTolerance, RunResult};
+    use super::kernels::KernelF32;
+    use crate::inputs::multi_vector::{BenchIsa, MultiVectorOp, Run};
+
+    fn tiny_run() -> Run {
+        Run {
+            num_query_vectors: NonZeroUsize::new(2).unwrap(),
+            num_doc_vectors: NonZeroUsize::new(2).unwrap(),
+            dim: NonZeroUsize::new(4).unwrap(),
+            loops_per_measurement: NonZeroUsize::new(1).unwrap(),
+            num_measurements: NonZeroUsize::new(1).unwrap(),
+        }
+    }
+
+    fn tiny_op() -> MultiVectorOp {
+        MultiVectorOp {
+            element_type: DataType::Float32,
+            isa: BenchIsa::Auto,
+            runs: vec![tiny_run()],
+        }
+    }
+
+    fn tiny_result(minimum: u64) -> RunResult {
+        let mut latencies = vec![MicroSeconds::new(minimum)];
+        let percentiles = compute_percentiles(&mut latencies).unwrap();
+        RunResult {
+            run: tiny_run(),
+            latencies,
+            percentiles,
+        }
+    }
+
+    fn tolerance(limit: f64) -> MultiVectorTolerance {
+        MultiVectorTolerance {
+            min_time_regression: NonNegativeFinite::new(limit).unwrap(),
+        }
+    }
+
+    #[test]
+    fn check_rejects_mismatched_runs() {
+        let kernel = KernelF32;
+
+        // Build a result whose `run` diverges from `tiny_run()` so the
+        // regression check's `b.run == a.run` invariant fires.
+        let mut latencies = vec![MicroSeconds::new(100)];
+        let percentiles = compute_percentiles(&mut latencies).unwrap();
+        let mismatched_result = RunResult {
+            run: Run {
+                num_query_vectors: NonZeroUsize::new(4).unwrap(),
+                ..tiny_run()
+            },
+            latencies,
+            percentiles,
+        };
+
+        let err = kernel
+            .check(
+                &tolerance(0.0),
+                &tiny_op(),
+                &vec![tiny_result(100)],
+                &vec![mismatched_result],
+            )
+            .unwrap_err();
+
+        assert_eq!(err.to_string(), "run 0 mismatched");
+    }
+
+    #[test]
+    fn check_allows_negative_relative_change() {
+        let kernel = KernelF32;
+
+        let result = kernel
+            .check(
+                &tolerance(0.0),
+                &tiny_op(),
+                &vec![tiny_result(100)],
+                &vec![tiny_result(95)],
+            )
+            .unwrap();
+
+        assert!(matches!(result, PassFail::Pass(_)));
+    }
+
+    #[test]
+    fn check_passes_on_tolerance_boundary() {
+        let kernel = KernelF32;
+
+        let result = kernel
+            .check(
+                &tolerance(0.05),
+                &tiny_op(),
+                &vec![tiny_result(100)],
+                &vec![tiny_result(105)],
+            )
+            .unwrap();
+
+        assert!(matches!(result, PassFail::Pass(_)));
+    }
+
+    #[test]
+    fn check_fails_above_tolerance_boundary() {
+        let kernel = KernelF32;
+
+        let result = kernel
+            .check(
+                &tolerance(0.05),
+                &tiny_op(),
+                &vec![tiny_result(100)],
+                &vec![tiny_result(106)],
+            )
+            .unwrap();
+
+        assert!(matches!(result, PassFail::Fail(_)));
+    }
+
+    #[test]
+    fn check_result_display_includes_failure_details() {
+        let check = CheckResult {
+            checks: vec![Comparison {
+                run: tiny_run(),
+                tolerance: tolerance(0.05),
+                before_min: 100.0,
+                after_min: 106.0,
+            }],
+        };
+
+        let rendered = check.to_string();
+        assert!(rendered.contains("Q"), "rendered = {rendered}");
+        assert!(rendered.contains("Dim"), "rendered = {rendered}");
+        assert!(rendered.contains("100.000"), "rendered = {rendered}");
+        assert!(rendered.contains("106.000"), "rendered = {rendered}");
+        assert!(rendered.contains("6.000 %"), "rendered = {rendered}");
+        assert!(rendered.contains("FAIL"), "rendered = {rendered}");
+    }
+
+    /// A "before" value of 0 means the measurement was too fast to obtain a
+    /// reliable signal, so we *could* be letting a regression through. We
+    /// require at least a non-zero value.
+    #[test]
+    fn zero_values_rejected() {
+        let kernel = KernelF32;
+
+        let result = kernel
+            .check(
+                &tolerance(0.05),
+                &tiny_op(),
+                &vec![tiny_result(0)],
+                &vec![tiny_result(0)],
+            )
+            .unwrap();
+
+        assert!(matches!(result, PassFail::Fail(_)));
+    }
+}
diff --git a/diskann-benchmark/src/inputs/mod.rs b/diskann-benchmark/src/inputs/mod.rs
index 7875beb1d..58c07aa00 100644
--- a/diskann-benchmark/src/inputs/mod.rs
+++ b/diskann-benchmark/src/inputs/mod.rs
@@ -7,6 +7,7 @@ pub(crate) mod disk;
 pub(crate) mod exhaustive;
 pub(crate) mod filters;
 pub(crate) mod graph_index;
+pub(crate) mod multi_vector;
 pub(crate) mod save_and_load;
 
 /// Construct an example input of type `Self`.
diff --git a/diskann-benchmark/src/inputs/multi_vector.rs b/diskann-benchmark/src/inputs/multi_vector.rs
new file mode 100644
index 000000000..9d863c13a
--- /dev/null
+++ b/diskann-benchmark/src/inputs/multi_vector.rs
@@ -0,0 +1,156 @@
+/*
+ * Copyright (c) Microsoft Corporation.
+ * Licensed under the MIT license.
+ */
+
+use std::num::NonZeroUsize;
+
+use diskann_benchmark_runner::{utils::datatype::DataType, CheckDeserialization, Checker};
+use diskann_quantization::multi_vector::MaxSimIsa;
+use serde::{Deserialize, Serialize};
+
+use crate::inputs::{as_input, Example};
+
+//////////////
+// Registry //
+//////////////
+
+as_input!(MultiVectorOp);
+
+////////////////
+// Enum types //
+////////////////
+
+/// JSON-facing shadow of [`MaxSimIsa`] from `diskann-quantization`. The
+/// library's enum is deliberately not `Serialize`/`Deserialize` so it isn't
+/// pinned to a particular JSON shape; this enum owns the kebab-case
+/// serialization and converts to the library type at dispatch time.
+///
+/// **Must be kept in sync with `MaxSimIsa` manually.** When the library adds
+/// a variant, mirror it here and add a matching arm to `From<BenchIsa> for
+/// MaxSimIsa`.
+#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
+#[serde(rename_all = "kebab-case")]
+#[non_exhaustive]
+pub(crate) enum BenchIsa {
+    #[serde(rename = "x86-64-v4")]
+    #[allow(non_camel_case_types)]
+    X86_64_V4,
+    #[serde(rename = "x86-64-v3")]
+    #[allow(non_camel_case_types)]
+    X86_64_V3,
+    Neon,
+    Scalar,
+    Reference,
+    Auto,
+}
+
+impl std::fmt::Display for BenchIsa {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let st = match self {
+            Self::X86_64_V4 => "x86-64-v4",
+            Self::X86_64_V3 => "x86-64-v3",
+            Self::Neon => "neon",
+            Self::Scalar => "scalar",
+            Self::Reference => "reference",
+            Self::Auto => "auto",
+        };
+        write!(f, "{}", st)
+    }
+}
+
+impl From<BenchIsa> for MaxSimIsa {
+    fn from(b: BenchIsa) -> Self {
+        match b {
+            BenchIsa::X86_64_V4 => MaxSimIsa::X86_64_V4,
+            BenchIsa::X86_64_V3 => MaxSimIsa::X86_64_V3,
+            BenchIsa::Neon => MaxSimIsa::Neon,
+            BenchIsa::Scalar => MaxSimIsa::Scalar,
+            BenchIsa::Reference => MaxSimIsa::Reference,
+            BenchIsa::Auto => MaxSimIsa::Auto,
+        }
+    }
+}
+
+/// One benchmark configuration: a single shape measurement.
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+pub(crate) struct Run {
+    pub(crate) num_query_vectors: NonZeroUsize,
+    pub(crate) num_doc_vectors: NonZeroUsize,
+    pub(crate) dim: NonZeroUsize,
+    pub(crate) loops_per_measurement: NonZeroUsize,
+    pub(crate) num_measurements: NonZeroUsize,
+}
+
+///////////////////////
+// Multi-Vector Op   //
+///////////////////////
+
+/// A complete multi-vector benchmark job.
+#[derive(Debug, Serialize, Deserialize)]
+pub(crate) struct MultiVectorOp {
+    pub(crate) element_type: DataType,
+    pub(crate) isa: BenchIsa,
+    pub(crate) runs: Vec<Run>,
+}
+
+impl MultiVectorOp {
+    pub(crate) const fn tag() -> &'static str {
+        "multi-vector-op"
+    }
+}
+
+impl CheckDeserialization for MultiVectorOp {
+    fn check_deserialization(&mut self, _checker: &mut Checker) -> Result<(), anyhow::Error> {
+        Ok(())
+    }
+}
+
+impl Example for MultiVectorOp {
+    fn example() -> Self {
+        const NUM_DOC_VECTORS: NonZeroUsize = NonZeroUsize::new(64).unwrap();
+        const DIM: NonZeroUsize = NonZeroUsize::new(128).unwrap();
+        const LOOPS_PER_MEASUREMENT: NonZeroUsize = NonZeroUsize::new(200).unwrap();
+        const NUM_MEASUREMENTS: NonZeroUsize = NonZeroUsize::new(100).unwrap();
+
+        let runs = vec![
+            Run {
+                num_query_vectors: NonZeroUsize::new(32).unwrap(),
+                num_doc_vectors: NUM_DOC_VECTORS,
+                dim: DIM,
+                loops_per_measurement: LOOPS_PER_MEASUREMENT,
+                num_measurements: NUM_MEASUREMENTS,
+            },
+            Run {
+                num_query_vectors: NonZeroUsize::new(64).unwrap(),
+                num_doc_vectors: NUM_DOC_VECTORS,
+                dim: DIM,
+                loops_per_measurement: LOOPS_PER_MEASUREMENT,
+                num_measurements: NUM_MEASUREMENTS,
+            },
+        ];
+
+        Self {
+            element_type: DataType::Float32,
+            isa: BenchIsa::Auto,
+            runs,
+        }
+    }
+}
+
+macro_rules! write_field {
+    ($f:ident, $field:tt, $($expr:tt)*) => {
+        writeln!($f, "{:>18}: {}", $field, $($expr)*)
+    }
+}
+
+impl std::fmt::Display for MultiVectorOp {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        writeln!(f, "Multi-Vector Operation\n")?;
+        write_field!(f, "tag", Self::tag())?;
+        write_field!(f, "element type", self.element_type)?;
+        write_field!(f, "isa", self.isa)?;
+        write_field!(f, "number of runs", self.runs.len())?;
+        Ok(())
+    }
+}
diff --git a/diskann-benchmark/src/main.rs b/diskann-benchmark/src/main.rs
index cc70120cd..c87a08e17 100644
--- a/diskann-benchmark/src/main.rs
+++ b/diskann-benchmark/src/main.rs
@@ -772,6 +772,92 @@ mod tests {
         assert!(!output_path.exists());
     }
 
+    ///////////////////
+    // Multi-Vector  //
+    ///////////////////
+
+    #[test]
+    fn multi_vector_integration() {
+        let scratch = tempfile::tempdir().unwrap();
+        let output_path = scratch.path().join("output.json");
+        assert!(!output_path.exists());
+
+        // Stage a rewritten copy of the example (see `prefix_search_directories`) in `scratch`.
+        let example = example_directory().join("multi-vector.json");
+        let mut staged = value_from_file(&example);
+        prefix_search_directories(&mut staged, &root_directory());
+        let modified_input_path = scratch.path().join("input.json");
+        save_to_file(&modified_input_path, &staged);
+
+        run_multi_vector_integration(&modified_input_path, &output_path)
+    }
+
+    /// Feature-enabled variant: the run must succeed and leave a results file behind.
+    #[cfg(feature = "multi-vector")]
+    fn run_multi_vector_integration(input_path: &std::path::Path, output_path: &std::path::Path) {
+        let cli = Cli::from_commands(
+            Commands::Run {
+                input_file: input_path.to_owned(),
+                output_file: output_path.to_owned(),
+                dry_run: false,
+                allow_debug: true,
+            },
+            true,
+        );
+
+        let mut sink = Memory::new();
+        cli.run(&mut sink).unwrap();
+        let text = String::from_utf8(sink.into_inner()).unwrap();
+        println!("output = {}", text);
+
+        // Check that the results file is generated.
+        assert!(output_path.exists());
+    }
+
+    /// Feature-disabled variant: the run must fail and name the missing `multi-vector` feature.
+    #[cfg(not(feature = "multi-vector"))]
+    fn run_multi_vector_integration(input_path: &std::path::Path, output_path: &std::path::Path) {
+        let command = Commands::Run {
+            input_file: input_path.to_owned(),
+            output_file: output_path.to_owned(),
+            dry_run: false,
+            allow_debug: true,
+        };
+        let cli = Cli::from_commands(command, true);
+        let mut output = Memory::new();
+
+        let err = cli.run(&mut output).unwrap_err();
+        println!("err = {:?}", err);
+        let output = String::from_utf8(output.into_inner()).unwrap();
+        // Print before asserting so the captured text is visible when the assertion fails.
+        println!("output = {}", output);
+        assert!(output.contains("\"multi-vector\" feature"));
+        // The output file must not have been created because the run failed.
+        assert!(!output_path.exists());
+    }
+
+    /// Runs `check verify` on the `multi-vector.json` example against its tolerance file.
+    #[test]
+    #[cfg(feature = "multi-vector")]
+    fn multi_vector_check_verify() {
+        let tolerances = project_directory()
+            .join("perf_test_inputs")
+            .join("multi-vector-tolerance.json");
+        let input_file = example_directory().join("multi-vector.json");
+        let verify = diskann_benchmark_runner::app::Check::Verify {
+            tolerances,
+            input_file,
+        };
+
+        let cli = Cli::from_commands(Commands::Check(verify), true);
+        let mut sink = Memory::new();
+        // `unwrap` fails the test if the check reports an error.
+        cli.run(&mut sink).unwrap();
+
+        let text = String::from_utf8(sink.into_inner()).unwrap();
+        println!("output = {}", text);
+    }
+
     #[test]
     fn quiet_suppresses_check_target_warning() {
         let cli = Cli::from_commands(Commands::Skeleton, true);
diff --git a/diskann-quantization/src/multi_vector/distance/factory.rs b/diskann-quantization/src/multi_vector/distance/factory.rs
new file mode 100644
index 000000000..78d15273e
--- /dev/null
+++ b/diskann-quantization/src/multi_vector/distance/factory.rs
@@ -0,0 +1,360 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT license.
+
+//! Factory + concrete `MaxSimKernel<T>` implementations for the multi-vector
+//! distance API. See [`build_max_sim_f32`] / [`build_max_sim_f16`] for the
+//! BYOTE ("bring your own type erasure") entry points.
+
+use diskann_utils::Reborrow;
+use diskann_vector::distance::InnerProduct;
+use diskann_vector::{DistanceFunctionMut, PureDistanceFunction};
+use diskann_wide::Architecture;
+use diskann_wide::arch::Scalar;
+#[cfg(target_arch = "aarch64")]
+use diskann_wide::arch::aarch64::Neon;
+#[cfg(target_arch = "x86_64")]
+use diskann_wide::arch::x86_64::{V3, V4};
+
+use super::isa::{MaxSimIsa, NotSupported};
+use super::kernel::{Erase, MaxSimKernel};
+use super::kernels::f16::F16Entry;
+use super::kernels::f32::F32Kernel;
+use super::max_sim::MaxSim;
+use crate::multi_vector::distance::QueryMatRef;
+use crate::multi_vector::{BlockTransposed, BlockTransposedRef, Mat, MatRef, Standard};
+
+// ─────────────────────────────────────────────────────────────────────────
+//  Prepared<A, Q> — concrete kernel for the arch-dispatched paths.
+// ─────────────────────────────────────────────────────────────────────────
+
+/// Concrete kernel: owns an arch token and a block-transposed prepared query.
+/// One generic `MaxSimKernel<T>` impl covers every arch (Scalar/V3/V4/Neon)
+/// for every supported element type (f32, f16) via the `Kernel<A>` / `Target3`
+/// dispatch in the `kernels` module.
+#[derive(Debug)]
+struct Prepared<A, Q> {
+    arch: A, // architecture token; kernels are launched through `arch.run3`
+    prepared: Q, // query in block-transposed form, reborrowed for every call
+}
+
+impl<A, const GROUP: usize> MaxSimKernel<f32> for Prepared<A, BlockTransposed<f32, GROUP>>
+where
+    A: Architecture,
+    F32Kernel<GROUP>: for<'a> diskann_wide::arch::Target3<
+            A,
+            (),
+            BlockTransposedRef<'a, f32, GROUP>,
+            MatRef<'a, Standard<f32>>,
+            &'a mut [f32],
+        >,
+{
+    fn nrows(&self) -> usize {
+        self.prepared.nrows()
+    }
+
+    fn compute_max_sim(&self, doc: MatRef<'_, Standard<f32>>, scores: &mut [f32]) {
+        // Scratch spans the padded row count; only the first `nrows()` slots are copied out.
+        let mut padded = vec![f32::MIN; self.prepared.padded_nrows()];
+        let query = self.prepared.reborrow();
+        self.arch.run3(F32Kernel::<GROUP>, query, doc, &mut padded);
+
+        // Write the sign-flipped kernel output for each logical row into `scores`.
+        let flipped = padded[..self.prepared.nrows()].iter().map(|&raw| -raw);
+        for (dst, value) in scores.iter_mut().zip(flipped) {
+            *dst = value;
+        }
+    }
+}
+
+impl<A, const GROUP: usize> MaxSimKernel<half::f16>
+    for Prepared<A, BlockTransposed<half::f16, GROUP>>
+where
+    A: Architecture,
+    F16Entry<GROUP>: for<'a> diskann_wide::arch::Target3<
+            A,
+            (),
+            BlockTransposedRef<'a, half::f16, GROUP>,
+            MatRef<'a, Standard<half::f16>>,
+            &'a mut [f32],
+        >,
+{
+    fn nrows(&self) -> usize {
+        self.prepared.nrows()
+    }
+
+    fn compute_max_sim(&self, doc: MatRef<'_, Standard<half::f16>>, scores: &mut [f32]) {
+        // Scratch spans the padded row count; only the first `nrows()` slots are copied out.
+        let mut padded = vec![f32::MIN; self.prepared.padded_nrows()];
+        let query = self.prepared.reborrow();
+        self.arch.run3(F16Entry::<GROUP>, query, doc, &mut padded);
+
+        // Write the sign-flipped kernel output for each logical row into `scores`.
+        let flipped = padded[..self.prepared.nrows()].iter().map(|&raw| -raw);
+        for (dst, value) in scores.iter_mut().zip(flipped) {
+            *dst = value;
+        }
+    }
+}
+
+// ─────────────────────────────────────────────────────────────────────────
+//  ReferenceKernel<T> — non-SIMD fallback that wraps MaxSim::evaluate.
+// ─────────────────────────────────────────────────────────────────────────
+
+/// `MaxSimIsa::Reference` path. Owns the query as a `Mat<Standard<T>>` and
+/// delegates to the existing `MaxSim` fallback per `compute_max_sim` call.
+struct ReferenceKernel<T: Copy> {
+    query: Mat<Standard<T>>, // owned copy of the query, filled in by `ReferenceKernel::new`
+}
+
+impl<T: Copy + std::fmt::Debug> std::fmt::Debug for ReferenceKernel<T> {
+    /// Shows only the row count; the query elements themselves are not printed.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let mut out = f.debug_struct("ReferenceKernel");
+        out.field("nrows", &self.query.num_vectors()).finish()
+    }
+}
+
+impl<T: Copy> ReferenceKernel<T> {
+    /// Copies `query` into an owned `Mat`, so the kernel holds no borrow of the caller's data.
+    ///
+    /// Elements are taken from `query.as_slice()` front to back, one per `from_fn` callback.
+    fn new(query: MatRef<'_, Standard<T>>) -> Self {
+        let mut elements = query.as_slice().iter().copied();
+        // Panics if `from_fn` asks for more elements than the slice holds.
+        let fill = || elements.next().expect("query slice shorter than its layout");
+        Self {
+            query: Mat::<Standard<T>>::from_fn(*query.repr(), fill),
+        }
+    }
+}
+
+impl<T> MaxSimKernel<T> for ReferenceKernel<T>
+where
+    T: Copy + Send + Sync + std::fmt::Debug + 'static,
+    InnerProduct: for<'a, 'b> PureDistanceFunction<&'a [T], &'b [T], f32>,
+{
+    fn nrows(&self) -> usize {
+        self.query.num_vectors()
+    }
+
+    fn compute_max_sim(&self, doc: MatRef<'_, Standard<T>>, scores: &mut [f32]) {
+        if scores.is_empty() {
+            return; // nothing to fill
+        }
+        let query: QueryMatRef<'_, Standard<T>> = self.query.as_view().into();
+        let Ok(mut max_sim) = MaxSim::new(scores) else {
+            return; // NOTE(review): failure is dropped; `scores` stays untouched.
+        };
+        let _ = max_sim.evaluate(query, doc); // NOTE(review): result discarded — confirm intent.
+    }
+}
+
+// ─────────────────────────────────────────────────────────────────────────
+//  BuildAndErase<E> — Target1 impls used by `dispatch1_no_features` (Auto).
+// ─────────────────────────────────────────────────────────────────────────
+
+/// Internal Target1 carrier used only by the `MaxSimIsa::Auto` arm of
+/// `build_max_sim_*`. `dispatch1_no_features` picks the highest available
+/// arch on the host CPU and calls the matching `Target1::run` below.
+struct BuildAndErase<E>(E); // field 0 is the caller's eraser, invoked as `self.0.erase(..)`
+
+// ───── f32 Target1 impls ─────
+
+impl<E: Erase<f32>> diskann_wide::arch::Target1<Scalar, E::Output, MatRef<'_, Standard<f32>>>
+    for BuildAndErase<E>
+{
+    fn run(self, arch: Scalar, query: MatRef<'_, Standard<f32>>) -> E::Output {
+        let prepared = BlockTransposed::<f32, 8>::from_matrix_view(query.as_matrix_view());
+        self.0.erase(Prepared { arch, prepared }) // Scalar token, GROUP = 8
+    }
+}
+
+#[cfg(target_arch = "x86_64")]
+impl<E: Erase<f32>> diskann_wide::arch::Target1<V3, E::Output, MatRef<'_, Standard<f32>>>
+    for BuildAndErase<E>
+{
+    fn run(self, arch: V3, query: MatRef<'_, Standard<f32>>) -> E::Output {
+        let prepared = BlockTransposed::<f32, 16>::from_matrix_view(query.as_matrix_view());
+        self.0.erase(Prepared { arch, prepared }) // V3 token, GROUP = 16
+    }
+}
+
+#[cfg(target_arch = "x86_64")]
+impl<E: Erase<f32>> diskann_wide::arch::Target1<V4, E::Output, MatRef<'_, Standard<f32>>>
+    for BuildAndErase<E>
+{
+    fn run(self, arch: V4, query: MatRef<'_, Standard<f32>>) -> E::Output {
+        // V4 has no dedicated kernel yet; retarget to V3 (hence GROUP = 16, as in the V3 impl).
+        let arch = arch.retarget();
+        let prepared = BlockTransposed::<f32, 16>::from_matrix_view(query.as_matrix_view());
+        self.0.erase(Prepared { arch, prepared }) // retargeted token, GROUP = 16
+    }
+}
+
+#[cfg(target_arch = "aarch64")]
+impl<E: Erase<f32>> diskann_wide::arch::Target1<Neon, E::Output, MatRef<'_, Standard<f32>>>
+    for BuildAndErase<E>
+{
+    fn run(self, arch: Neon, query: MatRef<'_, Standard<f32>>) -> E::Output {
+        // Neon has no dedicated kernel yet; retarget to Scalar (hence GROUP = 8).
+        let arch = arch.retarget();
+        let prepared = BlockTransposed::<f32, 8>::from_matrix_view(query.as_matrix_view());
+        self.0.erase(Prepared { arch, prepared }) // retargeted token, GROUP = 8
+    }
+}
+
+// ───── f16 Target1 impls ─────
+
+impl<E: Erase<half::f16>>
+    diskann_wide::arch::Target1<Scalar, E::Output, MatRef<'_, Standard<half::f16>>>
+    for BuildAndErase<E>
+{
+    fn run(self, arch: Scalar, query: MatRef<'_, Standard<half::f16>>) -> E::Output {
+        let prepared = BlockTransposed::<half::f16, 8>::from_matrix_view(query.as_matrix_view());
+        self.0.erase(Prepared { arch, prepared }) // Scalar token, GROUP = 8
+    }
+}
+
+#[cfg(target_arch = "x86_64")]
+impl<E: Erase<half::f16>>
+    diskann_wide::arch::Target1<V3, E::Output, MatRef<'_, Standard<half::f16>>>
+    for BuildAndErase<E>
+{
+    fn run(self, arch: V3, query: MatRef<'_, Standard<half::f16>>) -> E::Output {
+        let prepared = BlockTransposed::<half::f16, 16>::from_matrix_view(query.as_matrix_view());
+        self.0.erase(Prepared { arch, prepared }) // V3 token, GROUP = 16
+    }
+}
+
+#[cfg(target_arch = "x86_64")]
+impl<E: Erase<half::f16>>
+    diskann_wide::arch::Target1<V4, E::Output, MatRef<'_, Standard<half::f16>>>
+    for BuildAndErase<E>
+{
+    fn run(self, arch: V4, query: MatRef<'_, Standard<half::f16>>) -> E::Output {
+        let arch = arch.retarget(); // NOTE(review): presumably V4 -> V3, as in the f32 impl
+        let prepared = BlockTransposed::<half::f16, 16>::from_matrix_view(query.as_matrix_view());
+        self.0.erase(Prepared { arch, prepared }) // retargeted token, GROUP = 16
+    }
+}
+
+#[cfg(target_arch = "aarch64")]
+impl<E: Erase<half::f16>>
+    diskann_wide::arch::Target1<Neon, E::Output, MatRef<'_, Standard<half::f16>>>
+    for BuildAndErase<E>
+{
+    fn run(self, arch: Neon, query: MatRef<'_, Standard<half::f16>>) -> E::Output {
+        let arch = arch.retarget(); // NOTE(review): presumably Neon -> Scalar, as in the f32 impl
+        let prepared = BlockTransposed::<half::f16, 8>::from_matrix_view(query.as_matrix_view());
+        self.0.erase(Prepared { arch, prepared }) // retargeted token, GROUP = 8
+    }
+}
+
+// ─────────────────────────────────────────────────────────────────────────
+//  Factory functions.
+// ─────────────────────────────────────────────────────────────────────────
+
+/// Build a multi-vector MaxSim kernel for `f32` queries.
+///
+/// Dispatches on `isa`, constructs the corresponding concrete kernel, and
+/// hands it to `erase.erase(...)`. Returns [`NotSupported`] when the requested
+/// ISA cannot run on this build (e.g. AVX-512 unavailable; aarch64 on x86_64).
+pub fn build_max_sim_f32<E: Erase<f32>>(
+    isa: MaxSimIsa,
+    query: MatRef<'_, Standard<f32>>,
+    erase: E,
+) -> Result<E::Output, NotSupported> {
+    match isa {
+        MaxSimIsa::Auto => Ok(diskann_wide::arch::dispatch1_no_features(
+            BuildAndErase(erase),
+            query,
+        )),
+        MaxSimIsa::Scalar => Ok(Scalar::new().run1(BuildAndErase(erase), query)),
+        #[cfg(target_arch = "x86_64")]
+        MaxSimIsa::X86_64_V3 => {
+            let arch = V3::new_checked().ok_or(NotSupported {
+                isa,
+                reason: "AVX2/FMA unavailable on this CPU",
+            })?;
+            Ok(arch.run1(BuildAndErase(erase), query))
+        }
+        #[cfg(target_arch = "x86_64")]
+        MaxSimIsa::X86_64_V4 => {
+            let arch = V4::new_checked().ok_or(NotSupported {
+                isa,
+                reason: "AVX-512 unavailable on this CPU",
+            })?;
+            Ok(arch.run1(BuildAndErase(erase), query)) // `Target1<V4>` retargets before building
+        }
+        #[cfg(not(target_arch = "x86_64"))]
+        MaxSimIsa::X86_64_V3 | MaxSimIsa::X86_64_V4 => Err(NotSupported {
+            isa,
+            reason: "x86_64 target only",
+        }),
+        #[cfg(target_arch = "aarch64")]
+        MaxSimIsa::Neon => {
+            let arch = Neon::new_checked().ok_or(NotSupported {
+                isa,
+                reason: "Neon unavailable on this CPU",
+            })?;
+            Ok(arch.run1(BuildAndErase(erase), query)) // `Target1<Neon>` retargets before building
+        }
+        #[cfg(not(target_arch = "aarch64"))]
+        MaxSimIsa::Neon => Err(NotSupported {
+            isa,
+            reason: "aarch64 target only",
+        }),
+        MaxSimIsa::Reference => Ok(erase.erase(ReferenceKernel::<f32>::new(query))),
+    }
+}
+
+/// Build a multi-vector MaxSim kernel for `half::f16` queries. Same contract
+/// as [`build_max_sim_f32`].
+pub fn build_max_sim_f16<E: Erase<half::f16>>(
+    isa: MaxSimIsa,
+    query: MatRef<'_, Standard<half::f16>>,
+    erase: E,
+) -> Result<E::Output, NotSupported> {
+    match isa {
+        MaxSimIsa::Auto => Ok(diskann_wide::arch::dispatch1_no_features(
+            BuildAndErase(erase),
+            query,
+        )),
+        MaxSimIsa::Scalar => Ok(Scalar::new().run1(BuildAndErase(erase), query)),
+        #[cfg(target_arch = "x86_64")]
+        MaxSimIsa::X86_64_V3 => {
+            let arch = V3::new_checked().ok_or(NotSupported {
+                isa,
+                reason: "AVX2/FMA unavailable on this CPU",
+            })?;
+            Ok(arch.run1(BuildAndErase(erase), query))
+        }
+        #[cfg(target_arch = "x86_64")]
+        MaxSimIsa::X86_64_V4 => {
+            let arch = V4::new_checked().ok_or(NotSupported {
+                isa,
+                reason: "AVX-512 unavailable on this CPU",
+            })?;
+            Ok(arch.run1(BuildAndErase(erase), query)) // `Target1<V4>` retargets before building
+        }
+        #[cfg(not(target_arch = "x86_64"))]
+        MaxSimIsa::X86_64_V3 | MaxSimIsa::X86_64_V4 => Err(NotSupported {
+            isa,
+            reason: "x86_64 target only",
+        }),
+        #[cfg(target_arch = "aarch64")]
+        MaxSimIsa::Neon => {
+            let arch = Neon::new_checked().ok_or(NotSupported {
+                isa,
+                reason: "Neon unavailable on this CPU",
+            })?;
+            Ok(arch.run1(BuildAndErase(erase), query)) // `Target1<Neon>` retargets before building
+        }
+        #[cfg(not(target_arch = "aarch64"))]
+        MaxSimIsa::Neon => Err(NotSupported {
+            isa,
+            reason: "aarch64 target only",
+        }),
+        MaxSimIsa::Reference => Ok(erase.erase(ReferenceKernel::<half::f16>::new(query))),
+    }
+}
diff --git a/diskann-quantization/src/multi_vector/distance/isa.rs b/diskann-quantization/src/multi_vector/distance/isa.rs
new file mode 100644
index 000000000..49768bc48
--- /dev/null
+++ b/diskann-quantization/src/multi_vector/distance/isa.rs
@@ -0,0 +1,62 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT license.
+
+//! Instruction Set Architecture (ISA) selector for the multi-vector MaxSim
+//! factory.
+
+/// Instruction Set Architecture (ISA) selector for which multi-vector MaxSim
+/// kernel to build.
+///
+/// `#[non_exhaustive]` so adding a variant (e.g. for a new in-tree kernel) is
+/// not a breaking change. Deliberately **not** `Serialize`/`Deserialize` —
+/// callers wanting JSON support maintain their own shadow enum and convert
+/// via `From` / `TryFrom`, so the library is not pinned to a particular
+/// serialization format.
+#[non_exhaustive]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+#[allow(non_camel_case_types)] // variant names mirror ISA names such as `x86_64`
+pub enum MaxSimIsa {
+    /// Pick the highest ISA the host CPU supports. Displayed as `auto`.
+    Auto,
+    /// Pure-scalar (emulated SIMD) kernel — always available. Displayed as `scalar`.
+    Scalar,
+    /// x86_64 AVX2 + FMA. Displayed as `x86-64-v3`.
+    X86_64_V3,
+    /// x86_64 AVX-512. Displayed as `x86-64-v4`. No dedicated kernel yet; the factory retargets.
+    X86_64_V4,
+    /// AArch64 Neon. Displayed as `neon`. No dedicated kernel yet; the factory retargets.
+    Neon,
+    /// Non-SIMD reference fallback, displayed as `reference`. Slow; a correctness baseline.
+    Reference,
+}
+
+impl std::fmt::Display for MaxSimIsa {
+    /// Writes the lowercase, hyphenated name of the variant (e.g. `x86-64-v3`).
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str(match self {
+            Self::Auto => "auto",
+            Self::Scalar => "scalar",
+            Self::X86_64_V3 => "x86-64-v3",
+            Self::X86_64_V4 => "x86-64-v4",
+            Self::Neon => "neon",
+            Self::Reference => "reference",
+        })
+    }
+}
+
+/// Returned by `build_max_sim_*` when the requested ISA cannot be produced on this host.
+#[derive(Debug, Clone, Copy)]
+pub struct NotSupported {
+    /// ISA that was requested (e.g. x86_64 V4 on a non-AVX512 CPU, or Neon on x86_64).
+    pub isa: MaxSimIsa,
+    /// Static explanation of why it is unavailable, e.g. `"aarch64 target only"`.
+    pub reason: &'static str,
+}
+
+impl std::fmt::Display for NotSupported {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{} not supported: {}", self.isa, self.reason)
+    }
+}
+
+impl std::error::Error for NotSupported {} // default methods only; no `source` is provided
diff --git a/diskann-quantization/src/multi_vector/distance/kernel.rs b/diskann-quantization/src/multi_vector/distance/kernel.rs
new file mode 100644
index 000000000..a2fd530d9
--- /dev/null
+++ b/diskann-quantization/src/multi_vector/distance/kernel.rs
@@ -0,0 +1,53 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT license.
+
+//! Object-safe kernel boundary trait plus BYOTE visitor trait.
+
+use crate::multi_vector::{MatRef, Standard};
+
+/// Object-safe interface for computing per-query MaxSim scores.
+///
+/// # Contract
+///
+/// - `scores.len() == self.nrows()` (caller's precondition).
+/// - The implementation must populate **all** `nrows()` entries of `scores`.
+///   Callers that derive quantities from the full score vector (e.g. sums)
+///   would silently corrupt their result if any trailing entry were left
+///   unwritten.
+pub trait MaxSimKernel<T: Copy>: Send + Sync + std::fmt::Debug {
+    /// Number of query rows whose scores this kernel produces; the length `scores` must have.
+    fn nrows(&self) -> usize;
+
+    /// Compute per-query MaxSim scores against `doc` into `scores` (one entry per query row).
+    fn compute_max_sim(&self, doc: MatRef<'_, Standard<T>>, scores: &mut [f32]);
+}
+
+/// "Bring your own type erasure" visitor. The factory hands an implementation
+/// to `erase`, which decides how to package / type-erase it. Lets different
+/// callers produce different output shapes (e.g. `Box<dyn MaxSimKernel<T>>`,
+/// a chamfer-only closure, a batched evaluator, ...) from the same factory.
+///
+/// See [`BoxErase`] for the default impl used by most callers.
+pub trait Erase<T: Copy> {
+    /// What the visitor produces; `erase` returns exactly this type.
+    type Output;
+    /// Visit the concrete kernel. `K` is generic so the body sees its concrete
+    /// type and the compiler can inline it into the wrapper. Consumes the visitor.
+    fn erase<K: MaxSimKernel<T> + 'static>(self, kernel: K) -> Self::Output;
+}
+
+/// Default [`Erase`] impl: produces `Box<dyn MaxSimKernel<T>>`.
+///
+/// Use this when the caller just wants a heap-allocated kernel object behind
+/// a vtable. For custom packaging (chamfer-only, batched, composed), write
+/// your own `Erase` impl and pass it to the factory in place of `BoxErase`.
+#[derive(Debug, Clone, Copy)]
+pub struct BoxErase;
+
+impl<T: Copy + 'static> Erase<T> for BoxErase {
+    type Output = Box<dyn MaxSimKernel<T>>;
+
+    fn erase<K: MaxSimKernel<T> + 'static>(self, kernel: K) -> Self::Output {
+        Box::new(kernel) // coerced to the trait object by the `Self::Output` return type
+    }
+}
diff --git a/diskann-quantization/src/multi_vector/distance/kernels/mod.rs b/diskann-quantization/src/multi_vector/distance/kernels/mod.rs
index bd9121a24..55108698d 100644
--- a/diskann-quantization/src/multi_vector/distance/kernels/mod.rs
+++ b/diskann-quantization/src/multi_vector/distance/kernels/mod.rs
@@ -3,9 +3,8 @@
 
 //! Block-transposed SIMD kernels for multi-vector distance computation.
 //!
-//! This module provides a SIMD-accelerated implementation that uses block-transposed
-//! memory layout for **query** vectors (instead of documents), with documents remaining
-//! in row-major format.
+//! SIMD-accelerated implementation that uses block-transposed memory layout
+//! for **query** vectors, with documents remaining in row-major format.
 //!
 //! # Memory Layout
 //!
diff --git a/diskann-quantization/src/multi_vector/distance/mod.rs b/diskann-quantization/src/multi_vector/distance/mod.rs
index 853f60753..9afb070c5 100644
--- a/diskann-quantization/src/multi_vector/distance/mod.rs
+++ b/diskann-quantization/src/multi_vector/distance/mod.rs
@@ -5,15 +5,15 @@
 //!
 //! Provides asymmetric distance primitives for multi-vector search:
 //!
-//! - [`MaxSim`]: Per-query-vector maximum similarities.
-//! - [`Chamfer`]: Sum of MaxSim scores (asymmetric Chamfer distance).
-//! - [`QueryComputer`]: Architecture-dispatched query computer backed by
-//!   SIMD-accelerated block-transposed kernels.
+//! - [`MaxSim`]: per-query-vector maximum similarities.
+//! - [`Chamfer`]: sum of MaxSim scores (asymmetric Chamfer distance).
+//! - [`MaxSimKernel`]: object-safe interface implemented by every concrete
+//!   kernel constructed through [`build_max_sim_f32`] / [`build_max_sim_f16`].
+//! - [`Erase`]: BYOTE visitor — caller decides how to type-erase the kernel.
 //!
 //! The fallback path uses a double-loop kernel over
-//! [`InnerProduct`](diskann_vector::distance::InnerProduct). The optimised
-//! path (via [`QueryComputer`]) uses block-transposed layout with
-//! cache-tiled SIMD micro-kernels.
+//! [`InnerProduct`](diskann_vector::distance::InnerProduct). The factory
+//! functions return cache-tiled SIMD kernels selected by [`MaxSimIsa`].
 //!
 //! # Example
 //!
@@ -49,11 +49,15 @@
 //! // scores[1] =  0.0 (query[1] has no good match: max IP was 0)
 //! ```
 
+mod factory;
 mod fallback;
+mod isa;
+mod kernel;
 mod kernels;
 mod max_sim;
-mod query_computer;
 
+pub use factory::{build_max_sim_f16, build_max_sim_f32};
 pub use fallback::QueryMatRef;
+pub use isa::{MaxSimIsa, NotSupported};
+pub use kernel::{BoxErase, Erase, MaxSimKernel};
 pub use max_sim::{Chamfer, MaxSim, MaxSimError};
-pub use query_computer::QueryComputer;
diff --git a/diskann-quantization/src/multi_vector/distance/query_computer/f16.rs b/diskann-quantization/src/multi_vector/distance/query_computer/f16.rs
deleted file mode 100644
index 9bb348a6a..000000000
--- a/diskann-quantization/src/multi_vector/distance/query_computer/f16.rs
+++ /dev/null
@@ -1,100 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT license.
-
-use diskann_wide::Architecture;
-use diskann_wide::arch::Scalar;
-#[cfg(target_arch = "aarch64")]
-use diskann_wide::arch::aarch64::Neon;
-#[cfg(target_arch = "x86_64")]
-use diskann_wide::arch::x86_64::{V3, V4};
-
-use super::{DynQueryComputer, Prepared, QueryComputer, build_prepared};
-use crate::multi_vector::distance::kernels::f16::F16Entry;
-use crate::multi_vector::{BlockTransposed, BlockTransposedRef, MatRef, Standard};
-use diskann_utils::Reborrow;
-
-impl QueryComputer<half::f16> {
-    /// Build an f16 query computer, selecting the optimal architecture and
-    /// GROUP for the current CPU at runtime.
-    pub fn new(query: MatRef<'_, Standard<half::f16>>) -> Self {
-        diskann_wide::arch::dispatch1_no_features(BuildComputer, query)
-    }
-}
-
-impl<A, const GROUP: usize> DynQueryComputer<half::f16>
-    for Prepared<A, BlockTransposed<half::f16, GROUP>>
-where
-    A: Architecture,
-    F16Entry<GROUP>: for<'a> diskann_wide::arch::Target3<
-            A,
-            (),
-            BlockTransposedRef<'a, half::f16, GROUP>,
-            MatRef<'a, Standard<half::f16>>,
-            &'a mut [f32],
-        >,
-{
-    fn compute_max_sim(&self, doc: MatRef<'_, Standard<half::f16>>, scores: &mut [f32]) {
-        let mut scratch = vec![f32::MIN; self.prepared.padded_nrows()];
-        self.arch.run3(
-            F16Entry::<GROUP>,
-            self.prepared.reborrow(),
-            doc,
-            &mut scratch,
-        );
-        for (dst, &src) in scores.iter_mut().zip(&scratch[..self.prepared.nrows()]) {
-            *dst = -src;
-        }
-    }
-
-    fn nrows(&self) -> usize {
-        self.prepared.nrows()
-    }
-}
-
-#[derive(Debug, Clone, Copy)]
-pub(super) struct BuildComputer;
-
-impl diskann_wide::arch::Target1<Scalar, QueryComputer<half::f16>, MatRef<'_, Standard<half::f16>>>
-    for BuildComputer
-{
-    fn run(self, arch: Scalar, query: MatRef<'_, Standard<half::f16>>) -> QueryComputer<half::f16> {
-        QueryComputer {
-            inner: Box::new(build_prepared::<half::f16, _, 8>(arch, query)),
-        }
-    }
-}
-
-#[cfg(target_arch = "x86_64")]
-impl diskann_wide::arch::Target1<V3, QueryComputer<half::f16>, MatRef<'_, Standard<half::f16>>>
-    for BuildComputer
-{
-    fn run(self, arch: V3, query: MatRef<'_, Standard<half::f16>>) -> QueryComputer<half::f16> {
-        QueryComputer {
-            inner: Box::new(build_prepared::<half::f16, _, 16>(arch, query)),
-        }
-    }
-}
-
-#[cfg(target_arch = "x86_64")]
-impl diskann_wide::arch::Target1<V4, QueryComputer<half::f16>, MatRef<'_, Standard<half::f16>>>
-    for BuildComputer
-{
-    fn run(self, arch: V4, query: MatRef<'_, Standard<half::f16>>) -> QueryComputer<half::f16> {
-        let arch = arch.retarget();
-        QueryComputer {
-            inner: Box::new(build_prepared::<half::f16, _, 16>(arch, query)),
-        }
-    }
-}
-
-#[cfg(target_arch = "aarch64")]
-impl diskann_wide::arch::Target1<Neon, QueryComputer<half::f16>, MatRef<'_, Standard<half::f16>>>
-    for BuildComputer
-{
-    fn run(self, arch: Neon, query: MatRef<'_, Standard<half::f16>>) -> QueryComputer<half::f16> {
-        let arch = arch.retarget();
-        QueryComputer {
-            inner: Box::new(build_prepared::<half::f16, _, 8>(arch, query)),
-        }
-    }
-}
diff --git a/diskann-quantization/src/multi_vector/distance/query_computer/f32.rs b/diskann-quantization/src/multi_vector/distance/query_computer/f32.rs
deleted file mode 100644
index 9ff16b8b4..000000000
--- a/diskann-quantization/src/multi_vector/distance/query_computer/f32.rs
+++ /dev/null
@@ -1,101 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT license.
-
-use diskann_wide::Architecture;
-use diskann_wide::arch::Scalar;
-#[cfg(target_arch = "aarch64")]
-use diskann_wide::arch::aarch64::Neon;
-#[cfg(target_arch = "x86_64")]
-use diskann_wide::arch::x86_64::{V3, V4};
-
-use super::{DynQueryComputer, Prepared, QueryComputer, build_prepared};
-use crate::multi_vector::distance::kernels::f32::F32Kernel;
-use crate::multi_vector::{BlockTransposed, BlockTransposedRef, MatRef, Standard};
-use diskann_utils::Reborrow;
-
-impl QueryComputer<f32> {
-    /// Build an f32 query computer, selecting the optimal architecture and
-    /// GROUP for the current CPU at runtime.
-    pub fn new(query: MatRef<'_, Standard<f32>>) -> Self {
-        diskann_wide::arch::dispatch1_no_features(BuildComputer, query)
-    }
-}
-
-impl<A, const GROUP: usize> DynQueryComputer<f32> for Prepared<A, BlockTransposed<f32, GROUP>>
-where
-    A: Architecture,
-    F32Kernel<GROUP>: for<'a> diskann_wide::arch::Target3<
-            A,
-            (),
-            BlockTransposedRef<'a, f32, GROUP>,
-            MatRef<'a, Standard<f32>>,
-            &'a mut [f32],
-        >,
-{
-    fn compute_max_sim(&self, doc: MatRef<'_, Standard<f32>>, scores: &mut [f32]) {
-        let mut scratch = vec![f32::MIN; self.prepared.padded_nrows()];
-        self.arch.run3(
-            F32Kernel::<GROUP>,
-            self.prepared.reborrow(),
-            doc,
-            &mut scratch,
-        );
-        for (dst, &src) in scores.iter_mut().zip(&scratch[..self.prepared.nrows()]) {
-            *dst = -src;
-        }
-    }
-
-    fn nrows(&self) -> usize {
-        self.prepared.nrows()
-    }
-}
-
-#[derive(Debug, Clone, Copy)]
-pub(super) struct BuildComputer;
-
-impl diskann_wide::arch::Target1<Scalar, QueryComputer<f32>, MatRef<'_, Standard<f32>>>
-    for BuildComputer
-{
-    fn run(self, arch: Scalar, query: MatRef<'_, Standard<f32>>) -> QueryComputer<f32> {
-        QueryComputer {
-            inner: Box::new(build_prepared::<f32, _, 8>(arch, query)),
-        }
-    }
-}
-
-#[cfg(target_arch = "x86_64")]
-impl diskann_wide::arch::Target1<V3, QueryComputer<f32>, MatRef<'_, Standard<f32>>>
-    for BuildComputer
-{
-    fn run(self, arch: V3, query: MatRef<'_, Standard<f32>>) -> QueryComputer<f32> {
-        QueryComputer {
-            inner: Box::new(build_prepared::<f32, _, 16>(arch, query)),
-        }
-    }
-}
-
-#[cfg(target_arch = "x86_64")]
-impl diskann_wide::arch::Target1<V4, QueryComputer<f32>, MatRef<'_, Standard<f32>>>
-    for BuildComputer
-{
-    fn run(self, arch: V4, query: MatRef<'_, Standard<f32>>) -> QueryComputer<f32> {
-        // V4 delegates to V3 — the V3 micro-kernel is valid on V4 hardware.
-        let arch = arch.retarget();
-        QueryComputer {
-            inner: Box::new(build_prepared::<f32, _, 16>(arch, query)),
-        }
-    }
-}
-
-#[cfg(target_arch = "aarch64")]
-impl diskann_wide::arch::Target1<Neon, QueryComputer<f32>, MatRef<'_, Standard<f32>>>
-    for BuildComputer
-{
-    fn run(self, arch: Neon, query: MatRef<'_, Standard<f32>>) -> QueryComputer<f32> {
-        // Neon delegates to Scalar.
-        let arch = arch.retarget();
-        QueryComputer {
-            inner: Box::new(build_prepared::<f32, _, 8>(arch, query)),
-        }
-    }
-}
diff --git a/diskann-quantization/src/multi_vector/distance/query_computer/mod.rs b/diskann-quantization/src/multi_vector/distance/query_computer/mod.rs
deleted file mode 100644
index fbe84fcd3..000000000
--- a/diskann-quantization/src/multi_vector/distance/query_computer/mod.rs
+++ /dev/null
@@ -1,290 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT license.
-
-//! Architecture-opaque query computer with runtime dispatch.
-//!
-//! [`QueryComputer`] wraps a block-transposed query and a captured
-//! architecture token behind a trait-object vtable. CPU detection happens
-//! once at construction; every subsequent distance call goes through
-//! [`Architecture::run3`](diskann_wide::Architecture::run3) with full
-//! `#[target_feature]` propagation — no re-dispatch and no enum matching
-//! on the hot path.
-//!
-//! # Usage
-//!
-//! ```
-//! use diskann_quantization::multi_vector::{
-//!     QueryComputer, MatRef, Standard,
-//! };
-//!
-//! let query_data = [1.0f32, 0.0, 0.0, 1.0];
-//! let doc_data = [1.0f32, 0.0, 0.0, 1.0];
-//!
-//! let query = MatRef::new(Standard::new(2, 2).unwrap(), &query_data).unwrap();
-//! let doc = MatRef::new(Standard::new(2, 2).unwrap(), &doc_data).unwrap();
-//!
-//! // Build — runtime detects arch, picks optimal GROUP, captures both
-//! let computer = QueryComputer::<f32>::new(query);
-//!
-//! // Distance — vtable → arch.run3 with target_feature propagation
-//! let dist = computer.chamfer(doc);
-//! assert_eq!(dist, -2.0);
-//! ```
-
-mod f16;
-mod f32;
-
-use crate::multi_vector::{BlockTransposed, MatRef, Standard};
-
-/// Architecture-dispatched query computer for multi-vector distance.
-#[derive(Debug)]
-pub struct QueryComputer<T: Copy> {
-    inner: Box<dyn DynQueryComputer<T>>,
-}
-
-impl<T: Copy> QueryComputer<T> {
-    /// Number of logical (non-padded) query vectors.
-    #[inline]
-    pub fn nrows(&self) -> usize {
-        self.inner.nrows()
-    }
-
-    /// Compute Chamfer distance (sum of per-query max similarities, negated).
-    ///
-    /// Returns `0.0` if the document has zero vectors.
-    pub fn chamfer(&self, doc: MatRef<'_, Standard<T>>) -> f32 {
-        let nq = self.nrows();
-        if doc.num_vectors() == 0 {
-            return 0.0;
-        }
-        let mut scores = vec![0.0f32; nq];
-        self.max_sim(doc, &mut scores);
-        scores.iter().sum()
-    }
-
-    /// Compute per-query-vector max similarities into `scores`.
-    ///
-    /// `scores` must have length equal to [`nrows()`](Self::nrows).
-    /// Each entry is the negated max inner product for that query vector.
-    ///
-    /// # Panics
-    ///
-    /// Panics if `scores.len() != self.nrows()`.
-    pub fn max_sim(&self, doc: MatRef<'_, Standard<T>>, scores: &mut [f32]) {
-        let nq = self.nrows();
-        assert_eq!(
-            scores.len(),
-            nq,
-            "scores buffer not right size: {} != {}",
-            scores.len(),
-            nq
-        );
-
-        if doc.num_vectors() == 0 {
-            return;
-        }
-
-        self.inner.compute_max_sim(doc, scores);
-    }
-}
-
-trait DynQueryComputer<T: Copy>: std::fmt::Debug + Send + Sync {
-    fn compute_max_sim(&self, doc: MatRef<'_, Standard<T>>, scores: &mut [f32]);
-    fn nrows(&self) -> usize;
-}
-
-#[derive(Debug)]
-struct Prepared<A, Q> {
-    arch: A,
-    prepared: Q,
-}
-
-fn build_prepared<T: Copy + Default, A, const GROUP: usize>(
-    arch: A,
-    query: MatRef<'_, Standard<T>>,
-) -> Prepared<A, BlockTransposed<T, GROUP>> {
-    let prepared = BlockTransposed::<T, GROUP>::from_matrix_view(query.as_matrix_view());
-    Prepared { arch, prepared }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use crate::multi_vector::{Chamfer, MaxSim, QueryMatRef};
-    use diskann_vector::distance::InnerProduct;
-    use diskann_vector::{DistanceFunctionMut, PureDistanceFunction};
-
-    trait FromF32 {
-        fn from_f32(v: f32) -> Self;
-    }
-
-    impl FromF32 for f32 {
-        fn from_f32(v: f32) -> Self {
-            v
-        }
-    }
-
-    impl FromF32 for half::f16 {
-        fn from_f32(v: f32) -> Self {
-            diskann_wide::cast_f32_to_f16(v)
-        }
-    }
-
-    fn make_mat<T: Copy>(data: &[T], nrows: usize, ncols: usize) -> MatRef<'_, Standard<T>> {
-        MatRef::new(Standard::new(nrows, ncols).unwrap(), data).unwrap()
-    }
-
-    fn make_test_data<T: FromF32>(len: usize, ceil: usize, shift: usize) -> Vec<T> {
-        (0..len)
-            .map(|v| T::from_f32(((v + shift) % ceil) as f32))
-            .collect()
-    }
-
-    /// Shapes for the `chamfer_matches_fallback` / `max_sim_matches_fallback`
-    /// agreement checks: (num_queries, num_docs, dim).
-    ///
-    /// This matrix targets the API-layer wiring that lives above the
-    /// kernel — `QueryComputer::new` query setup, `chamfer` row
-    /// summation, `max_sim` per-row writeback, and the f16 query
-    /// conversion path — not kernel correctness. A small
-    /// representative set is sufficient because exhaustive shape
-    /// coverage (panel boundaries, B-remainder classes, prime `k`,
-    /// degenerate dims) is pinned one layer below in
-    /// `kernels::tiled_reduce::tests::NAIVE_CASES`, and structural
-    /// loop-path coverage in `tiled_reduce_all_loop_paths_match_naive`.
-    const TEST_CASES: &[(usize, usize, usize)] = &[
-        (1, 1, 4), // Degenerate
-        (5, 3, 5), // Prime k; nq > 1 and nd > 1 exercise chamfer summation
-        //              and per-row max_sim writeback on a non-trivial shape
-        (17, 4, 64), // A-panel remainder crossing both Scalar and V3 panel widths
-        (16, 6, 32), // B-remainder ≠ 1 (V3 b_remainder = 2)
-    ];
-
-    fn check_chamfer_matches<T: Copy + FromF32>(
-        build: fn(MatRef<'_, Standard<T>>) -> QueryComputer<T>,
-        tol: f32,
-        label: &str,
-    ) where
-        InnerProduct: for<'a, 'b> PureDistanceFunction<&'a [T], &'b [T], f32>,
-    {
-        for &(nq, nd, dim) in TEST_CASES {
-            let query_data = make_test_data::<T>(nq * dim, dim, dim / 2);
-            let doc_data = make_test_data::<T>(nd * dim, dim, dim);
-
-            let query = make_mat(&query_data, nq, dim);
-            let doc = make_mat(&doc_data, nd, dim);
-
-            let expected = Chamfer::evaluate(QueryMatRef::from(query), doc);
-            let actual = build(query).chamfer(doc);
-
-            assert!(
-                (actual - expected).abs() < tol,
-                "{label}Chamfer mismatch for ({nq},{nd},{dim}): actual={actual}, expected={expected}",
-            );
-        }
-    }
-
-    fn check_max_sim_matches<T: Copy + FromF32>(
-        build: fn(MatRef<'_, Standard<T>>) -> QueryComputer<T>,
-        tol: f32,
-        label: &str,
-    ) where
-        InnerProduct: for<'a, 'b> PureDistanceFunction<&'a [T], &'b [T], f32>,
-    {
-        for &(nq, nd, dim) in TEST_CASES {
-            let query_data = make_test_data::<T>(nq * dim, dim, dim / 2);
-            let doc_data = make_test_data::<T>(nd * dim, dim, dim);
-
-            let query = make_mat(&query_data, nq, dim);
-            let doc = make_mat(&doc_data, nd, dim);
-
-            let mut expected_scores = vec![0.0f32; nq];
-            let _ = MaxSim::new(&mut expected_scores)
-                .unwrap()
-                .evaluate(QueryMatRef::from(query), doc);
-
-            let computer = build(query);
-            let mut actual_scores = vec![0.0f32; nq];
-            computer.max_sim(doc, &mut actual_scores);
-
-            for i in 0..nq {
-                assert!(
-                    (actual_scores[i] - expected_scores[i]).abs() < tol,
-                    "{label}MaxSim[{i}] mismatch for ({nq},{nd},{dim}): actual={}, expected={}",
-                    actual_scores[i],
-                    expected_scores[i],
-                );
-            }
-        }
-    }
-
-    #[test]
-    fn query_computer_dimensions() {
-        let data = vec![1.0f32; 5 * 8];
-        let query = make_mat(&data, 5, 8);
-        let computer = QueryComputer::<f32>::new(query);
-
-        assert_eq!(computer.nrows(), 5);
-    }
-
-    #[test]
-    fn query_computer_f16_dimensions() {
-        let data = vec![diskann_wide::cast_f32_to_f16(1.0); 5 * 8];
-        let query = make_mat(data.as_slice(), 5, 8);
-        let computer = QueryComputer::<half::f16>::new(query);
-
-        assert_eq!(computer.nrows(), 5);
-    }
-
-    #[test]
-    fn chamfer_with_zero_docs() {
-        let query = make_mat(&[1.0f32, 0.0, 0.0, 1.0], 2, 2);
-        let computer = QueryComputer::<f32>::new(query);
-        let doc = make_mat(&[], 0, 2);
-        assert_eq!(computer.chamfer(doc), 0.0);
-    }
-
-    #[test]
-    fn max_sim_with_zero_docs() {
-        let query = make_mat(&[1.0f32, 0.0, 0.0, 1.0], 2, 2);
-        let computer = QueryComputer::<f32>::new(query);
-        let doc = make_mat::<f32>(&[], 0, 2);
-        let mut scores = vec![0.0f32; 2];
-        computer.max_sim(doc, &mut scores);
-        // With zero docs the scores buffer is left untouched.
-        for &s in &scores {
-            assert_eq!(s, 0.0, "zero-doc MaxSim should leave scores untouched");
-        }
-    }
-
-    #[test]
-    #[should_panic(expected = "scores buffer not right size")]
-    fn max_sim_panics_on_size_mismatch() {
-        let query = make_mat(&[1.0f32, 2.0, 3.0, 4.0], 2, 2);
-        let computer = QueryComputer::<f32>::new(query);
-        let doc = make_mat(&[1.0, 1.0], 1, 2);
-        let mut scores = vec![0.0f32; 3]; // Wrong size
-        computer.max_sim(doc, &mut scores);
-    }
-
-    macro_rules! test_matches_fallback {
-        ($mod_name:ident, $ty:ty, $tol:expr, $label:literal) => {
-            mod $mod_name {
-                use super::*;
-
-                #[test]
-                fn chamfer_matches_fallback() {
-                    check_chamfer_matches(QueryComputer::<$ty>::new, $tol, $label);
-                }
-
-                #[test]
-                fn max_sim_matches_fallback() {
-                    check_max_sim_matches(QueryComputer::<$ty>::new, $tol, $label);
-                }
-            }
-        };
-    }
-
-    test_matches_fallback!(f32, f32, 1e-10, "f32 ");
-    test_matches_fallback!(f16, half::f16, 1e-10, "f16 ");
-}
diff --git a/diskann-quantization/src/multi_vector/matrix.rs b/diskann-quantization/src/multi_vector/matrix.rs
index 70629d44c..31c430995 100644
--- a/diskann-quantization/src/multi_vector/matrix.rs
+++ b/diskann-quantization/src/multi_vector/matrix.rs
@@ -712,6 +712,13 @@ impl<T: NewCloned> Clone for Mat<T> {
 }
 
 impl<T: Copy> Mat<Standard<T>> {
+    /// Construct a [`Mat`] by calling `f` once per element in row-major order.
+    pub fn from_fn<F: FnMut() -> T>(repr: Standard<T>, mut f: F) -> Self {
+        let b: Box<[T]> = (0..repr.num_elements()).map(|_| f()).collect();
+        // SAFETY: `b` has length `repr.num_elements()` by construction.
+        unsafe { repr.box_to_mat(b) }
+    }
+
     /// Returns the raw dimension (columns) of the vectors in the matrix.
     #[inline]
     pub fn vector_dim(&self) -> usize {
diff --git a/diskann-quantization/src/multi_vector/mod.rs b/diskann-quantization/src/multi_vector/mod.rs
index 3670b1aaf..d2ad0e7bc 100644
--- a/diskann-quantization/src/multi_vector/mod.rs
+++ b/diskann-quantization/src/multi_vector/mod.rs
@@ -20,9 +20,11 @@
 //! | [`BlockTransposedRef`] | Immutable view of a block-transposed matrix |
 //! | [`BlockTransposedMut`] | Mutable view of a block-transposed matrix |
 //! | [`QueryMatRef`] | Query wrapper for asymmetric distances |
-//! | [`QueryComputer`] | Architecture-dispatched SIMD query computer |
 //! | [`MaxSim`] | Per-query-vector max similarity computation |
 //! | [`Chamfer`] | Asymmetric Chamfer distance (sum of MaxSim) |
+//! | [`MaxSimKernel`] | Object-safe kernel returned by [`build_max_sim_f32`] / [`build_max_sim_f16`] |
+//! | [`MaxSimIsa`] | ISA selector for the factory functions |
+//! | [`Erase`] | BYOTE visitor used by the factory |
 //!
 //! # Example
 //!
@@ -72,7 +74,10 @@ pub mod distance;
 pub(crate) mod matrix;
 
 pub use block_transposed::{BlockTransposed, BlockTransposedMut, BlockTransposedRef};
-pub use distance::{Chamfer, MaxSim, MaxSimError, QueryComputer, QueryMatRef};
+pub use distance::{
+    BoxErase, Chamfer, Erase, MaxSim, MaxSimError, MaxSimIsa, MaxSimKernel, NotSupported,
+    QueryMatRef, build_max_sim_f16, build_max_sim_f32,
+};
 pub use matrix::{
     Defaulted, LayoutError, Mat, MatMut, MatRef, NewCloned, NewMut, NewOwned, NewRef, Overflow,
     Repr, ReprMut, ReprOwned, SliceError, Standard,

From 597330a1daf273f59fff6b55f3e0d79e110b47bd Mon Sep 17 00:00:00 2001
From: Suryansh Gupta <suryansh.gupta2000@gmail.com>
Date: Tue, 19 May 2026 23:01:58 +0530
Subject: [PATCH 12/13] Address review comments

---
 .../src/backend/multi_vector/README.md        |  43 --
 .../src/backend/multi_vector/driver.rs        |  18 +-
 .../src/backend/multi_vector/kernels.rs       | 286 ++++++-------
 .../src/backend/multi_vector/mod.rs           | 108 ++++-
 .../src/multi_vector/distance/factory.rs      | 377 +++++++++++++-----
 .../src/multi_vector/distance/isa.rs          |   6 +-
 .../src/multi_vector/distance/mod.rs          |   8 +-
 diskann-quantization/src/multi_vector/mod.rs  |   7 +-
 8 files changed, 505 insertions(+), 348 deletions(-)
 delete mode 100644 diskann-benchmark/src/backend/multi_vector/README.md

diff --git a/diskann-benchmark/src/backend/multi_vector/README.md b/diskann-benchmark/src/backend/multi_vector/README.md
deleted file mode 100644
index f24d38ff7..000000000
--- a/diskann-benchmark/src/backend/multi_vector/README.md
+++ /dev/null
@@ -1,43 +0,0 @@
-# Multi-vector benchmark — kernel-author workflow
-
-The multi-vector benchmark dispatches through `diskann-quantization`'s
-`build_max_sim_f32` / `build_max_sim_f16` factory. Selection is driven by a
-non-exhaustive `MaxSimIsa` enum. To add a new in-tree experimental kernel,
-extend the enum + factory + the benchmark's shadow enum.
-
-## Steps
-
-1. **Library: variant + factory arm.** In
-   `diskann-quantization::multi_vector::distance`:
-   - Add a new variant to `MaxSimIsa` (in `isa.rs`).
-   - Implement `MaxSimKernel<T>` for your kernel struct (in `factory.rs`,
-     next to `Prepared` and `ReferenceKernel`).
-   - Add a matching arm to `build_max_sim_f32` and/or `build_max_sim_f16`
-     that constructs your kernel and hands it to `erase.erase(...)`.
-
-2. **Benchmark: matching shadow variant.** In
-   `diskann-benchmark::inputs::multi_vector`:
-   - Add the same variant to `BenchIsa`.
-   - Add the matching arm to `From<BenchIsa> for MaxSimIsa`.
-
-3. **Run.** Set `"isa": "your-variant"` in the JSON job; the existing
-   `KernelF32` / `KernelF16` benchmark entries handle the rest. No new
-   `Benchmark` registration required.
-
-## Why two enums?
-
-`MaxSimIsa` (library) and `BenchIsa` (benchmark) are kept separate so the
-library doesn't pin its public API on a serde version or a particular JSON
-shape. The benchmark owns its kebab-case JSON layout; the library is
-serde-agnostic. Mirroring variant-for-variant is intentional — small price
-for keeping the library boundary clean.
-
-## Background
-
-The factory follows the BYOTE ("Bring your own type erasure") pattern
-described in [RFC #1068]. If you want your kernel packaged as something
-other than `Box<dyn MaxSimKernel<T>>` (e.g. composed with chamfer summing,
-or wrapped in a custom thin trait), implement your own `Erase<T>` and pass
-it to the factory in place of `BoxErase`.
-
-[RFC #1068]: https://github.com/microsoft/DiskANN/pull/1068
diff --git a/diskann-benchmark/src/backend/multi_vector/driver.rs b/diskann-benchmark/src/backend/multi_vector/driver.rs
index c9ac8b488..e59f24ac2 100644
--- a/diskann-benchmark/src/backend/multi_vector/driver.rs
+++ b/diskann-benchmark/src/backend/multi_vector/driver.rs
@@ -27,6 +27,7 @@ use rand::{
 use serde::{Deserialize, Serialize};
 
 use crate::inputs::multi_vector::Run;
+use crate::utils::DisplayWrapper;
 
 //////////////////////
 // Tolerance        //
@@ -137,10 +138,7 @@ impl<T: Copy> Distance<T> for BoxedKernel<T> {
 // Timing harness   //
 //////////////////////
 
-fn run_loops<F>(run: &Run, mut body: F) -> RunResult
-where
-    F: FnMut(),
-{
+fn run_loops(run: &Run, body: &mut dyn FnMut()) -> RunResult {
     let mut latencies = Vec::with_capacity(run.num_measurements.get());
 
     for _ in 0..run.num_measurements.get() {
@@ -168,7 +166,7 @@ pub(super) fn run_with_distance<T: Copy>(
     dist: &dyn Distance<T>,
 ) -> RunResult {
     let mut scores = vec![0.0f32; run.num_query_vectors.get()];
-    run_loops(run, || {
+    run_loops(run, &mut || {
         dist.max_sim(doc, &mut scores);
         std::hint::black_box(&mut scores);
     })
@@ -178,16 +176,6 @@ pub(super) fn run_with_distance<T: Copy>(
 // Result types     //
 //////////////////////
 
-#[derive(Debug, Clone, Copy)]
-pub(super) struct DisplayWrapper<'a, T: ?Sized>(pub(super) &'a T);
-
-impl<T: ?Sized> std::ops::Deref for DisplayWrapper<'_, T> {
-    type Target = T;
-    fn deref(&self) -> &T {
-        self.0
-    }
-}
-
 #[derive(Debug, Serialize, Deserialize)]
 pub(super) struct RunResult {
     /// The configuration for this run.
diff --git a/diskann-benchmark/src/backend/multi_vector/kernels.rs b/diskann-benchmark/src/backend/multi_vector/kernels.rs
index c9359b705..cc4e63b89 100644
--- a/diskann-benchmark/src/backend/multi_vector/kernels.rs
+++ b/diskann-benchmark/src/backend/multi_vector/kernels.rs
@@ -5,216 +5,158 @@
 
 //! `Benchmark` impls for the multi-vector MaxSim factory.
 //!
-//! One entry per element type. Each `try_match` checks `element_type` only;
-//! the `isa` field is passed to the library factory at run time. ISA
-//! unavailability surfaces as `NotSupported`, which becomes a job-level
-//! error.
+//! A single generic [`Kernel<T>`] carrier supplies the `Benchmark` and
+//! `Regression` impls for every element type accepted by the library's
+//! [`MaxSimElement`] sealed trait. Each `try_match` checks `element_type`
+//! only; the JSON `isa` field is passed to the library factory at run time,
+//! and arch unavailability surfaces as a job-level error via
+//! [`NotSupported`](diskann_quantization::multi_vector::NotSupported).
 
 use std::io::Write;
+use std::marker::PhantomData;
 
 use diskann_benchmark_runner::{
     benchmark::{FailureScore, MatchScore, PassFail, Regression},
     utils::{datatype::AsDataType, num::relative_change},
     Benchmark, Checkpoint, Output, Registry,
 };
-use diskann_quantization::multi_vector::{
-    build_max_sim_f16, build_max_sim_f32, BoxErase, MaxSimKernel,
-};
+use diskann_quantization::multi_vector::{build_max_sim, BoxErase, MaxSimElement};
 use rand::distr::{Distribution, StandardUniform};
 
 use super::driver::{
-    run_with_distance, BoxedKernel, CheckResult, Comparison, Data, DisplayWrapper,
-    MultiVectorTolerance, RunResult,
+    run_with_distance, BoxedKernel, CheckResult, Comparison, Data, MultiVectorTolerance, RunResult,
 };
 use crate::inputs::multi_vector::MultiVectorOp;
+use crate::utils::DisplayWrapper;
 
 // ─────────────────────────────────────────────────────────────────────────
-//  Per-element-type `Benchmark` carriers.
+//  Kernel<T> — generic carrier registered once per element type.
 // ─────────────────────────────────────────────────────────────────────────
 
 #[derive(Debug)]
-pub(super) struct KernelF32;
-
-#[derive(Debug)]
-pub(super) struct KernelF16;
-
-/// Per-element-type bridge: factory entry name + factory call.
-///
-/// Data-type matching (`DATA_TYPE`, `is_match`, `describe`) comes from the
-/// framework's [`AsDataType`] trait, which is already implemented for `f32`,
-/// `half::f16`, etc.
-trait ElementType: AsDataType + Copy {
-    const ENTRY_NAME: &'static str;
-    fn build(
-        isa: diskann_quantization::multi_vector::MaxSimIsa,
-        query: diskann_quantization::multi_vector::MatRef<
-            '_,
-            diskann_quantization::multi_vector::Standard<Self>,
-        >,
-    ) -> Result<Box<dyn MaxSimKernel<Self>>, diskann_quantization::multi_vector::NotSupported>;
-}
+pub(super) struct Kernel<T>(PhantomData<T>);
 
-impl ElementType for f32 {
-    const ENTRY_NAME: &'static str = "multi-vector-op-f32";
-    fn build(
-        isa: diskann_quantization::multi_vector::MaxSimIsa,
-        query: diskann_quantization::multi_vector::MatRef<
-            '_,
-            diskann_quantization::multi_vector::Standard<f32>,
-        >,
-    ) -> Result<Box<dyn MaxSimKernel<f32>>, diskann_quantization::multi_vector::NotSupported> {
-        build_max_sim_f32(isa, query, BoxErase)
+impl<T> Kernel<T> {
+    pub(super) const fn new() -> Self {
+        Self(PhantomData)
     }
 }
 
-impl ElementType for half::f16 {
-    const ENTRY_NAME: &'static str = "multi-vector-op-f16";
-    fn build(
-        isa: diskann_quantization::multi_vector::MaxSimIsa,
-        query: diskann_quantization::multi_vector::MatRef<
-            '_,
-            diskann_quantization::multi_vector::Standard<half::f16>,
-        >,
-    ) -> Result<Box<dyn MaxSimKernel<half::f16>>, diskann_quantization::multi_vector::NotSupported>
-    {
-        build_max_sim_f16(isa, query, BoxErase)
-    }
-}
-
-fn run_benchmark<T: ElementType>(input: &MultiVectorOp) -> anyhow::Result<Vec<RunResult>>
+impl<T> Benchmark for Kernel<T>
 where
+    T: MaxSimElement + AsDataType,
     StandardUniform: Distribution<T>,
 {
-    let mut results = Vec::with_capacity(input.runs.len());
-    for run in input.runs.iter() {
-        let data = Data::<T>::new(run);
-        let kernel = T::build(input.isa.into(), data.queries.as_view())?;
-        let dist = BoxedKernel(kernel);
-        results.push(run_with_distance(run, data.docs.as_view(), &dist));
-    }
-    Ok(results)
-}
-
-// ─────────────────────────────────────────────────────────────────────────
-//  Benchmark + Regression impls.
-// ─────────────────────────────────────────────────────────────────────────
+    type Input = MultiVectorOp;
+    type Output = Vec<RunResult>;
 
-macro_rules! impl_benchmark {
-    ($ty:ident, $T:ty) => {
-        impl Benchmark for $ty
-        where
-            StandardUniform: Distribution<$T>,
-        {
-            type Input = MultiVectorOp;
-            type Output = Vec<RunResult>;
-
-            fn try_match(&self, from: &MultiVectorOp) -> Result<MatchScore, FailureScore> {
-                crate::utils::match_data_type::<$T>(from.element_type)
-            }
-
-            fn run(
-                &self,
-                input: &MultiVectorOp,
-                _: Checkpoint<'_>,
-                mut output: &mut dyn Output,
-            ) -> anyhow::Result<Self::Output> {
-                writeln!(output, "{}", input)?;
-                let results = run_benchmark::<$T>(input)?;
-                writeln!(output, "\n\n{}", DisplayWrapper(&*results))?;
-                Ok(results)
-            }
+    fn try_match(&self, from: &MultiVectorOp) -> Result<MatchScore, FailureScore> {
+        crate::utils::match_data_type::<T>(from.element_type)
+    }
 
-            fn description(
-                &self,
-                f: &mut std::fmt::Formatter<'_>,
-                input: Option<&MultiVectorOp>,
-            ) -> std::fmt::Result {
-                match input {
-                    None => writeln!(f, "- Element Type: {}", <$T as AsDataType>::DATA_TYPE)?,
-                    Some(input) => {
-                        let desc = <$T as AsDataType>::describe(input.element_type);
-                        if !desc.is_match() {
-                            writeln!(f, "\n    - Mismatched element type: {}", desc)?;
-                        }
-                    }
-                }
-                Ok(())
-            }
+    fn run(
+        &self,
+        input: &MultiVectorOp,
+        _: Checkpoint<'_>,
+        mut output: &mut dyn Output,
+    ) -> anyhow::Result<Self::Output> {
+        writeln!(output, "{}", input)?;
+        let mut results = Vec::with_capacity(input.runs.len());
+        for run in input.runs.iter() {
+            let data = Data::<T>::new(run);
+            let kernel = build_max_sim::<T, _>(input.isa.into(), data.queries.as_view(), BoxErase)?;
+            let dist = BoxedKernel(kernel);
+            results.push(run_with_distance(run, data.docs.as_view(), &dist));
         }
+        writeln!(output, "\n\n{}", DisplayWrapper(&*results))?;
+        Ok(results)
+    }
 
-        impl Regression for $ty
-        where
-            StandardUniform: Distribution<$T>,
-        {
-            type Tolerances = MultiVectorTolerance;
-            type Pass = CheckResult;
-            type Fail = CheckResult;
-
-            fn check(
-                &self,
-                tolerance: &MultiVectorTolerance,
-                _input: &MultiVectorOp,
-                before: &Vec<RunResult>,
-                after: &Vec<RunResult>,
-            ) -> anyhow::Result<PassFail<CheckResult, CheckResult>> {
-                anyhow::ensure!(
-                    before.len() == after.len(),
-                    "before has {} runs but after has {}",
-                    before.len(),
-                    after.len(),
-                );
-
-                let mut passed = true;
-                let checks: Vec<Comparison> = std::iter::zip(before.iter(), after.iter())
-                    .enumerate()
-                    .map(|(i, (b, a))| {
-                        anyhow::ensure!(b.run == a.run, "run {i} mismatched");
-
-                        let computations_per_latency = b.computations_per_latency() as f64;
-                        let before_min =
-                            b.percentiles.minimum.as_f64() * 1000.0 / computations_per_latency;
-                        let after_min =
-                            a.percentiles.minimum.as_f64() * 1000.0 / computations_per_latency;
-
-                        let comparison = Comparison {
-                            run: b.run.clone(),
-                            tolerance: *tolerance,
-                            before_min,
-                            after_min,
-                        };
-
-                        match relative_change(before_min, after_min) {
-                            Ok(change) => {
-                                if change > tolerance.min_time_regression.get() {
-                                    passed = false;
-                                }
-                            }
-                            Err(_) => passed = false,
-                        };
-
-                        Ok(comparison)
-                    })
-                    .collect::<anyhow::Result<Vec<Comparison>>>()?;
-
-                Ok(if passed {
-                    PassFail::Pass(CheckResult { checks })
-                } else {
-                    PassFail::Fail(CheckResult { checks })
-                })
+    fn description(
+        &self,
+        f: &mut std::fmt::Formatter<'_>,
+        input: Option<&MultiVectorOp>,
+    ) -> std::fmt::Result {
+        match input {
+            None => writeln!(f, "- Element Type: {}", <T as AsDataType>::DATA_TYPE)?,
+            Some(input) => {
+                let desc = <T as AsDataType>::describe(input.element_type);
+                if !desc.is_match() {
+                    writeln!(f, "\n    - Mismatched element type: {}", desc)?;
+                }
             }
         }
-    };
+        Ok(())
+    }
 }
 
-impl_benchmark!(KernelF32, f32);
-impl_benchmark!(KernelF16, half::f16);
+impl<T> Regression for Kernel<T>
+where
+    T: MaxSimElement + AsDataType,
+    StandardUniform: Distribution<T>,
+{
+    type Tolerances = MultiVectorTolerance;
+    type Pass = CheckResult;
+    type Fail = CheckResult;
+
+    fn check(
+        &self,
+        tolerance: &MultiVectorTolerance,
+        _input: &MultiVectorOp,
+        before: &Vec<RunResult>,
+        after: &Vec<RunResult>,
+    ) -> anyhow::Result<PassFail<CheckResult, CheckResult>> {
+        anyhow::ensure!(
+            before.len() == after.len(),
+            "before has {} runs but after has {}",
+            before.len(),
+            after.len(),
+        );
+
+        let mut passed = true;
+        let checks: Vec<Comparison> = std::iter::zip(before.iter(), after.iter())
+            .enumerate()
+            .map(|(i, (b, a))| {
+                anyhow::ensure!(b.run == a.run, "run {i} mismatched");
+
+                let computations_per_latency = b.computations_per_latency() as f64;
+                let before_min = b.percentiles.minimum.as_f64() * 1000.0 / computations_per_latency;
+                let after_min = a.percentiles.minimum.as_f64() * 1000.0 / computations_per_latency;
+
+                let comparison = Comparison {
+                    run: b.run.clone(),
+                    tolerance: *tolerance,
+                    before_min,
+                    after_min,
+                };
+
+                match relative_change(before_min, after_min) {
+                    Ok(change) => {
+                        if change > tolerance.min_time_regression.get() {
+                            passed = false;
+                        }
+                    }
+                    Err(_) => passed = false,
+                };
+
+                Ok(comparison)
+            })
+            .collect::<anyhow::Result<Vec<Comparison>>>()?;
+
+        Ok(if passed {
+            PassFail::Pass(CheckResult { checks })
+        } else {
+            PassFail::Fail(CheckResult { checks })
+        })
+    }
+}
 
 // ─────────────────────────────────────────────────────────────────────────
 //  Registration.
 // ─────────────────────────────────────────────────────────────────────────
 
 pub(super) fn register(registry: &mut Registry) -> anyhow::Result<()> {
-    registry.register_regression(<f32 as ElementType>::ENTRY_NAME, KernelF32)?;
-    registry.register_regression(<half::f16 as ElementType>::ENTRY_NAME, KernelF16)?;
+    registry.register_regression("multi-vector-op-f32", Kernel::<f32>::new())?;
+    registry.register_regression("multi-vector-op-f16", Kernel::<half::f16>::new())?;
     Ok(())
 }
diff --git a/diskann-benchmark/src/backend/multi_vector/mod.rs b/diskann-benchmark/src/backend/multi_vector/mod.rs
index c3ffffeaf..2cbb2d9a6 100644
--- a/diskann-benchmark/src/backend/multi_vector/mod.rs
+++ b/diskann-benchmark/src/backend/multi_vector/mod.rs
@@ -5,15 +5,56 @@
 
 //! Multi-vector MaxSim distance benchmarks with regression detection.
 //!
-//! Registers one `Benchmark` entry per supported element type; the JSON `isa`
-//! field selects the kernel at run time via the library's
-//! [`build_max_sim_f32`] / [`build_max_sim_f16`] factories.
+//! Registers one `Benchmark` entry per supported element type; the JSON
+//! `isa` field selects the kernel at run time via the library's
+//! [`build_max_sim`] factory. The set of accepted element types is gated by
+//! the sealed [`MaxSimElement`] trait.
 //!
-//! See [`README.md`](./README.md) for the in-tree workflow when authoring a new
-//! experimental kernel.
+//! # Adding a new in-tree experimental kernel
 //!
-//! [`build_max_sim_f32`]: diskann_quantization::multi_vector::build_max_sim_f32
-//! [`build_max_sim_f16`]: diskann_quantization::multi_vector::build_max_sim_f16
+//! 1. **Library: variant + dispatch arm.** In
+//!    `diskann_quantization::multi_vector::distance`:
+//!    - Add a new variant to [`MaxSimIsa`] (in `isa.rs`).
+//!    - Implement [`MaxSimKernel<T>`] for your kernel struct (in
+//!      `factory.rs`, next to `Prepared` and `ReferenceKernel`).
+//!    - Add a matching arm to the [`MaxSimElement::build`] impl for each
+//!      element type your kernel supports — the arm constructs your kernel
+//!      and hands it to `erase.erase(...)`.
+//!
+//! 2. **Benchmark: matching shadow variant.** In
+//!    [`crate::inputs::multi_vector`]:
+//!    - Add the same variant to [`BenchIsa`].
+//!    - Add the matching arm to `From<BenchIsa> for MaxSimIsa`.
+//!
+//! 3. **Run.** Set `"isa": "your-variant"` in the JSON job; the existing
+//!    `Kernel<T>` benchmark entries (registered once per element type)
+//!    handle the rest. No new `Benchmark` registration required.
+//!
+//! # Why two enums?
+//!
+//! [`MaxSimIsa`] (library) and [`BenchIsa`] (benchmark) are kept separate so
+//! the library doesn't pin its public API on a serde version or a particular
+//! JSON shape. The benchmark owns its kebab-case JSON layout; the library is
+//! serde-agnostic. Mirroring variant-for-variant is intentional — a small
+//! price for keeping the library boundary clean.
+//!
+//! # Background
+//!
+//! The factory follows the BYOTE ("Bring your own type erasure") pattern
+//! described in [RFC #1068]. If you want your kernel packaged as something
+//! other than `Box<dyn MaxSimKernel<T>>` (e.g. composed with chamfer
+//! summing, or wrapped in a custom thin trait), implement your own
+//! [`Erase<T>`] and pass it to the factory in place of [`BoxErase`].
+//!
+//! [`build_max_sim`]: diskann_quantization::multi_vector::build_max_sim
+//! [`MaxSimIsa`]: diskann_quantization::multi_vector::MaxSimIsa
+//! [`MaxSimElement`]: diskann_quantization::multi_vector::MaxSimElement
+//! [`MaxSimElement::build`]: diskann_quantization::multi_vector::MaxSimElement::build
+//! [`MaxSimKernel<T>`]: diskann_quantization::multi_vector::MaxSimKernel
+//! [`Erase<T>`]: diskann_quantization::multi_vector::Erase
+//! [`BoxErase`]: diskann_quantization::multi_vector::BoxErase
+//! [`BenchIsa`]: crate::inputs::multi_vector::BenchIsa
+//! [RFC #1068]: https://github.com/microsoft/DiskANN/pull/1068
 
 use diskann_benchmark_runner::Registry;
 
@@ -47,7 +88,7 @@ mod tests {
     };
 
     use super::driver::{CheckResult, Comparison, MultiVectorTolerance, RunResult};
-    use super::kernels::KernelF32;
+    use super::kernels::Kernel;
     use crate::inputs::multi_vector::{BenchIsa, MultiVectorOp, Run};
 
     fn tiny_run() -> Run {
@@ -86,7 +127,7 @@ mod tests {
 
     #[test]
     fn check_rejects_mismatched_runs() {
-        let kernel = KernelF32;
+        let kernel = Kernel::<f32>::new();
 
         // Build a result whose `run` diverges from `tiny_run()` so the
         // regression check's `b.run == a.run` invariant fires.
@@ -115,7 +156,7 @@ mod tests {
 
     #[test]
     fn check_allows_negative_relative_change() {
-        let kernel = KernelF32;
+        let kernel = Kernel::<f32>::new();
 
         let result = kernel
             .check(
@@ -131,7 +172,7 @@ mod tests {
 
     #[test]
     fn check_passes_on_tolerance_boundary() {
-        let kernel = KernelF32;
+        let kernel = Kernel::<f32>::new();
 
         let result = kernel
             .check(
@@ -147,7 +188,7 @@ mod tests {
 
     #[test]
     fn check_fails_above_tolerance_boundary() {
-        let kernel = KernelF32;
+        let kernel = Kernel::<f32>::new();
 
         let result = kernel
             .check(
@@ -186,7 +227,7 @@ mod tests {
     /// require at least a non-zero value.
     #[test]
     fn zero_values_rejected() {
-        let kernel = KernelF32;
+        let kernel = Kernel::<f32>::new();
 
         let result = kernel
             .check(
@@ -199,4 +240,45 @@ mod tests {
 
         assert!(matches!(result, PassFail::Fail(_)));
     }
+
+    //////////////////////
+    // BoxedKernel      //
+    //////////////////////
+    //
+    // The library's `MaxSimKernel<T>` trait makes no zero-doc / size-assert
+    // guarantees — those contracts live on the `BoxedKernel<T>` wrapper in
+    // `driver.rs`. The tests below pin that wrapper's behaviour.
+
+    use super::driver::{BoxedKernel, Distance};
+    use diskann_quantization::multi_vector::{
+        build_max_sim, BoxErase, MatRef as LibMatRef, MaxSimIsa, Standard as LibStandard,
+    };
+
+    fn boxed_kernel_f32_two_rows() -> BoxedKernel<f32> {
+        let data = [1.0f32, 0.0, 0.0, 1.0];
+        let query = LibMatRef::new(LibStandard::new(2, 2).unwrap(), data.as_slice()).unwrap();
+        BoxedKernel(build_max_sim::<f32, _>(MaxSimIsa::Auto, query, BoxErase).unwrap())
+    }
+
+    #[test]
+    fn boxed_kernel_max_sim_with_zero_docs_leaves_scores_untouched() {
+        let kernel = boxed_kernel_f32_two_rows();
+        let empty: [f32; 0] = [];
+        let doc = LibMatRef::new(LibStandard::new(0, 2).unwrap(), empty.as_slice()).unwrap();
+        let mut scores = vec![42.0f32; 2]; // Non-zero sentinel: a zero-fill must not pass as "untouched".
+        kernel.max_sim(doc, &mut scores);
+        for &s in &scores {
+            assert_eq!(s, 42.0, "zero-doc max_sim should leave scores untouched");
+        }
+    }
+
+    #[test]
+    #[should_panic(expected = "scores buffer not right size")]
+    fn boxed_kernel_max_sim_panics_on_size_mismatch() {
+        let kernel = boxed_kernel_f32_two_rows();
+        let doc_data = [1.0f32, 1.0];
+        let doc = LibMatRef::new(LibStandard::new(1, 2).unwrap(), doc_data.as_slice()).unwrap();
+        let mut scores = vec![0.0f32; 3]; // Wrong size: 3 vs kernel's nrows() = 2.
+        kernel.max_sim(doc, &mut scores);
+    }
 }
diff --git a/diskann-quantization/src/multi_vector/distance/factory.rs b/diskann-quantization/src/multi_vector/distance/factory.rs
index 78d15273e..0bfe82fc1 100644
--- a/diskann-quantization/src/multi_vector/distance/factory.rs
+++ b/diskann-quantization/src/multi_vector/distance/factory.rs
@@ -2,8 +2,8 @@
 // Licensed under the MIT license.
 
 //! Factory + concrete `MaxSimKernel<T>` implementations for the multi-vector
-//! distance API. See [`build_max_sim_f32`] / [`build_max_sim_f16`] for the
-//! BYOTE entry points.
+//! distance API. See [`build_max_sim`] for the BYOTE entry point and
+//! [`MaxSimElement`] for the sealed trait that gates accepted element types.
 
 use diskann_utils::Reborrow;
 use diskann_vector::distance::InnerProduct;
@@ -101,7 +101,7 @@ where
 // ─────────────────────────────────────────────────────────────────────────
 
 /// `MaxSimIsa::Reference` path. Owns the query as a `Mat<Standard<T>>` and
-/// delegates to the existing `MaxSim` fallback per `compute_max_sim` call.
+/// delegates to [`MaxSim`] per `compute_max_sim` call.
 struct ReferenceKernel<T: Copy> {
     query: Mat<Standard<T>>,
 }
@@ -153,9 +153,10 @@ where
 //  BuildAndErase<E> — Target1 impls used by `dispatch1_no_features` (Auto).
 // ─────────────────────────────────────────────────────────────────────────
 
-/// Internal Target1 carrier used only by the `MaxSimIsa::Auto` arm of
-/// `build_max_sim_*`. `dispatch1_no_features` picks the highest available
-/// arch on the host CPU and calls the matching `Target1::run` below.
+/// Internal `Target1` carrier used by the `MaxSimIsa::Auto` arm of
+/// [`MaxSimElement::build`]. `dispatch1_no_features` picks the highest
+/// available arch on the host CPU and calls the matching `Target1::run`
+/// below.
 struct BuildAndErase<E>(E);
 
 // ───── f32 Target1 impls ─────
@@ -184,7 +185,7 @@ impl<E: Erase<f32>> diskann_wide::arch::Target1<V4, E::Output, MatRef<'_, Standa
     for BuildAndErase<E>
 {
     fn run(self, arch: V4, query: MatRef<'_, Standard<f32>>) -> E::Output {
-        // V4 has no dedicated kernel yet; retarget to V3.
+        // V4 dispatches to V3 (no V4-specific kernel).
         let arch = arch.retarget();
         let prepared = BlockTransposed::<f32, 16>::from_matrix_view(query.as_matrix_view());
         self.0.erase(Prepared { arch, prepared })
@@ -196,7 +197,7 @@ impl<E: Erase<f32>> diskann_wide::arch::Target1<Neon, E::Output, MatRef<'_, Stan
     for BuildAndErase<E>
 {
     fn run(self, arch: Neon, query: MatRef<'_, Standard<f32>>) -> E::Output {
-        // Neon has no dedicated kernel yet; retarget to Scalar.
+        // Neon dispatches to Scalar (no Neon-specific kernel).
         let arch = arch.retarget();
         let prepared = BlockTransposed::<f32, 8>::from_matrix_view(query.as_matrix_view());
         self.0.erase(Prepared { arch, prepared })
@@ -232,6 +233,7 @@ impl<E: Erase<half::f16>>
     for BuildAndErase<E>
 {
     fn run(self, arch: V4, query: MatRef<'_, Standard<half::f16>>) -> E::Output {
+        // V4 dispatches to V3 (no V4-specific kernel).
         let arch = arch.retarget();
         let prepared = BlockTransposed::<half::f16, 16>::from_matrix_view(query.as_matrix_view());
         self.0.erase(Prepared { arch, prepared })
@@ -244,6 +246,7 @@ impl<E: Erase<half::f16>>
     for BuildAndErase<E>
 {
     fn run(self, arch: Neon, query: MatRef<'_, Standard<half::f16>>) -> E::Output {
+        // Neon dispatches to Scalar (no Neon-specific kernel).
         let arch = arch.retarget();
         let prepared = BlockTransposed::<half::f16, 8>::from_matrix_view(query.as_matrix_view());
         self.0.erase(Prepared { arch, prepared })
@@ -251,110 +254,292 @@ impl<E: Erase<half::f16>>
 }
 
 // ─────────────────────────────────────────────────────────────────────────
-//  Factory functions.
+//  MaxSimElement — sealed trait gating accepted element types.
 // ─────────────────────────────────────────────────────────────────────────
 
-/// Build a multi-vector MaxSim kernel for `f32` queries.
+mod sealed {
+    pub trait Sealed {}
+}
+
+/// Scalar element types accepted by the multi-vector MaxSim factory.
 ///
-/// Dispatches on `isa`, constructs the corresponding concrete kernel, and
-/// hands it to `erase.erase(...)`. Returns [`NotSupported`] when the requested
-/// ISA cannot run on this build (e.g. AVX-512 unavailable; aarch64 on x86_64).
-pub fn build_max_sim_f32<E: Erase<f32>>(
-    isa: MaxSimIsa,
-    query: MatRef<'_, Standard<f32>>,
-    erase: E,
-) -> Result<E::Output, NotSupported> {
-    match isa {
-        MaxSimIsa::Auto => Ok(diskann_wide::arch::dispatch1_no_features(
-            BuildAndErase(erase),
-            query,
-        )),
-        MaxSimIsa::Scalar => Ok(Scalar::new().run1(BuildAndErase(erase), query)),
-        #[cfg(target_arch = "x86_64")]
-        MaxSimIsa::X86_64_V3 => {
-            let arch = V3::new_checked().ok_or(NotSupported {
+/// Sealed: external crates cannot add impls. The library ships impls for
+/// `f32` and `half::f16`. Quantized representations (PQ, SQ, packed sub-byte)
+/// do not fit this trait — they carry per-vector codebook/scale state and
+/// will get dedicated factory functions when they are added.
+pub trait MaxSimElement: sealed::Sealed + Sized + Copy + Send + Sync + 'static {
+    /// Build the concrete kernel for this element type and hand it to
+    /// `erase.erase(...)`. Returns [`NotSupported`] when the requested ISA is
+    /// unavailable (e.g. AVX-512 missing on the CPU; Neon requested on x86_64).
+    fn build<E: Erase<Self>>(
+        isa: MaxSimIsa,
+        query: MatRef<'_, Standard<Self>>,
+        erase: E,
+    ) -> Result<E::Output, NotSupported>;
+}
+
+impl sealed::Sealed for f32 {}
+impl sealed::Sealed for half::f16 {}
+
+impl MaxSimElement for f32 {
+    fn build<E: Erase<f32>>(
+        isa: MaxSimIsa,
+        query: MatRef<'_, Standard<f32>>,
+        erase: E,
+    ) -> Result<E::Output, NotSupported> {
+        match isa {
+            MaxSimIsa::Auto => Ok(diskann_wide::arch::dispatch1_no_features(
+                BuildAndErase(erase),
+                query,
+            )),
+            MaxSimIsa::Scalar => Ok(Scalar::new().run1(BuildAndErase(erase), query)),
+            #[cfg(target_arch = "x86_64")]
+            MaxSimIsa::X86_64_V3 => {
+                let arch = V3::new_checked().ok_or(NotSupported {
+                    isa,
+                    reason: "AVX2/FMA unavailable on this CPU",
+                })?;
+                Ok(arch.run1(BuildAndErase(erase), query))
+            }
+            #[cfg(target_arch = "x86_64")]
+            MaxSimIsa::X86_64_V4 => {
+                let arch = V4::new_checked().ok_or(NotSupported {
+                    isa,
+                    reason: "AVX-512 unavailable on this CPU",
+                })?;
+                Ok(arch.run1(BuildAndErase(erase), query))
+            }
+            #[cfg(not(target_arch = "x86_64"))]
+            MaxSimIsa::X86_64_V3 | MaxSimIsa::X86_64_V4 => Err(NotSupported {
                 isa,
-                reason: "AVX2/FMA unavailable on this CPU",
-            })?;
-            Ok(arch.run1(BuildAndErase(erase), query))
-        }
-        #[cfg(target_arch = "x86_64")]
-        MaxSimIsa::X86_64_V4 => {
-            let arch = V4::new_checked().ok_or(NotSupported {
+                reason: "x86_64 target only",
+            }),
+            #[cfg(target_arch = "aarch64")]
+            MaxSimIsa::Neon => {
+                let arch = Neon::new_checked().ok_or(NotSupported {
+                    isa,
+                    reason: "Neon unavailable on this CPU",
+                })?;
+                Ok(arch.run1(BuildAndErase(erase), query))
+            }
+            #[cfg(not(target_arch = "aarch64"))]
+            MaxSimIsa::Neon => Err(NotSupported {
                 isa,
-                reason: "AVX-512 unavailable on this CPU",
-            })?;
-            Ok(arch.run1(BuildAndErase(erase), query))
+                reason: "aarch64 target only",
+            }),
+            MaxSimIsa::Reference => Ok(erase.erase(ReferenceKernel::<f32>::new(query))),
         }
-        #[cfg(not(target_arch = "x86_64"))]
-        MaxSimIsa::X86_64_V3 | MaxSimIsa::X86_64_V4 => Err(NotSupported {
-            isa,
-            reason: "x86_64 target only",
-        }),
-        #[cfg(target_arch = "aarch64")]
-        MaxSimIsa::Neon => {
-            let arch = Neon::new_checked().ok_or(NotSupported {
+    }
+}
+
+impl MaxSimElement for half::f16 {
+    fn build<E: Erase<half::f16>>(
+        isa: MaxSimIsa,
+        query: MatRef<'_, Standard<half::f16>>,
+        erase: E,
+    ) -> Result<E::Output, NotSupported> {
+        match isa {
+            MaxSimIsa::Auto => Ok(diskann_wide::arch::dispatch1_no_features(
+                BuildAndErase(erase),
+                query,
+            )),
+            MaxSimIsa::Scalar => Ok(Scalar::new().run1(BuildAndErase(erase), query)),
+            #[cfg(target_arch = "x86_64")]
+            MaxSimIsa::X86_64_V3 => {
+                let arch = V3::new_checked().ok_or(NotSupported {
+                    isa,
+                    reason: "AVX2/FMA unavailable on this CPU",
+                })?;
+                Ok(arch.run1(BuildAndErase(erase), query))
+            }
+            #[cfg(target_arch = "x86_64")]
+            MaxSimIsa::X86_64_V4 => {
+                let arch = V4::new_checked().ok_or(NotSupported {
+                    isa,
+                    reason: "AVX-512 unavailable on this CPU",
+                })?;
+                Ok(arch.run1(BuildAndErase(erase), query))
+            }
+            #[cfg(not(target_arch = "x86_64"))]
+            MaxSimIsa::X86_64_V3 | MaxSimIsa::X86_64_V4 => Err(NotSupported {
+                isa,
+                reason: "x86_64 target only",
+            }),
+            #[cfg(target_arch = "aarch64")]
+            MaxSimIsa::Neon => {
+                let arch = Neon::new_checked().ok_or(NotSupported {
+                    isa,
+                    reason: "Neon unavailable on this CPU",
+                })?;
+                Ok(arch.run1(BuildAndErase(erase), query))
+            }
+            #[cfg(not(target_arch = "aarch64"))]
+            MaxSimIsa::Neon => Err(NotSupported {
                 isa,
-                reason: "Neon unavailable on this CPU",
-            })?;
-            Ok(arch.run1(BuildAndErase(erase), query))
+                reason: "aarch64 target only",
+            }),
+            MaxSimIsa::Reference => Ok(erase.erase(ReferenceKernel::<half::f16>::new(query))),
         }
-        #[cfg(not(target_arch = "aarch64"))]
-        MaxSimIsa::Neon => Err(NotSupported {
-            isa,
-            reason: "aarch64 target only",
-        }),
-        MaxSimIsa::Reference => Ok(erase.erase(ReferenceKernel::<f32>::new(query))),
     }
 }
 
-/// Build a multi-vector MaxSim kernel for `half::f16` queries. Same contract
-/// as [`build_max_sim_f32`].
-pub fn build_max_sim_f16<E: Erase<half::f16>>(
+// ─────────────────────────────────────────────────────────────────────────
+//  Factory entry point.
+// ─────────────────────────────────────────────────────────────────────────
+
+/// Build a multi-vector MaxSim kernel for any [`MaxSimElement`] type.
+///
+/// Thin wrapper over [`MaxSimElement::build`] — exists so generic callers can
+/// write `build_max_sim::<T, _>(isa, query, erase)` without naming the trait
+/// at the call site. Returns [`NotSupported`] when the requested ISA is
+/// unavailable (e.g. AVX-512 missing on the CPU; Neon requested on x86_64).
+pub fn build_max_sim<T: MaxSimElement, E: Erase<T>>(
     isa: MaxSimIsa,
-    query: MatRef<'_, Standard<half::f16>>,
+    query: MatRef<'_, Standard<T>>,
     erase: E,
 ) -> Result<E::Output, NotSupported> {
-    match isa {
-        MaxSimIsa::Auto => Ok(diskann_wide::arch::dispatch1_no_features(
-            BuildAndErase(erase),
-            query,
-        )),
-        MaxSimIsa::Scalar => Ok(Scalar::new().run1(BuildAndErase(erase), query)),
-        #[cfg(target_arch = "x86_64")]
-        MaxSimIsa::X86_64_V3 => {
-            let arch = V3::new_checked().ok_or(NotSupported {
-                isa,
-                reason: "AVX2/FMA unavailable on this CPU",
-            })?;
-            Ok(arch.run1(BuildAndErase(erase), query))
+    T::build(isa, query, erase)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::multi_vector::{BoxErase, Chamfer, MaxSim, QueryMatRef};
+
+    /// Local helper trait — picks a sane test value of `T` from an `f32`
+    /// so both `f32` and `half::f16` parameterizations share the same data
+    /// generator.
+    trait FromF32 {
+        fn from_f32(v: f32) -> Self;
+    }
+
+    impl FromF32 for f32 {
+        fn from_f32(v: f32) -> Self {
+            v
         }
-        #[cfg(target_arch = "x86_64")]
-        MaxSimIsa::X86_64_V4 => {
-            let arch = V4::new_checked().ok_or(NotSupported {
-                isa,
-                reason: "AVX-512 unavailable on this CPU",
-            })?;
-            Ok(arch.run1(BuildAndErase(erase), query))
+    }
+
+    impl FromF32 for half::f16 {
+        fn from_f32(v: f32) -> Self {
+            diskann_wide::cast_f32_to_f16(v)
         }
-        #[cfg(not(target_arch = "x86_64"))]
-        MaxSimIsa::X86_64_V3 | MaxSimIsa::X86_64_V4 => Err(NotSupported {
-            isa,
-            reason: "x86_64 target only",
-        }),
-        #[cfg(target_arch = "aarch64")]
-        MaxSimIsa::Neon => {
-            let arch = Neon::new_checked().ok_or(NotSupported {
-                isa,
-                reason: "Neon unavailable on this CPU",
-            })?;
-            Ok(arch.run1(BuildAndErase(erase), query))
+    }
+
+    fn make_mat<T: Copy>(data: &[T], nrows: usize, ncols: usize) -> MatRef<'_, Standard<T>> {
+        MatRef::new(Standard::new(nrows, ncols).unwrap(), data).unwrap()
+    }
+
+    fn make_test_data<T: FromF32>(len: usize, ceil: usize, shift: usize) -> Vec<T> {
+        // Element `i` is `(i + shift) % ceil`, converted to `T` through `f32`.
+        let value_at = |i: usize| T::from_f32(((i + shift) % ceil) as f32);
+        (0..len).map(value_at).collect()
+    }
+
+    /// Shapes for the `chamfer_matches_fallback` / `max_sim_matches_fallback`
+    /// agreement checks: `(num_queries, num_docs, dim)`.
+    ///
+    /// Targets the factory wiring (query setup, score writeback) above the
+    /// kernel layer; exhaustive panel/remainder coverage is pinned in
+    /// `kernels::tiled_reduce::tests`.
+    const TEST_CASES: &[(usize, usize, usize)] = &[
+        (1, 1, 4),   // Degenerate
+        (5, 3, 5),   // Prime k; nq > 1 and nd > 1 exercise per-row writeback
+        (17, 4, 64), // A-panel remainder crossing both Scalar and V3 panel widths
+        (16, 6, 32), // B-remainder ≠ 1 (V3 b_remainder = 2)
+    ];
+
+    fn check_chamfer_matches<T>(tol: f32, label: &str)
+    where
+        T: MaxSimElement + FromF32,
+        InnerProduct: for<'a, 'b> PureDistanceFunction<&'a [T], &'b [T], f32>,
+    {
+        for &(nq, nd, dim) in TEST_CASES {
+            let query_data = make_test_data::<T>(nq * dim, dim, dim / 2);
+            let doc_data = make_test_data::<T>(nd * dim, dim, dim);
+
+            let query = make_mat(&query_data, nq, dim);
+            let doc = make_mat(&doc_data, nd, dim);
+
+            let expected = Chamfer::evaluate(QueryMatRef::from(query), doc);
+
+            let kernel = build_max_sim::<T, _>(MaxSimIsa::Auto, query, BoxErase).unwrap();
+            let mut scores = vec![0.0f32; nq];
+            kernel.compute_max_sim(doc, &mut scores);
+            let actual: f32 = scores.iter().sum();
+
+            assert!(
+                (actual - expected).abs() < tol,
+                "{label}Chamfer mismatch for ({nq},{nd},{dim}): actual={actual}, expected={expected}",
+            );
+        }
+    }
+
+    fn check_max_sim_matches<T>(tol: f32, label: &str)
+    where
+        T: MaxSimElement + FromF32,
+        InnerProduct: for<'a, 'b> PureDistanceFunction<&'a [T], &'b [T], f32>,
+    {
+        for &(nq, nd, dim) in TEST_CASES {
+            let query_data = make_test_data::<T>(nq * dim, dim, dim / 2);
+            let doc_data = make_test_data::<T>(nd * dim, dim, dim);
+
+            let query = make_mat(&query_data, nq, dim);
+            let doc = make_mat(&doc_data, nd, dim);
+
+            let mut expected_scores = vec![0.0f32; nq];
+            let _ = MaxSim::new(&mut expected_scores)
+                .unwrap()
+                .evaluate(QueryMatRef::from(query), doc);
+
+            let kernel = build_max_sim::<T, _>(MaxSimIsa::Auto, query, BoxErase).unwrap();
+            let mut actual_scores = vec![0.0f32; nq];
+            kernel.compute_max_sim(doc, &mut actual_scores);
+
+            for i in 0..nq {
+                assert!(
+                    (actual_scores[i] - expected_scores[i]).abs() < tol,
+                    "{label}MaxSim[{i}] mismatch for ({nq},{nd},{dim}): actual={}, expected={}",
+                    actual_scores[i],
+                    expected_scores[i],
+                );
+            }
         }
-        #[cfg(not(target_arch = "aarch64"))]
-        MaxSimIsa::Neon => Err(NotSupported {
-            isa,
-            reason: "aarch64 target only",
-        }),
-        MaxSimIsa::Reference => Ok(erase.erase(ReferenceKernel::<half::f16>::new(query))),
     }
+
+    #[test]
+    fn dimensions_f32() {
+        let data = vec![1.0f32; 5 * 8];
+        let query = make_mat(&data, 5, 8);
+        let kernel = build_max_sim::<f32, _>(MaxSimIsa::Auto, query, BoxErase).unwrap();
+        assert_eq!(kernel.nrows(), 5);
+    }
+
+    #[test]
+    fn dimensions_f16() {
+        let data = vec![diskann_wide::cast_f32_to_f16(1.0); 5 * 8];
+        let query = make_mat(data.as_slice(), 5, 8);
+        let kernel = build_max_sim::<half::f16, _>(MaxSimIsa::Auto, query, BoxErase).unwrap();
+        assert_eq!(kernel.nrows(), 5);
+    }
+
+    macro_rules! test_matches_fallback {
+        ($mod_name:ident, $ty:ty, $tol:expr, $label:literal) => {
+            mod $mod_name {
+                use super::*;
+
+                #[test]
+                fn chamfer_matches_fallback() {
+                    check_chamfer_matches::<$ty>($tol, $label);
+                }
+
+                #[test]
+                fn max_sim_matches_fallback() {
+                    check_max_sim_matches::<$ty>($tol, $label);
+                }
+            }
+        };
+    }
+
+    test_matches_fallback!(f32, f32, 1e-10, "f32 ");
+    test_matches_fallback!(f16, half::f16, 1e-10, "f16 ");
 }
diff --git a/diskann-quantization/src/multi_vector/distance/isa.rs b/diskann-quantization/src/multi_vector/distance/isa.rs
index 49768bc48..d4495dd55 100644
--- a/diskann-quantization/src/multi_vector/distance/isa.rs
+++ b/diskann-quantization/src/multi_vector/distance/isa.rs
@@ -44,9 +44,9 @@ impl std::fmt::Display for MaxSimIsa {
     }
 }
 
-/// Returned by `build_max_sim_*` when the requested ISA cannot be produced on
-/// the current host (e.g. x86_64 V4 requested on a non-AVX512 CPU, or Neon
-/// requested on x86_64).
+/// Returned by [`build_max_sim`](super::build_max_sim) when the requested
+/// ISA cannot be produced on the current host (e.g. x86_64 V4 requested on
+/// a non-AVX512 CPU, or Neon requested on x86_64).
 #[derive(Debug, Clone, Copy)]
 pub struct NotSupported {
     pub isa: MaxSimIsa,
diff --git a/diskann-quantization/src/multi_vector/distance/mod.rs b/diskann-quantization/src/multi_vector/distance/mod.rs
index 9afb070c5..d4bc2725d 100644
--- a/diskann-quantization/src/multi_vector/distance/mod.rs
+++ b/diskann-quantization/src/multi_vector/distance/mod.rs
@@ -8,12 +8,14 @@
 //! - [`MaxSim`]: per-query-vector maximum similarities.
 //! - [`Chamfer`]: sum of MaxSim scores (asymmetric Chamfer distance).
 //! - [`MaxSimKernel`]: object-safe interface implemented by every concrete
-//!   kernel constructed through [`build_max_sim_f32`] / [`build_max_sim_f16`].
+//!   kernel constructed through [`build_max_sim`].
 //! - [`Erase`]: BYOTE visitor — caller decides how to type-erase the kernel.
+//! - [`MaxSimElement`]: sealed trait gating which element types the factory
+//!   accepts.
 //!
 //! The fallback path uses a double-loop kernel over
 //! [`InnerProduct`](diskann_vector::distance::InnerProduct). The factory
-//! functions return cache-tiled SIMD kernels selected by [`MaxSimIsa`].
+//! returns cache-tiled SIMD kernels selected by [`MaxSimIsa`].
 //!
 //! # Example
 //!
@@ -56,7 +58,7 @@ mod kernel;
 mod kernels;
 mod max_sim;
 
-pub use factory::{build_max_sim_f16, build_max_sim_f32};
+pub use factory::{MaxSimElement, build_max_sim};
 pub use fallback::QueryMatRef;
 pub use isa::{MaxSimIsa, NotSupported};
 pub use kernel::{BoxErase, Erase, MaxSimKernel};
diff --git a/diskann-quantization/src/multi_vector/mod.rs b/diskann-quantization/src/multi_vector/mod.rs
index d2ad0e7bc..edeca4ef0 100644
--- a/diskann-quantization/src/multi_vector/mod.rs
+++ b/diskann-quantization/src/multi_vector/mod.rs
@@ -22,7 +22,8 @@
 //! | [`QueryMatRef`] | Query wrapper for asymmetric distances |
 //! | [`MaxSim`] | Per-query-vector max similarity computation |
 //! | [`Chamfer`] | Asymmetric Chamfer distance (sum of MaxSim) |
-//! | [`MaxSimKernel`] | Object-safe kernel returned by [`build_max_sim_f32`] / [`build_max_sim_f16`] |
+//! | [`MaxSimKernel`] | Object-safe kernel returned by [`build_max_sim`] |
+//! | [`MaxSimElement`] | Sealed trait gating element types the factory accepts |
 //! | [`MaxSimIsa`] | ISA selector for the factory functions |
 //! | [`Erase`] | BYOTE visitor used by the factory |
 //!
@@ -75,8 +76,8 @@ pub(crate) mod matrix;
 
 pub use block_transposed::{BlockTransposed, BlockTransposedMut, BlockTransposedRef};
 pub use distance::{
-    BoxErase, Chamfer, Erase, MaxSim, MaxSimError, MaxSimIsa, MaxSimKernel, NotSupported,
-    QueryMatRef, build_max_sim_f16, build_max_sim_f32,
+    BoxErase, Chamfer, Erase, MaxSim, MaxSimElement, MaxSimError, MaxSimIsa, MaxSimKernel,
+    NotSupported, QueryMatRef, build_max_sim,
 };
 pub use matrix::{
     Defaulted, LayoutError, Mat, MatMut, MatRef, NewCloned, NewMut, NewOwned, NewRef, Overflow,

From 03d61197b127d11d21aea2228f2a26feca7d91b4 Mon Sep 17 00:00:00 2001
From: Suryansh Gupta <suryangupta@microsoft.com>
Date: Tue, 19 May 2026 23:52:00 +0530
Subject: [PATCH 13/13] Fix after main merge

---
 .../src/backend/multi_vector/driver.rs        | 27 ++++++++----------
 diskann-benchmark/src/inputs/multi_vector.rs  | 28 +++++++++----------
 2 files changed, 26 insertions(+), 29 deletions(-)

diff --git a/diskann-benchmark/src/backend/multi_vector/driver.rs b/diskann-benchmark/src/backend/multi_vector/driver.rs
index e59f24ac2..57446ae9b 100644
--- a/diskann-benchmark/src/backend/multi_vector/driver.rs
+++ b/diskann-benchmark/src/backend/multi_vector/driver.rs
@@ -16,7 +16,7 @@ use diskann_benchmark_runner::{
         num::{relative_change, NonNegativeFinite},
         percentiles, MicroSeconds,
     },
-    Any, CheckDeserialization, Checker, Input,
+    Checker, Input,
 };
 use diskann_quantization::multi_vector::{Mat, MatRef, MaxSimKernel, Standard};
 use rand::{
@@ -42,33 +42,30 @@ pub(super) struct MultiVectorTolerance {
     pub(super) min_time_regression: NonNegativeFinite,
 }
 
-impl CheckDeserialization for MultiVectorTolerance {
-    fn check_deserialization(&mut self, _checker: &mut Checker) -> Result<(), anyhow::Error> {
-        Ok(())
-    }
-}
-
 impl Input for MultiVectorTolerance {
+    type Raw = Self;
+
     fn tag() -> &'static str {
         "multi-vector-tolerance"
     }
 
-    fn try_deserialize(
-        serialized: &serde_json::Value,
-        checker: &mut Checker,
-    ) -> anyhow::Result<Any> {
-        checker.any(Self::deserialize(serialized)?)
+    fn from_raw(raw: Self::Raw, _checker: &mut Checker) -> anyhow::Result<Self> {
+        Ok(raw)
     }
 
-    fn example() -> anyhow::Result<serde_json::Value> {
+    fn serialize(&self) -> anyhow::Result<serde_json::Value> {
+        Ok(serde_json::to_value(self)?)
+    }
+
+    fn example() -> Self {
         const EXAMPLE: NonNegativeFinite = match NonNegativeFinite::new(0.05) {
             Ok(v) => v,
             Err(_) => panic!("use a non-negative finite please"),
         };
 
-        Ok(serde_json::to_value(MultiVectorTolerance {
+        MultiVectorTolerance {
             min_time_regression: EXAMPLE,
-        })?)
+        }
     }
 }
 
diff --git a/diskann-benchmark/src/inputs/multi_vector.rs b/diskann-benchmark/src/inputs/multi_vector.rs
index 9d863c13a..cbb1c255b 100644
--- a/diskann-benchmark/src/inputs/multi_vector.rs
+++ b/diskann-benchmark/src/inputs/multi_vector.rs
@@ -5,18 +5,10 @@
 
 use std::num::NonZeroUsize;
 
-use diskann_benchmark_runner::{utils::datatype::DataType, CheckDeserialization, Checker};
+use diskann_benchmark_runner::{utils::datatype::DataType, Checker, Input};
 use diskann_quantization::multi_vector::MaxSimIsa;
 use serde::{Deserialize, Serialize};
 
-use crate::inputs::{as_input, Example};
-
-//////////////
-// Registry //
-//////////////
-
-as_input!(MultiVectorOp);
-
 ////////////////
 // Enum types //
 ////////////////
@@ -100,13 +92,21 @@ impl MultiVectorOp {
     }
 }
 
-impl CheckDeserialization for MultiVectorOp {
-    fn check_deserialization(&mut self, _checker: &mut Checker) -> Result<(), anyhow::Error> {
-        Ok(())
+impl Input for MultiVectorOp {
+    type Raw = Self;
+
+    fn tag() -> &'static str {
+        Self::tag()
+    }
+
+    fn from_raw(raw: Self::Raw, _checker: &mut Checker) -> anyhow::Result<Self> {
+        Ok(raw)
+    }
+
+    fn serialize(&self) -> anyhow::Result<serde_json::Value> {
+        Ok(serde_json::to_value(self)?)
     }
-}
 
-impl Example for MultiVectorOp {
     fn example() -> Self {
         const NUM_DOC_VECTORS: NonZeroUsize = NonZeroUsize::new(64).unwrap();
         const DIM: NonZeroUsize = NonZeroUsize::new(128).unwrap();