From 77cc936ed8241b34f3baf75a0842ddc863d367ff Mon Sep 17 00:00:00 2001 From: Suryansh Gupta Date: Thu, 7 May 2026 01:18:55 +0530 Subject: [PATCH 01/13] Add benchmark crate for multi-vector --- Cargo.lock | 17 + Cargo.toml | 1 + diskann-benchmark-multi-vector/Cargo.toml | 30 + diskann-benchmark-multi-vector/README.md | 136 ++ .../examples/multi-vector.json | 70 + .../examples/test.json | 47 + .../examples/tolerance.json | 16 + diskann-benchmark-multi-vector/src/bin.rs | 96 + diskann-benchmark-multi-vector/src/lib.rs | 992 ++++++++ results.json | 2150 +++++++++++++++++ 10 files changed, 3555 insertions(+) create mode 100644 diskann-benchmark-multi-vector/Cargo.toml create mode 100644 diskann-benchmark-multi-vector/README.md create mode 100644 diskann-benchmark-multi-vector/examples/multi-vector.json create mode 100644 diskann-benchmark-multi-vector/examples/test.json create mode 100644 diskann-benchmark-multi-vector/examples/tolerance.json create mode 100644 diskann-benchmark-multi-vector/src/bin.rs create mode 100644 diskann-benchmark-multi-vector/src/lib.rs create mode 100644 results.json diff --git a/Cargo.lock b/Cargo.lock index beac316c4..fc0a7cc87 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -697,6 +697,23 @@ dependencies = [ "tokio", ] +[[package]] +name = "diskann-benchmark-multi-vector" +version = "0.50.1" +dependencies = [ + "anyhow", + "diskann-benchmark-runner", + "diskann-quantization", + "diskann-utils", + "diskann-vector", + "half", + "rand 0.9.4", + "serde", + "serde_json", + "tempfile", + "thiserror 2.0.17", +] + [[package]] name = "diskann-benchmark-runner" version = "0.50.1" diff --git a/Cargo.toml b/Cargo.toml index 6f31a1ae2..13fcbdd9c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,6 +21,7 @@ members = [ "diskann-benchmark-runner", "diskann-benchmark-core", "diskann-benchmark-simd", + "diskann-benchmark-multi-vector", "diskann-benchmark", "diskann-tools", "vectorset", diff --git a/diskann-benchmark-multi-vector/Cargo.toml 
b/diskann-benchmark-multi-vector/Cargo.toml new file mode 100644 index 000000000..f8eb937e1 --- /dev/null +++ b/diskann-benchmark-multi-vector/Cargo.toml @@ -0,0 +1,30 @@ +[package] +name = "diskann-benchmark-multi-vector" +version.workspace = true +description.workspace = true +authors.workspace = true +documentation.workspace = true +license.workspace = true +edition.workspace = true + +[[bin]] +name = "benchmark-multi-vector" +path = "src/bin.rs" + +[dependencies] +anyhow.workspace = true +diskann-utils = { workspace = true, default-features = false } +half = { workspace = true, features = ["rand_distr"] } +diskann-benchmark-runner = { workspace = true } +diskann-quantization = { workspace = true } +diskann-vector = { workspace = true } +rand.workspace = true +serde = { workspace = true, features = ["derive"] } +serde_json.workspace = true +thiserror.workspace = true + +[lints] +workspace = true + +[dev-dependencies] +tempfile.workspace = true diff --git a/diskann-benchmark-multi-vector/README.md b/diskann-benchmark-multi-vector/README.md new file mode 100644 index 000000000..014a393a1 --- /dev/null +++ b/diskann-benchmark-multi-vector/README.md @@ -0,0 +1,136 @@ +# diskann-benchmark-multi-vector + +Benchmarks and regression detection for the **multi-vector distance +operations** exposed by `diskann-quantization` — `Chamfer` and `MaxSim` — +across `f32` and `f16` element types. + +## Layout + +- `src/lib.rs` — benchmark library: input/tolerance schemas, kernel + dispatch, regression checker. +- `src/bin.rs` — `benchmark-multi-vector` CLI entry point. +- `examples/multi-vector.json` — full benchmark matrix covering both + operations across the registered kernels and a representative range of + shapes. +- `examples/test.json` — minimal smoke configuration consumed by the + integration tests. +- `examples/tolerance.json` — default regression thresholds. 
+ +## Registered kernels + +The crate registers four kernels — one per `(element_type, implementation)` +pair: + +| Tag | Element | Implementation | +| -------------------------------- | ------- | -------------------- | +| `multi-vector-op-f32-optimized` | `f32` | `QueryComputer` | +| `multi-vector-op-f16-optimized` | `f16` | `QueryComputer` | +| `multi-vector-op-f32-reference` | `f32` | `Chamfer` / `MaxSim` | +| `multi-vector-op-f16-reference` | `f16` | `Chamfer` / `MaxSim` | + +The **optimized** path constructs a `QueryComputer` once per shape (which +internally selects the best available SIMD kernel for the host) and calls +`chamfer` / `max_sim` inside the timed loop. The **reference** path drives +the `Chamfer` / `MaxSim` fallback used by the `multi_vector` unit tests — +useful both as a numerical ground truth and as a baseline to measure SIMD +speedups against. + +## Time normalization + +Per-measurement latency is normalized to **nanoseconds per inner-product +call**, abbreviated `ns/IP`, where `Q` is `num_query_vectors`, `D` is `num_doc_vectors`, and `Dim` is `dim`: + +``` +ns/IP = min_latency_µs * 1000 / (Q * D * loops_per_measurement) +``` + +Two important properties: + +- **Normalized for `Q`, `D`, and `loops_per_measurement`.** Reshaping the + benchmark or scaling the loop budget does not change the amount of work one unit of the metric represents, so any remaining differences come from + cache-residency effects and SIMD utilization, which show up directly. +- **Approximately linear in `Dim`.** Each inner-product call is itself an + O(`Dim`) operation, so `ns/IP` grows with `Dim` — that is why the table + headers read `ns/IP @ Dim`. Compare across rows with the same `Dim`; to + compare across different `Dim`s, divide further by `Dim` to recover ns + per scalar multiply. + +This is the right metric for the two things this crate cares about: +detecting per-shape regressions (the `Dim` factor cancels) and comparing +optimized vs. reference at a fixed shape. 
+ +## Usage + +All examples below assume you are inside the crate directory and use a +small shell function for brevity: + +```bash +bench() { cargo run --release -p diskann-benchmark-multi-vector --bin benchmark-multi-vector -- "$@"; } +``` + +### Run benchmarks + +`run` executes every job in the input file and writes per-measurement +latencies plus percentiles to the output file: + +```bash +bench run --input-file examples/multi-vector.json --output-file before.json +``` + +### Regression check workflow + +The check workflow is **two-phase**: validate the tolerance file once, then +compare two recorded result files. + +**Phase 1 — preflight.** No benchmarks are executed. The verifier confirms +that every entry in `tolerance.json` matches at least one job in the input +file, and that every job is matched by exactly one entry. Run it whenever +you edit `tolerance.json`: + +```bash +bench check verify \ + --tolerances examples/tolerance.json \ + --input-file examples/multi-vector.json +``` + +**Phase 2 — comparison.** Record results before and after a code change, +then compare. The command exits non-zero if any run regresses past its +tolerance: + +```bash +# On the baseline commit +bench run --input-file examples/multi-vector.json --output-file before.json + +# On the change commit +bench run --input-file examples/multi-vector.json --output-file after.json + +# Compare +bench check run \ + --tolerances examples/tolerance.json \ + --input-file examples/multi-vector.json \ + --before before.json --after after.json \ + --output-file checks.json +``` + +A run **fails** when its post-change `ns/IP` minimum exceeds the +baseline minimum by more than `min_time_regression` (`0.05` = 5% in the shipped `tolerance.json`). +Improvements (negative change) always pass. A run whose relative change cannot be computed is also reported as a failure. + +### How tolerances are matched to jobs + +Each entry in `tolerance.json` has the shape `{ input, tolerance }`. 
The +`input` block acts as a **partial template** against the jobs in the input +file: any field present must match; missing fields are wildcards. + +The shipped `tolerance.json` uses an empty `"content": {}`, which matches +every `multi-vector-op` job — so a single 5% threshold applies to all four +kernels. To apply different thresholds per implementation, add more +specific entries, e.g.: + +```json +{ "input": { "type": "multi-vector-op", "content": { "implementation": "reference" } }, + "tolerance": { "type": "multi-vector-tolerance", "content": { "min_time_regression": 0.10 } } } +``` + +`check verify` will reject the file if entries overlap or leave any job +unmatched. diff --git a/diskann-benchmark-multi-vector/examples/multi-vector.json b/diskann-benchmark-multi-vector/examples/multi-vector.json new file mode 100644 index 000000000..2626e5047 --- /dev/null +++ b/diskann-benchmark-multi-vector/examples/multi-vector.json @@ -0,0 +1,70 @@ +{ + "search_directories": [], + "jobs": [ + { + "type": "multi-vector-op", + "content": { + "element_type": "float32", + "implementation": "optimized", + "runs": [ + { "operation": "chamfer", "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 16, "num_doc_vectors": 64, "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 20, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 50, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 }, + { 
"operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 20 }, + { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 32, "dim": 512, "loops_per_measurement": 50, "num_measurements": 50 }, + + { "operation": "max_sim", "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 16, "num_doc_vectors": 64, "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 20, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 50, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 20 }, + { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 32, "dim": 512, "loops_per_measurement": 50, "num_measurements": 50 } + ] + } + }, + { + "type": "multi-vector-op", + "content": { + "element_type": "float16", + "implementation": "optimized", + "runs": [ + { "operation": "chamfer", "num_query_vectors": 16, "num_doc_vectors": 64, "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 
}, + { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 16, "num_doc_vectors": 64, "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 } + ] + } + }, + { + "type": "multi-vector-op", + "content": { + "element_type": "float32", + "implementation": "reference", + "runs": [ + { "operation": "chamfer", "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 50, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 2, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 50, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 2, "num_measurements": 50 } + ] + } + }, + { + "type": "multi-vector-op", + "content": { + "element_type": "float16", + "implementation": "reference", + "runs": [ + { "operation": "chamfer", "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 50, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 2, "num_measurements": 50 } + ] + } + } + ] +} diff --git a/diskann-benchmark-multi-vector/examples/test.json b/diskann-benchmark-multi-vector/examples/test.json new file mode 100644 index 000000000..28e9b9d64 --- /dev/null +++ b/diskann-benchmark-multi-vector/examples/test.json @@ -0,0 +1,47 @@ +{ + "search_directories": [], + "jobs": [ + { + "type": "multi-vector-op", + "content": { + "element_type": "float32", + "implementation": "optimized", + "runs": [ + { 
"operation": "chamfer", "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 2, "num_measurements": 1 }, + { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 2, "num_measurements": 1 } + ] + } + }, + { + "type": "multi-vector-op", + "content": { + "element_type": "float16", + "implementation": "optimized", + "runs": [ + { "operation": "chamfer", "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 2, "num_measurements": 1 } + ] + } + }, + { + "type": "multi-vector-op", + "content": { + "element_type": "float32", + "implementation": "reference", + "runs": [ + { "operation": "chamfer", "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 2, "num_measurements": 1 }, + { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 2, "num_measurements": 1 } + ] + } + }, + { + "type": "multi-vector-op", + "content": { + "element_type": "float16", + "implementation": "reference", + "runs": [ + { "operation": "max_sim", "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 2, "num_measurements": 1 } + ] + } + } + ] +} diff --git a/diskann-benchmark-multi-vector/examples/tolerance.json b/diskann-benchmark-multi-vector/examples/tolerance.json new file mode 100644 index 000000000..8d5997199 --- /dev/null +++ b/diskann-benchmark-multi-vector/examples/tolerance.json @@ -0,0 +1,16 @@ +{ + "checks": [ + { + "input": { + "type": "multi-vector-op", + "content": {} + }, + "tolerance": { + "type": "multi-vector-tolerance", + "content": { + "min_time_regression": 0.05 + } + } + } + ] +} diff --git a/diskann-benchmark-multi-vector/src/bin.rs b/diskann-benchmark-multi-vector/src/bin.rs new file mode 100644 index 000000000..d595533e7 --- /dev/null +++ b/diskann-benchmark-multi-vector/src/bin.rs @@ -0,0 +1,96 @@ +/* + * Copyright (c) Microsoft Corporation. 
+ * Licensed under the MIT license. + */ + +use diskann_benchmark_multi_vector::{register, MultiVectorOp}; +use diskann_benchmark_runner::{output, registry, App, Output}; + +pub fn main() -> anyhow::Result<()> { + // Create the pocket bench application. + let app = App::parse(); + main_inner(&app, &mut output::default()) +} + +fn main_inner(app: &App, output: &mut dyn Output) -> anyhow::Result<()> { + // Register inputs and benchmarks. + let mut inputs = registry::Inputs::new(); + inputs.register::()?; + + let mut benchmarks = registry::Benchmarks::new(); + register(&mut benchmarks); + + // Here we go! + app.run(&inputs, &benchmarks, output) +} + +/////////// +// Tests // +/////////// + +#[cfg(test)] +mod tests { + use super::*; + + use std::path::{Path, PathBuf}; + + use diskann_benchmark_runner::app::{Check, Commands}; + + fn run_integration_test(input_file: &Path, output_file: &Path) { + let commands = Commands::Run { + input_file: input_file.to_str().unwrap().into(), + output_file: output_file.to_str().unwrap().into(), + dry_run: false, + allow_debug: true, + }; + + let app = App::from_commands(commands); + + let mut output = output::Memory::new(); + main_inner(&app, &mut output).unwrap(); + println!( + "output = {}", + String::from_utf8(output.into_inner()).unwrap() + ); + + assert!(output_file.exists()); + } + + fn run_check_test(input_file: &Path, tolerances: &Path) -> String { + let commands = Commands::Check(Check::Verify { + tolerances: tolerances.to_str().unwrap().into(), + input_file: input_file.to_str().unwrap().into(), + }); + + let app = App::from_commands(commands); + + let mut output = output::Memory::new(); + main_inner(&app, &mut output).unwrap(); + String::from_utf8(output.into_inner()).unwrap() + } + + #[test] + fn integration_test() { + let input_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("examples") + .join("test.json"); + + let tempdir = tempfile::tempdir().unwrap(); + let output_path = tempdir.path().join("output.json"); + + 
run_integration_test(&input_path, &output_path); + } + + #[test] + fn check_verify() { + let input_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("examples") + .join("test.json"); + let tolerance_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("examples") + .join("tolerance.json"); + + let stdout = run_check_test(&input_path, &tolerance_path); + println!("stdout = {}", stdout); + } +} diff --git a/diskann-benchmark-multi-vector/src/lib.rs b/diskann-benchmark-multi-vector/src/lib.rs new file mode 100644 index 000000000..7cadf4f29 --- /dev/null +++ b/diskann-benchmark-multi-vector/src/lib.rs @@ -0,0 +1,992 @@ +/* + * Copyright (c) Microsoft Corporation. + * Licensed under the MIT license. + */ + +//! Multi-vector distance benchmarks with regression detection. + +use std::{io::Write, num::NonZeroUsize}; + +use diskann_quantization::multi_vector::{Chamfer, MatRef, MaxSim, QueryComputer, Standard}; +use diskann_vector::distance::InnerProduct; +use diskann_vector::{DistanceFunctionMut, PureDistanceFunction}; +use half::f16; +use rand::{ + distr::{Distribution, StandardUniform}, + rngs::StdRng, + SeedableRng, +}; +use serde::{Deserialize, Serialize}; +use thiserror::Error; + +use diskann_benchmark_runner::{ + benchmark::{PassFail, Regression}, + dispatcher::{Description, DispatchRule, FailureScore, MatchScore}, + utils::{ + datatype::{self, DataType}, + num::{relative_change, NonNegativeFinite}, + percentiles, MicroSeconds, + }, + Any, Benchmark, CheckDeserialization, Checker, Input, +}; + +//////////////// +// Public API // +//////////////// + +/// Register all multi-vector benchmarks with the runner's dispatcher. 
+pub fn register(dispatcher: &mut diskann_benchmark_runner::registry::Benchmarks) { + register_benchmarks_impl(dispatcher) +} + +/////////// +// Utils // +/////////// + +#[derive(Debug, Clone, Copy)] +struct DisplayWrapper<'a, T: ?Sized>(&'a T); + +impl std::ops::Deref for DisplayWrapper<'_, T> { + type Target = T; + fn deref(&self) -> &T { + self.0 + } +} + +//////////// +// Inputs // +//////////// + +/// The two distance operations exposed by [`QueryComputer`]. +#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum Operation { + Chamfer, + MaxSim, +} + +impl std::fmt::Display for Operation { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let st = match self { + Self::Chamfer => "chamfer", + Self::MaxSim => "max_sim", + }; + write!(f, "{}", st) + } +} + +/// Which implementation tier to benchmark. +#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "kebab-case")] +enum Implementation { + Optimized, + Reference, +} + +impl std::fmt::Display for Implementation { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let st = match self { + Self::Optimized => "optimized", + Self::Reference => "reference", + }; + write!(f, "{}", st) + } +} + +/// One benchmark configuration: a single (operation, shape) measurement. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +struct Run { + operation: Operation, + num_query_vectors: NonZeroUsize, + num_doc_vectors: NonZeroUsize, + dim: NonZeroUsize, + loops_per_measurement: NonZeroUsize, + num_measurements: NonZeroUsize, +} + +/// A complete multi-vector benchmark job. +#[derive(Debug, Serialize, Deserialize)] +pub struct MultiVectorOp { + element_type: DataType, + implementation: Implementation, + runs: Vec, +} + +impl CheckDeserialization for MultiVectorOp { + fn check_deserialization(&mut self, _checker: &mut Checker) -> Result<(), anyhow::Error> { + Ok(()) + } +} + +macro_rules! 
write_field { + ($f:ident, $field:tt, $($expr:tt)*) => { + writeln!($f, "{:>18}: {}", $field, $($expr)*) + } +} + +impl MultiVectorOp { + fn summarize_fields(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write_field!(f, "element type", self.element_type)?; + write_field!(f, "implementation", self.implementation)?; + write_field!(f, "number of runs", self.runs.len())?; + Ok(()) + } +} + +impl std::fmt::Display for MultiVectorOp { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + writeln!(f, "Multi-Vector Operation\n")?; + write_field!(f, "tag", Self::tag())?; + self.summarize_fields(f) + } +} + +impl Input for MultiVectorOp { + fn tag() -> &'static str { + "multi-vector-op" + } + + fn try_deserialize( + serialized: &serde_json::Value, + checker: &mut Checker, + ) -> anyhow::Result { + checker.any(Self::deserialize(serialized)?) + } + + fn example() -> anyhow::Result { + const NUM_QUERY_VECTORS: NonZeroUsize = NonZeroUsize::new(32).unwrap(); + const NUM_DOC_VECTORS: NonZeroUsize = NonZeroUsize::new(64).unwrap(); + const DIM: NonZeroUsize = NonZeroUsize::new(128).unwrap(); + const LOOPS_PER_MEASUREMENT: NonZeroUsize = NonZeroUsize::new(200).unwrap(); + const NUM_MEASUREMENTS: NonZeroUsize = NonZeroUsize::new(100).unwrap(); + + let runs = vec![ + Run { + operation: Operation::Chamfer, + num_query_vectors: NUM_QUERY_VECTORS, + num_doc_vectors: NUM_DOC_VECTORS, + dim: DIM, + loops_per_measurement: LOOPS_PER_MEASUREMENT, + num_measurements: NUM_MEASUREMENTS, + }, + Run { + operation: Operation::MaxSim, + num_query_vectors: NUM_QUERY_VECTORS, + num_doc_vectors: NUM_DOC_VECTORS, + dim: DIM, + loops_per_measurement: LOOPS_PER_MEASUREMENT, + num_measurements: NUM_MEASUREMENTS, + }, + ]; + + Ok(serde_json::to_value(&Self { + element_type: DataType::Float32, + implementation: Implementation::Optimized, + runs, + })?) 
+ } +} + +////////////////////// +// Regression Check // +////////////////////// + +/// Tolerance thresholds for multi-vector benchmark regression detection. +/// +/// Each field specifies the maximum allowed relative increase in the corresponding metric. +/// For example, a value of `0.05` means a 5% increase is tolerated. +#[derive(Debug, Clone, Copy, Serialize, Deserialize)] +struct MultiVectorTolerance { + min_time_regression: NonNegativeFinite, +} + +impl CheckDeserialization for MultiVectorTolerance { + fn check_deserialization(&mut self, _checker: &mut Checker) -> Result<(), anyhow::Error> { + Ok(()) + } +} + +impl Input for MultiVectorTolerance { + fn tag() -> &'static str { + "multi-vector-tolerance" + } + + fn try_deserialize( + serialized: &serde_json::Value, + checker: &mut Checker, + ) -> anyhow::Result { + checker.any(Self::deserialize(serialized)?) + } + + fn example() -> anyhow::Result { + const EXAMPLE: NonNegativeFinite = match NonNegativeFinite::new(0.05) { + Ok(v) => v, + Err(_) => panic!("use a non-negative finite please"), + }; + + Ok(serde_json::to_value(MultiVectorTolerance { + min_time_regression: EXAMPLE, + })?) + } +} + +/// Per-run comparison result showing before/after percentile differences. +#[derive(Debug, Serialize)] +struct Comparison { + run: Run, + tolerance: MultiVectorTolerance, + before_min: f64, + after_min: f64, +} + +/// Aggregated result of the regression check across all runs. 
+#[derive(Debug, Serialize)] +struct CheckResult { + checks: Vec, +} + +impl std::fmt::Display for CheckResult { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let header = [ + "Operation", + "Q", + "D", + "Dim", + "Min Before (ns/IP @ Dim)", + "Min After (ns/IP @ Dim)", + "Change (%)", + "Remark", + ]; + + let mut table = diskann_benchmark_runner::utils::fmt::Table::new(header, self.checks.len()); + + for (i, c) in self.checks.iter().enumerate() { + let mut row = table.row(i); + let change = relative_change(c.before_min, c.after_min); + + row.insert(c.run.operation, 0); + row.insert(c.run.num_query_vectors, 1); + row.insert(c.run.num_doc_vectors, 2); + row.insert(c.run.dim, 3); + row.insert(format!("{:.3}", c.before_min), 4); + row.insert(format!("{:.3}", c.after_min), 5); + match change { + Ok(change) => { + row.insert(format!("{:.3} %", change * 100.0), 6); + if change > c.tolerance.min_time_regression.get() { + row.insert("FAIL", 7); + } + } + Err(err) => { + row.insert("invalid", 6); + row.insert(err, 7); + } + } + } + + table.fmt(f) + } +} + +//////////////////////////// +// Benchmark Registration // +//////////////////////////// + +fn register_benchmarks_impl(dispatcher: &mut diskann_benchmark_runner::registry::Benchmarks) { + // Optimized (architecture-dispatched QueryComputer). + dispatcher.register_regression( + "multi-vector-op-f32-optimized", + Kernel::::new(), + ); + dispatcher.register_regression( + "multi-vector-op-f16-optimized", + Kernel::::new(), + ); + + // Reference (Chamfer / MaxSim fallback path). + dispatcher.register_regression( + "multi-vector-op-f32-reference", + Kernel::::new(), + ); + dispatcher.register_regression( + "multi-vector-op-f16-reference", + Kernel::::new(), + ); +} + +////////////// +// Dispatch // +////////////// + +/// Dispatch marker for the [`QueryComputer`] implementation. +#[derive(Debug)] +struct Optimized; + +/// Dispatch marker for the [`Chamfer`] / [`MaxSim`] fallback. 
+#[derive(Debug)] +struct Reference; + +/// A multi-vector benchmark. +struct Kernel { + _type: std::marker::PhantomData<(I, T)>, +} + +impl Kernel { + fn new() -> Self { + Self { + _type: std::marker::PhantomData, + } + } +} + +#[derive(Debug, Error)] +#[error("implementation {0} is not registered for this benchmark")] +pub(crate) struct ImplementationMismatch(Implementation); + +impl DispatchRule for Optimized { + type Error = ImplementationMismatch; + + fn try_match(from: &Implementation) -> Result { + if *from == Implementation::Optimized { + Ok(MatchScore(0)) + } else { + Err(FailureScore(1)) + } + } + + fn convert(from: Implementation) -> Result { + if from == Implementation::Optimized { + Ok(Optimized) + } else { + Err(ImplementationMismatch(from)) + } + } + + fn description( + f: &mut std::fmt::Formatter<'_>, + from: Option<&Implementation>, + ) -> std::fmt::Result { + match from { + None => write!(f, "QueryComputer (architecture-dispatched)"), + Some(impl_) => { + if Self::try_match(impl_).is_ok() { + write!(f, "matched {}", impl_) + } else { + write!(f, "expected {}, got {}", Implementation::Optimized, impl_) + } + } + } + } +} + +impl DispatchRule for Reference { + type Error = ImplementationMismatch; + + fn try_match(from: &Implementation) -> Result { + if *from == Implementation::Reference { + Ok(MatchScore(0)) + } else { + Err(FailureScore(1)) + } + } + + fn convert(from: Implementation) -> Result { + if from == Implementation::Reference { + Ok(Reference) + } else { + Err(ImplementationMismatch(from)) + } + } + + fn description( + f: &mut std::fmt::Formatter<'_>, + from: Option<&Implementation>, + ) -> std::fmt::Result { + match from { + None => write!(f, "Chamfer / MaxSim fallback"), + Some(impl_) => { + if Self::try_match(impl_).is_ok() { + write!(f, "matched {}", impl_) + } else { + write!(f, "expected {}, got {}", Implementation::Reference, impl_) + } + } + } + } +} + +impl Benchmark for Kernel +where + datatype::Type: DispatchRule, + I: 
DispatchRule + 'static, + Kernel: RunBenchmark, + T: 'static, +{ + type Input = MultiVectorOp; + type Output = Vec; + + fn try_match(&self, from: &MultiVectorOp) -> Result { + let mut failscore: Option = None; + if datatype::Type::::try_match(&from.element_type).is_err() { + *failscore.get_or_insert(0) += 10; + } + if let Err(FailureScore(score)) = I::try_match(&from.implementation) { + *failscore.get_or_insert(0) += 2 + score; + } + + match failscore { + None => Ok(MatchScore(0)), + Some(score) => Err(FailureScore(score)), + } + } + + fn run( + &self, + input: &MultiVectorOp, + _: diskann_benchmark_runner::Checkpoint<'_>, + mut output: &mut dyn diskann_benchmark_runner::Output, + ) -> anyhow::Result { + let _ = I::convert(input.implementation)?; + writeln!(output, "{}", input)?; + let results = self.run_benchmark(input)?; + writeln!(output, "\n\n{}", DisplayWrapper(&*results))?; + Ok(results) + } + + fn description( + &self, + f: &mut std::fmt::Formatter<'_>, + input: Option<&MultiVectorOp>, + ) -> std::fmt::Result { + match input { + None => { + writeln!( + f, + "- Element Type: {}", + Description::>::new() + )?; + writeln!( + f, + "- Implementation: {}", + Description::::new() + )?; + } + Some(input) => { + if let Err(err) = datatype::Type::::try_match_verbose(&input.element_type) { + writeln!(f, "\n - Mismatched element type: {}", err)?; + } + if let Err(err) = I::try_match_verbose(&input.implementation) { + writeln!(f, "\n - Mismatched implementation: {}", err)?; + } + } + } + Ok(()) + } +} + +impl Regression for Kernel +where + datatype::Type: DispatchRule, + I: DispatchRule + 'static, + Kernel: RunBenchmark, + T: 'static, +{ + type Tolerances = MultiVectorTolerance; + type Pass = CheckResult; + type Fail = CheckResult; + + fn check( + &self, + tolerance: &MultiVectorTolerance, + _input: &MultiVectorOp, + before: &Vec, + after: &Vec, + ) -> anyhow::Result> { + anyhow::ensure!( + before.len() == after.len(), + "before has {} runs but after has {}", + 
before.len(), + after.len(), + ); + + let mut passed = true; + let checks: Vec = std::iter::zip(before.iter(), after.iter()) + .enumerate() + .map(|(i, (b, a))| { + anyhow::ensure!(b.run == a.run, "run {i} mismatched"); + + let computations_per_latency = b.computations_per_latency() as f64; + + let before_min = b.percentiles.minimum.as_f64() * 1000.0 / computations_per_latency; + let after_min = a.percentiles.minimum.as_f64() * 1000.0 / computations_per_latency; + + let comparison = Comparison { + run: b.run.clone(), + tolerance: *tolerance, + before_min, + after_min, + }; + + match relative_change(before_min, after_min) { + Ok(change) => { + if change > tolerance.min_time_regression.get() { + passed = false; + } + } + Err(_) => passed = false, + }; + + Ok(comparison) + }) + .collect::>>()?; + + let check = CheckResult { checks }; + + if passed { + Ok(PassFail::Pass(check)) + } else { + Ok(PassFail::Fail(check)) + } + } +} + +/////////////// +// Benchmark // +/////////////// + +trait RunBenchmark { + fn run_benchmark(&self, input: &MultiVectorOp) -> Result, anyhow::Error>; +} + +#[derive(Debug, Serialize, Deserialize)] +struct RunResult { + /// The configuration for this run. + run: Run, + /// Per-measurement latencies (over `loops_per_measurement` calls). + latencies: Vec, + /// Latency percentiles. + percentiles: percentiles::Percentiles, +} + +impl RunResult { + fn computations_per_latency(&self) -> usize { + self.run.num_query_vectors.get() + * self.run.num_doc_vectors.get() + * self.run.loops_per_measurement.get() + } +} + +impl std::fmt::Display for DisplayWrapper<'_, [RunResult]> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if self.is_empty() { + return Ok(()); + } + + // ns/IP is normalized as `min_latency_us * 1000 / (Q * D * loops)` and is + // approximately linear in `dim`. Compare across rows with the same `Dim`; + // divide further by `Dim` to recover ns per scalar multiply. 
+ writeln!( + f, + "ns/IP = time per (query, doc) inner-product call (~ linear in Dim)" + )?; + + let header = [ + "Operation", + "Q", + "D", + "Dim", + "Min Time (ns/IP @ Dim)", + "Mean Time (ns/IP @ Dim)", + "Loops", + "Measurements", + ]; + + let mut table = diskann_benchmark_runner::utils::fmt::Table::new(header, self.len()); + + self.iter().enumerate().for_each(|(row, r)| { + let mut row = table.row(row); + + let min_latency = r + .latencies + .iter() + .min() + .copied() + .unwrap_or(MicroSeconds::new(u64::MAX)); + let mean_latency = r.percentiles.mean; + + let computations_per_latency = r.computations_per_latency() as f64; + + // Convert time from micro-seconds to nano-seconds per inner-product call + // (one (query, doc) pair, ~ linear in dim). + let min_time = min_latency.as_f64() / computations_per_latency * 1000.0; + let mean_time = mean_latency / computations_per_latency * 1000.0; + + row.insert(r.run.operation, 0); + row.insert(r.run.num_query_vectors, 1); + row.insert(r.run.num_doc_vectors, 2); + row.insert(r.run.dim, 3); + row.insert(format!("{:.3}", min_time), 4); + row.insert(format!("{:.3}", mean_time), 5); + row.insert(r.run.loops_per_measurement, 6); + row.insert(r.run.num_measurements, 7); + }); + + table.fmt(f) + } +} + +fn run_loops(run: &Run, mut body: F) -> RunResult +where + F: FnMut(), +{ + let mut latencies = Vec::with_capacity(run.num_measurements.get()); + + for _ in 0..run.num_measurements.get() { + let start = std::time::Instant::now(); + for _ in 0..run.loops_per_measurement.get() { + body(); + } + latencies.push(start.elapsed().into()); + } + + let percentiles = percentiles::compute_percentiles(&mut latencies).unwrap(); + RunResult { + run: run.clone(), + latencies, + percentiles, + } +} + +/////////////////// +// Data fixtures // +/////////////////// + +const RNG_SEED: u64 = 0x12345; + +struct Data { + query_data: Box<[T]>, + doc_data: Box<[T]>, +} + +impl Data +where + StandardUniform: Distribution, +{ + fn new(run: &Run) -> Self 
{ + let mut rng = StdRng::seed_from_u64(RNG_SEED); + let query_data: Box<[T]> = (0..run.num_query_vectors.get() * run.dim.get()) + .map(|_| StandardUniform.sample(&mut rng)) + .collect(); + let doc_data: Box<[T]> = (0..run.num_doc_vectors.get() * run.dim.get()) + .map(|_| StandardUniform.sample(&mut rng)) + .collect(); + + Self { + query_data, + doc_data, + } + } + + fn query(&self, run: &Run) -> MatRef<'_, Standard> { + MatRef::new( + Standard::new(run.num_query_vectors.get(), run.dim.get()).unwrap(), + &self.query_data, + ) + .unwrap() + } + + fn doc(&self, run: &Run) -> MatRef<'_, Standard> { + MatRef::new( + Standard::new(run.num_doc_vectors.get(), run.dim.get()).unwrap(), + &self.doc_data, + ) + .unwrap() + } +} + +///////////////////// +// Implementations // +///////////////////// + +fn run_optimized(input: &MultiVectorOp) -> anyhow::Result> +where + T: Copy, + StandardUniform: Distribution, + QueryComputer: NewFromMatRef, +{ + let mut results = Vec::with_capacity(input.runs.len()); + for run in input.runs.iter() { + let data = Data::::new(run); + let computer = as NewFromMatRef>::new_from(data.query(run)); + let doc = data.doc(run); + + let result = match run.operation { + Operation::Chamfer => run_loops(run, || { + let v = computer.chamfer(doc); + std::hint::black_box(v); + }), + Operation::MaxSim => { + let mut scores = vec![0.0f32; run.num_query_vectors.get()]; + run_loops(run, || { + computer.max_sim(doc, &mut scores); + std::hint::black_box(&mut scores); + }) + } + }; + results.push(result); + } + Ok(results) +} + +/// Drive the [`Chamfer`] / [`MaxSim`] fallback path. 
+fn run_reference(input: &MultiVectorOp) -> anyhow::Result> +where + T: Copy, + StandardUniform: Distribution, + InnerProduct: for<'a, 'b> PureDistanceFunction<&'a [T], &'b [T], f32>, +{ + let mut results = Vec::with_capacity(input.runs.len()); + for run in input.runs.iter() { + let data = Data::::new(run); + let query = data.query(run); + let doc = data.doc(run); + + let result = match run.operation { + Operation::Chamfer => run_loops(run, || { + let v = Chamfer::evaluate(query.into(), doc); + std::hint::black_box(v); + }), + Operation::MaxSim => { + let mut scores = vec![0.0f32; run.num_query_vectors.get()]; + run_loops(run, || { + let mut max_sim = MaxSim::new(&mut scores).unwrap(); + let _ = max_sim.evaluate(query.into(), doc); + std::hint::black_box(&mut scores); + }) + } + }; + results.push(result); + } + Ok(results) +} + +impl RunBenchmark for Kernel { + fn run_benchmark(&self, input: &MultiVectorOp) -> Result, anyhow::Error> { + run_optimized::(input) + } +} + +impl RunBenchmark for Kernel { + fn run_benchmark(&self, input: &MultiVectorOp) -> Result, anyhow::Error> { + run_optimized::(input) + } +} + +impl RunBenchmark for Kernel { + fn run_benchmark(&self, input: &MultiVectorOp) -> Result, anyhow::Error> { + run_reference::(input) + } +} + +impl RunBenchmark for Kernel { + fn run_benchmark(&self, input: &MultiVectorOp) -> Result, anyhow::Error> { + run_reference::(input) + } +} + +/// Element-type-erasing constructor for [`QueryComputer`]. 
+trait NewFromMatRef { + fn new_from(query: MatRef<'_, Standard>) -> QueryComputer; +} + +impl NewFromMatRef for QueryComputer { + fn new_from(query: MatRef<'_, Standard>) -> QueryComputer { + QueryComputer::::new(query) + } +} + +impl NewFromMatRef for QueryComputer { + fn new_from(query: MatRef<'_, Standard>) -> QueryComputer { + QueryComputer::::new(query) + } +} + +/////////// +// Tests // +/////////// + +#[cfg(test)] +mod tests { + use super::*; + + use diskann_benchmark_runner::{ + benchmark::{PassFail, Regression}, + utils::percentiles::compute_percentiles, + }; + + fn tiny_run(operation: Operation) -> Run { + Run { + operation, + num_query_vectors: NonZeroUsize::new(2).unwrap(), + num_doc_vectors: NonZeroUsize::new(2).unwrap(), + dim: NonZeroUsize::new(4).unwrap(), + loops_per_measurement: NonZeroUsize::new(1).unwrap(), + num_measurements: NonZeroUsize::new(1).unwrap(), + } + } + + fn tiny_op() -> MultiVectorOp { + MultiVectorOp { + element_type: DataType::Float32, + implementation: Implementation::Optimized, + runs: vec![tiny_run(Operation::Chamfer)], + } + } + + fn tiny_result(operation: Operation, minimum: u64) -> RunResult { + let run = tiny_run(operation); + let minimum = MicroSeconds::new(minimum); + let mut latencies = vec![minimum]; + let percentiles = compute_percentiles(&mut latencies).unwrap(); + RunResult { + run, + latencies, + percentiles, + } + } + + fn tolerance(limit: f64) -> MultiVectorTolerance { + MultiVectorTolerance { + min_time_regression: NonNegativeFinite::new(limit).unwrap(), + } + } + + #[test] + fn check_rejects_mismatched_runs() { + let kernel = Kernel::::new(); + + let err = kernel + .check( + &tolerance(0.0), + &tiny_op(), + &vec![tiny_result(Operation::Chamfer, 100)], + &vec![tiny_result(Operation::MaxSim, 100)], + ) + .unwrap_err(); + + assert_eq!(err.to_string(), "run 0 mismatched"); + } + + #[test] + fn check_allows_negative_relative_change() { + let kernel = Kernel::::new(); + + let result = kernel + .check( + 
&tolerance(0.0), + &tiny_op(), + &vec![tiny_result(Operation::Chamfer, 100)], + &vec![tiny_result(Operation::Chamfer, 95)], + ) + .unwrap(); + + assert!(matches!(result, PassFail::Pass(_))); + } + + #[test] + fn check_passes_on_tolerance_boundary() { + let kernel = Kernel::::new(); + + let result = kernel + .check( + &tolerance(0.05), + &tiny_op(), + &vec![tiny_result(Operation::Chamfer, 100)], + &vec![tiny_result(Operation::Chamfer, 105)], + ) + .unwrap(); + + assert!(matches!(result, PassFail::Pass(_))); + } + + #[test] + fn check_fails_above_tolerance_boundary() { + let kernel = Kernel::::new(); + + let result = kernel + .check( + &tolerance(0.05), + &tiny_op(), + &vec![tiny_result(Operation::Chamfer, 100)], + &vec![tiny_result(Operation::Chamfer, 106)], + ) + .unwrap(); + + assert!(matches!(result, PassFail::Fail(_))); + } + + #[test] + fn check_result_display_includes_failure_details() { + let check = CheckResult { + checks: vec![Comparison { + run: tiny_run(Operation::Chamfer), + tolerance: tolerance(0.05), + before_min: 100.0, + after_min: 106.0, + }], + }; + + let rendered = check.to_string(); + assert!(rendered.contains("Operation"), "rendered = {rendered}"); + assert!(rendered.contains("chamfer"), "rendered = {rendered}"); + assert!(rendered.contains("100.000"), "rendered = {rendered}"); + assert!(rendered.contains("106.000"), "rendered = {rendered}"); + assert!(rendered.contains("6.000 %"), "rendered = {rendered}"); + assert!(rendered.contains("FAIL"), "rendered = {rendered}"); + } + + /// A "before" value of 0 means the measurement was too fast to obtain a + /// reliable signal, so we *could* be letting a regression through. We + /// require at least a non-zero value. 
+ #[test] + fn zero_values_rejected() { + let kernel = Kernel::::new(); + + let result = kernel + .check( + &tolerance(0.05), + &tiny_op(), + &vec![tiny_result(Operation::Chamfer, 0)], + &vec![tiny_result(Operation::Chamfer, 0)], + ) + .unwrap(); + + assert!(matches!(result, PassFail::Fail(_))); + } + + /// Sanity-check that the optimized kernel and the reference path produce + /// numerically equivalent Chamfer scores on a small fixture. + #[test] + fn optimized_chamfer_matches_reference_f32() { + let run = Run { + operation: Operation::Chamfer, + num_query_vectors: NonZeroUsize::new(5).unwrap(), + num_doc_vectors: NonZeroUsize::new(7).unwrap(), + dim: NonZeroUsize::new(16).unwrap(), + loops_per_measurement: NonZeroUsize::new(1).unwrap(), + num_measurements: NonZeroUsize::new(1).unwrap(), + }; + + let data = Data::::new(&run); + let query = data.query(&run); + let doc = data.doc(&run); + + let optimized = QueryComputer::::new(query).chamfer(doc); + let reference = Chamfer::evaluate(query.into(), doc); + + assert!( + (optimized - reference).abs() < 1e-4, + "optimized={optimized}, reference={reference}", + ); + } +} diff --git a/results.json b/results.json new file mode 100644 index 000000000..f061f6750 --- /dev/null +++ b/results.json @@ -0,0 +1,2150 @@ +[ + { + "input": { + "content": { + "element_type": "float32", + "implementation": "optimized", + "runs": [ + { + "dim": 128, + "loops_per_measurement": 500, + "num_doc_vectors": 32, + "num_measurements": 50, + "num_query_vectors": 8, + "operation": "chamfer" + }, + { + "dim": 256, + "loops_per_measurement": 100, + "num_doc_vectors": 64, + "num_measurements": 50, + "num_query_vectors": 16, + "operation": "chamfer" + }, + { + "dim": 384, + "loops_per_measurement": 20, + "num_doc_vectors": 128, + "num_measurements": 50, + "num_query_vectors": 32, + "operation": "chamfer" + }, + { + "dim": 256, + "loops_per_measurement": 200, + "num_doc_vectors": 16, + "num_measurements": 50, + "num_query_vectors": 32, + "operation": 
"chamfer" + }, + { + "dim": 264, + "loops_per_measurement": 50, + "num_doc_vectors": 32, + "num_measurements": 50, + "num_query_vectors": 64, + "operation": "chamfer" + }, + { + "dim": 128, + "loops_per_measurement": 10, + "num_doc_vectors": 1250, + "num_measurements": 50, + "num_query_vectors": 32, + "operation": "chamfer" + }, + { + "dim": 512, + "loops_per_measurement": 2, + "num_doc_vectors": 1250, + "num_measurements": 20, + "num_query_vectors": 64, + "operation": "chamfer" + }, + { + "dim": 128, + "loops_per_measurement": 200, + "num_doc_vectors": 32, + "num_measurements": 50, + "num_query_vectors": 64, + "operation": "chamfer" + }, + { + "dim": 512, + "loops_per_measurement": 50, + "num_doc_vectors": 32, + "num_measurements": 50, + "num_query_vectors": 32, + "operation": "chamfer" + }, + { + "dim": 128, + "loops_per_measurement": 500, + "num_doc_vectors": 32, + "num_measurements": 50, + "num_query_vectors": 8, + "operation": "max_sim" + }, + { + "dim": 256, + "loops_per_measurement": 100, + "num_doc_vectors": 64, + "num_measurements": 50, + "num_query_vectors": 16, + "operation": "max_sim" + }, + { + "dim": 384, + "loops_per_measurement": 20, + "num_doc_vectors": 128, + "num_measurements": 50, + "num_query_vectors": 32, + "operation": "max_sim" + }, + { + "dim": 256, + "loops_per_measurement": 200, + "num_doc_vectors": 16, + "num_measurements": 50, + "num_query_vectors": 32, + "operation": "max_sim" + }, + { + "dim": 264, + "loops_per_measurement": 50, + "num_doc_vectors": 32, + "num_measurements": 50, + "num_query_vectors": 64, + "operation": "max_sim" + }, + { + "dim": 128, + "loops_per_measurement": 10, + "num_doc_vectors": 1250, + "num_measurements": 50, + "num_query_vectors": 32, + "operation": "max_sim" + }, + { + "dim": 512, + "loops_per_measurement": 2, + "num_doc_vectors": 1250, + "num_measurements": 20, + "num_query_vectors": 64, + "operation": "max_sim" + }, + { + "dim": 128, + "loops_per_measurement": 200, + "num_doc_vectors": 32, + 
"num_measurements": 50, + "num_query_vectors": 64, + "operation": "max_sim" + }, + { + "dim": 512, + "loops_per_measurement": 50, + "num_doc_vectors": 32, + "num_measurements": 50, + "num_query_vectors": 32, + "operation": "max_sim" + } + ] + }, + "type": "multi-vector-op" + }, + "results": [ + { + "latencies": [ + 777, + 777, + 778, + 780, + 780, + 781, + 804, + 838, + 838, + 838, + 838, + 839, + 839, + 839, + 840, + 842, + 845, + 850, + 899, + 926, + 927, + 931, + 932, + 937, + 939, + 956, + 978, + 1034, + 1035, + 1036, + 1053, + 1064, + 1065, + 1147, + 1164, + 1165, + 1165, + 1166, + 1173, + 1221, + 1323, + 1333, + 1350, + 1352, + 1353, + 1353, + 1357, + 1393, + 1529, + 1537 + ], + "percentiles": { + "mean": 1030.32, + "median": 947.5, + "minimum": 777, + "p90": 1353, + "p99": 1537 + }, + "run": { + "dim": 128, + "loops_per_measurement": 500, + "num_doc_vectors": 32, + "num_measurements": 50, + "num_query_vectors": 8, + "operation": "chamfer" + } + }, + { + "latencies": [ + 1029, + 1029, + 1030, + 1030, + 1030, + 1030, + 1030, + 1031, + 1032, + 1034, + 1035, + 1038, + 1050, + 1058, + 1070, + 1112, + 1112, + 1112, + 1112, + 1112, + 1112, + 1112, + 1113, + 1117, + 1119, + 1120, + 1123, + 1145, + 1146, + 1146, + 1146, + 1148, + 1152, + 1167, + 1192, + 1192, + 1192, + 1192, + 1193, + 1207, + 1235, + 1251, + 1254, + 1256, + 1257, + 1261, + 1293, + 1330, + 1330, + 1344 + ], + "percentiles": { + "mean": 1139.22, + "median": 1119.5, + "minimum": 1029, + "p90": 1261, + "p99": 1344 + }, + "run": { + "dim": 256, + "loops_per_measurement": 100, + "num_doc_vectors": 64, + "num_measurements": 50, + "num_query_vectors": 16, + "operation": "chamfer" + } + }, + { + "latencies": [ + 1210, + 1210, + 1210, + 1210, + 1210, + 1210, + 1210, + 1210, + 1210, + 1211, + 1212, + 1212, + 1212, + 1212, + 1213, + 1213, + 1213, + 1213, + 1213, + 1214, + 1217, + 1217, + 1220, + 1223, + 1225, + 1226, + 1227, + 1229, + 1231, + 1235, + 1235, + 1239, + 1239, + 1240, + 1244, + 1249, + 1252, + 1259, 
+ 1264, + 1270, + 1281, + 1294, + 1299, + 1306, + 1312, + 1315, + 1332, + 1341, + 1383, + 1484 + ], + "percentiles": { + "mean": 1246.32, + "median": 1225.5, + "minimum": 1210, + "p90": 1315, + "p99": 1484 + }, + "run": { + "dim": 384, + "loops_per_measurement": 20, + "num_doc_vectors": 128, + "num_measurements": 50, + "num_query_vectors": 32, + "operation": "chamfer" + } + }, + { + "latencies": [ + 958, + 958, + 958, + 958, + 958, + 960, + 960, + 960, + 961, + 961, + 961, + 961, + 961, + 961, + 961, + 961, + 961, + 961, + 961, + 961, + 961, + 961, + 961, + 961, + 961, + 961, + 961, + 962, + 962, + 963, + 964, + 964, + 965, + 965, + 965, + 966, + 966, + 973, + 974, + 974, + 981, + 981, + 983, + 985, + 987, + 987, + 987, + 990, + 999, + 999 + ], + "percentiles": { + "mean": 967.42, + "median": 961.0, + "minimum": 958, + "p90": 987, + "p99": 999 + }, + "run": { + "dim": 256, + "loops_per_measurement": 200, + "num_doc_vectors": 16, + "num_measurements": 50, + "num_query_vectors": 32, + "operation": "chamfer" + } + }, + { + "latencies": [ + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1018, + 1018, + 1018, + 1018, + 1018, + 1018, + 1018, + 1018, + 1018, + 1019, + 1019, + 1019, + 1019, + 1020, + 1020, + 1020, + 1020, + 1020, + 1020, + 1021, + 1022, + 1023, + 1023, + 1026, + 1029, + 1031, + 1032, + 1033, + 1034, + 1035, + 1036, + 1037, + 1041, + 1044, + 1044, + 1045, + 1046, + 1065 + ], + "percentiles": { + "mean": 1024.58, + "median": 1019.5, + "minimum": 1017, + "p90": 1044, + "p99": 1065 + }, + "run": { + "dim": 264, + "loops_per_measurement": 50, + "num_doc_vectors": 32, + "num_measurements": 50, + "num_query_vectors": 64, + "operation": "chamfer" + } + }, + { + "latencies": [ + 1854, + 1855, + 1855, + 1855, + 1855, + 1855, + 1856, + 1856, + 1856, + 1857, + 1857, + 1857, + 1857, + 1857, + 1857, + 1858, + 1858, + 1858, + 1858, + 1858, + 1858, + 1858, + 1859, + 1860, + 1861, + 1861, + 1863, + 1866, + 1869, + 1870, + 
1871, + 1871, + 1871, + 1872, + 1874, + 1875, + 1881, + 1883, + 1885, + 1885, + 1890, + 1892, + 1892, + 1892, + 1892, + 1899, + 1906, + 1909, + 1909, + 1916 + ], + "percentiles": { + "mean": 1870.38, + "median": 1861.0, + "minimum": 1854, + "p90": 1899, + "p99": 1916 + }, + "run": { + "dim": 128, + "loops_per_measurement": 10, + "num_doc_vectors": 1250, + "num_measurements": 50, + "num_query_vectors": 32, + "operation": "chamfer" + } + }, + { + "latencies": [ + 3180, + 3180, + 3180, + 3180, + 3180, + 3181, + 3181, + 3181, + 3181, + 3183, + 3185, + 3187, + 3205, + 3206, + 3207, + 3208, + 3211, + 3218, + 3220, + 3268 + ], + "percentiles": { + "mean": 3196.1, + "median": 3184.0, + "minimum": 3180, + "p90": 3220, + "p99": 3268 + }, + "run": { + "dim": 512, + "loops_per_measurement": 2, + "num_doc_vectors": 1250, + "num_measurements": 20, + "num_query_vectors": 64, + "operation": "chamfer" + } + }, + { + "latencies": [ + 1784, + 1784, + 1784, + 1784, + 1784, + 1784, + 1785, + 1785, + 1790, + 1791, + 1791, + 1792, + 1792, + 1792, + 1792, + 1792, + 1792, + 1792, + 1795, + 1795, + 1796, + 1796, + 1796, + 1796, + 1798, + 1800, + 1803, + 1805, + 1814, + 1815, + 1817, + 1818, + 1821, + 1826, + 1840, + 1845, + 1856, + 1858, + 1878, + 1879, + 1879, + 1884, + 1888, + 1890, + 1893, + 1905, + 1907, + 1912, + 1918, + 1950 + ], + "percentiles": { + "mean": 1825.26, + "median": 1799.0, + "minimum": 1784, + "p90": 1905, + "p99": 1950 + }, + "run": { + "dim": 128, + "loops_per_measurement": 200, + "num_doc_vectors": 32, + "num_measurements": 50, + "num_query_vectors": 64, + "operation": "chamfer" + } + }, + { + "latencies": [ + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1018, + 1018, + 1018, + 1018, + 1018, + 1019, + 1020, + 1020, + 1021, + 1021, + 1022, + 1022, + 1022, + 1023, + 1027, + 1030, + 1030, + 1035, + 1043, + 1043, + 
1044, + 1045, + 1049, + 1049, + 1060 + ], + "percentiles": { + "mean": 1023.2, + "median": 1017.5, + "minimum": 1017, + "p90": 1044, + "p99": 1060 + }, + "run": { + "dim": 512, + "loops_per_measurement": 50, + "num_doc_vectors": 32, + "num_measurements": 50, + "num_query_vectors": 32, + "operation": "chamfer" + } + }, + { + "latencies": [ + 567, + 567, + 567, + 567, + 567, + 567, + 567, + 567, + 567, + 567, + 567, + 567, + 567, + 567, + 567, + 569, + 569, + 569, + 569, + 569, + 569, + 570, + 570, + 570, + 570, + 570, + 570, + 570, + 570, + 570, + 571, + 571, + 571, + 571, + 571, + 571, + 571, + 571, + 571, + 571, + 571, + 571, + 571, + 571, + 574, + 578, + 578, + 594, + 595, + 598 + ], + "percentiles": { + "mean": 571.2, + "median": 570.0, + "minimum": 567, + "p90": 578, + "p99": 598 + }, + "run": { + "dim": 128, + "loops_per_measurement": 500, + "num_doc_vectors": 32, + "num_measurements": 50, + "num_query_vectors": 8, + "operation": "max_sim" + } + }, + { + "latencies": [ + 988, + 988, + 988, + 988, + 988, + 988, + 988, + 988, + 988, + 989, + 989, + 989, + 989, + 989, + 989, + 989, + 989, + 989, + 991, + 991, + 991, + 991, + 991, + 991, + 991, + 991, + 991, + 991, + 991, + 991, + 991, + 991, + 992, + 992, + 992, + 992, + 992, + 992, + 992, + 992, + 992, + 996, + 996, + 1004, + 1009, + 1013, + 1018, + 1020, + 1047, + 1057 + ], + "percentiles": { + "mean": 995.1, + "median": 991.0, + "minimum": 988, + "p90": 1013, + "p99": 1057 + }, + "run": { + "dim": 256, + "loops_per_measurement": 100, + "num_doc_vectors": 64, + "num_measurements": 50, + "num_query_vectors": 16, + "operation": "max_sim" + } + }, + { + "latencies": [ + 1210, + 1210, + 1210, + 1210, + 1210, + 1210, + 1210, + 1210, + 1211, + 1211, + 1211, + 1212, + 1213, + 1213, + 1213, + 1213, + 1213, + 1213, + 1213, + 1213, + 1213, + 1213, + 1214, + 1214, + 1214, + 1214, + 1214, + 1214, + 1214, + 1214, + 1214, + 1214, + 1216, + 1217, + 1217, + 1217, + 1218, + 1220, + 1222, + 1223, + 1224, + 1224, + 1225, + 1227, 
+ 1238, + 1239, + 1239, + 1241, + 1242, + 1243 + ], + "percentiles": { + "mean": 1217.74, + "median": 1214.0, + "minimum": 1210, + "p90": 1239, + "p99": 1243 + }, + "run": { + "dim": 384, + "loops_per_measurement": 20, + "num_doc_vectors": 128, + "num_measurements": 50, + "num_query_vectors": 32, + "operation": "max_sim" + } + }, + { + "latencies": [ + 953, + 953, + 953, + 953, + 953, + 953, + 954, + 954, + 956, + 957, + 957, + 957, + 957, + 957, + 957, + 957, + 957, + 957, + 957, + 957, + 957, + 957, + 957, + 957, + 957, + 957, + 957, + 957, + 958, + 958, + 958, + 958, + 958, + 958, + 960, + 961, + 961, + 961, + 961, + 961, + 961, + 961, + 962, + 963, + 971, + 976, + 978, + 984, + 984, + 987 + ], + "percentiles": { + "mean": 960.1, + "median": 957.0, + "minimum": 953, + "p90": 976, + "p99": 987 + }, + "run": { + "dim": 256, + "loops_per_measurement": 200, + "num_doc_vectors": 16, + "num_measurements": 50, + "num_query_vectors": 32, + "operation": "max_sim" + } + }, + { + "latencies": [ + 1016, + 1016, + 1016, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1018, + 1018, + 1018, + 1018, + 1018, + 1018, + 1018, + 1018, + 1019, + 1019, + 1019, + 1019, + 1019, + 1019, + 1019, + 1021, + 1021, + 1023, + 1023, + 1025, + 1032, + 1044, + 1045, + 1045, + 1045, + 1047, + 1052, + 1058, + 1061 + ], + "percentiles": { + "mean": 1023.46, + "median": 1018.0, + "minimum": 1016, + "p90": 1045, + "p99": 1061 + }, + "run": { + "dim": 264, + "loops_per_measurement": 50, + "num_doc_vectors": 32, + "num_measurements": 50, + "num_query_vectors": 64, + "operation": "max_sim" + } + }, + { + "latencies": [ + 1858, + 1858, + 1860, + 1860, + 1860, + 1860, + 1860, + 1860, + 1860, + 1860, + 1860, + 1861, + 1861, + 1861, + 1861, + 1861, + 1861, + 1861, + 1862, + 1863, + 1863, + 1864, + 1865, + 1867, + 1868, + 1872, + 1873, + 1876, + 1878, + 1881, + 1882, + 1883, + 1888, + 1889, + 1889, + 1890, + 
1890, + 1890, + 1891, + 1892, + 1905, + 1906, + 1908, + 1934, + 1962, + 1967, + 1974, + 1988, + 2004, + 2014 + ], + "percentiles": { + "mean": 1887.22, + "median": 1870.0, + "minimum": 1858, + "p90": 1967, + "p99": 2014 + }, + "run": { + "dim": 128, + "loops_per_measurement": 10, + "num_doc_vectors": 1250, + "num_measurements": 50, + "num_query_vectors": 32, + "operation": "max_sim" + } + }, + { + "latencies": [ + 3177, + 3177, + 3177, + 3179, + 3192, + 3201, + 3212, + 3222, + 3251, + 3251, + 3255, + 3256, + 3256, + 3321, + 3381, + 3399, + 3400, + 3419, + 3422, + 3445 + ], + "percentiles": { + "mean": 3279.65, + "median": 3253.0, + "minimum": 3177, + "p90": 3422, + "p99": 3445 + }, + "run": { + "dim": 512, + "loops_per_measurement": 2, + "num_doc_vectors": 1250, + "num_measurements": 20, + "num_query_vectors": 64, + "operation": "max_sim" + } + }, + { + "latencies": [ + 1783, + 1784, + 1787, + 1791, + 1791, + 1791, + 1813, + 1838, + 1853, + 1868, + 1871, + 1882, + 1882, + 1884, + 1890, + 1899, + 1899, + 1899, + 1900, + 1901, + 1905, + 1906, + 1908, + 1909, + 1911, + 1911, + 1911, + 1911, + 1914, + 1915, + 1915, + 1916, + 1916, + 1917, + 1919, + 1922, + 1922, + 1923, + 1923, + 1925, + 1927, + 1927, + 1928, + 1929, + 1929, + 1933, + 1937, + 1938, + 1940, + 1983 + ], + "percentiles": { + "mean": 1893.52, + "median": 1911.0, + "minimum": 1783, + "p90": 1933, + "p99": 1983 + }, + "run": { + "dim": 128, + "loops_per_measurement": 200, + "num_doc_vectors": 32, + "num_measurements": 50, + "num_query_vectors": 64, + "operation": "max_sim" + } + }, + { + "latencies": [ + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1020, + 1023, + 1023, + 1025, + 1028, + 1033, + 1033, + 1034, + 1037, + 1038, + 1040, + 1043, + 1044, + 1052, + 1052, + 1057, + 1060, + 1063, + 1078, + 1088, + 1088, + 1088, + 1088, + 1088, + 1088, + 1088, + 1088, + 1090, + 1090, + 1090, + 1092, + 1093, + 1093, + 1094, + 1094 + ], + 
"percentiles": { + "mean": 1049.56, + "median": 1039.0, + "minimum": 1017, + "p90": 1092, + "p99": 1094 + }, + "run": { + "dim": 512, + "loops_per_measurement": 50, + "num_doc_vectors": 32, + "num_measurements": 50, + "num_query_vectors": 32, + "operation": "max_sim" + } + } + ] + }, + { + "input": { + "content": { + "element_type": "float16", + "implementation": "optimized", + "runs": [ + { + "dim": 256, + "loops_per_measurement": 100, + "num_doc_vectors": 64, + "num_measurements": 50, + "num_query_vectors": 16, + "operation": "chamfer" + }, + { + "dim": 128, + "loops_per_measurement": 10, + "num_doc_vectors": 1250, + "num_measurements": 50, + "num_query_vectors": 32, + "operation": "chamfer" + }, + { + "dim": 256, + "loops_per_measurement": 100, + "num_doc_vectors": 64, + "num_measurements": 50, + "num_query_vectors": 16, + "operation": "max_sim" + }, + { + "dim": 128, + "loops_per_measurement": 10, + "num_doc_vectors": 1250, + "num_measurements": 50, + "num_query_vectors": 32, + "operation": "max_sim" + } + ] + }, + "type": "multi-vector-op" + }, + "results": [ + { + "latencies": [ + 1734, + 1734, + 1736, + 1736, + 1737, + 1737, + 1737, + 1738, + 1738, + 1738, + 1738, + 1739, + 1740, + 1740, + 1741, + 1744, + 1744, + 1751, + 1751, + 1753, + 1754, + 1754, + 1756, + 1759, + 1761, + 1764, + 1767, + 1767, + 1767, + 1768, + 1768, + 1769, + 1769, + 1773, + 1774, + 1775, + 1779, + 1787, + 1794, + 1808, + 1822, + 1825, + 1829, + 1829, + 1844, + 1846, + 1852, + 1859, + 1903, + 2194 + ], + "percentiles": { + "mean": 1780.44, + "median": 1762.5, + "minimum": 1734, + "p90": 1846, + "p99": 2194 + }, + "run": { + "dim": 256, + "loops_per_measurement": 100, + "num_doc_vectors": 64, + "num_measurements": 50, + "num_query_vectors": 16, + "operation": "chamfer" + } + }, + { + "latencies": [ + 2130, + 2130, + 2130, + 2131, + 2133, + 2133, + 2140, + 2142, + 2149, + 2151, + 2158, + 2160, + 2163, + 2164, + 2166, + 2167, + 2167, + 2168, + 2171, + 2173, + 2174, + 2176, + 2177, + 2178, 
+ 2178, + 2181, + 2184, + 2189, + 2195, + 2195, + 2197, + 2198, + 2198, + 2201, + 2203, + 2207, + 2215, + 2217, + 2220, + 2229, + 2240, + 2242, + 2243, + 2249, + 2250, + 2291, + 2305, + 2438, + 2613, + 2643 + ], + "percentiles": { + "mean": 2209.04, + "median": 2179.5, + "minimum": 2130, + "p90": 2291, + "p99": 2643 + }, + "run": { + "dim": 128, + "loops_per_measurement": 10, + "num_doc_vectors": 1250, + "num_measurements": 50, + "num_query_vectors": 32, + "operation": "chamfer" + } + }, + { + "latencies": [ + 1731, + 1733, + 1737, + 1737, + 1737, + 1741, + 1741, + 1745, + 1745, + 1750, + 1750, + 1750, + 1750, + 1751, + 1754, + 1754, + 1755, + 1758, + 1758, + 1759, + 1761, + 1761, + 1766, + 1768, + 1770, + 1771, + 1771, + 1772, + 1773, + 1773, + 1775, + 1776, + 1776, + 1778, + 1785, + 1788, + 1789, + 1791, + 1795, + 1800, + 1804, + 1808, + 1814, + 1822, + 1832, + 1833, + 1834, + 1864, + 1867, + 1869 + ], + "percentiles": { + "mean": 1776.44, + "median": 1770.5, + "minimum": 1731, + "p90": 1833, + "p99": 1869 + }, + "run": { + "dim": 256, + "loops_per_measurement": 100, + "num_doc_vectors": 64, + "num_measurements": 50, + "num_query_vectors": 16, + "operation": "max_sim" + } + }, + { + "latencies": [ + 2127, + 2127, + 2129, + 2130, + 2132, + 2141, + 2142, + 2142, + 2147, + 2148, + 2149, + 2150, + 2154, + 2154, + 2159, + 2162, + 2166, + 2168, + 2170, + 2173, + 2177, + 2180, + 2180, + 2181, + 2181, + 2182, + 2183, + 2187, + 2196, + 2196, + 2199, + 2200, + 2204, + 2211, + 2213, + 2216, + 2224, + 2255, + 2256, + 2271, + 2354, + 2488, + 2493, + 2495, + 2498, + 2505, + 2525, + 2653, + 2657, + 3515 + ], + "percentiles": { + "mean": 2264.9, + "median": 2181.5, + "minimum": 2127, + "p90": 2505, + "p99": 3515 + }, + "run": { + "dim": 128, + "loops_per_measurement": 10, + "num_doc_vectors": 1250, + "num_measurements": 50, + "num_query_vectors": 32, + "operation": "max_sim" + } + } + ] + }, + { + "input": { + "content": { + "element_type": "float32", + "implementation": 
"reference", + "runs": [ + { + "dim": 128, + "loops_per_measurement": 50, + "num_doc_vectors": 32, + "num_measurements": 50, + "num_query_vectors": 8, + "operation": "chamfer" + }, + { + "dim": 384, + "loops_per_measurement": 2, + "num_doc_vectors": 128, + "num_measurements": 50, + "num_query_vectors": 32, + "operation": "chamfer" + }, + { + "dim": 128, + "loops_per_measurement": 50, + "num_doc_vectors": 32, + "num_measurements": 50, + "num_query_vectors": 8, + "operation": "max_sim" + }, + { + "dim": 384, + "loops_per_measurement": 2, + "num_doc_vectors": 128, + "num_measurements": 50, + "num_query_vectors": 32, + "operation": "max_sim" + } + ] + }, + "type": "multi-vector-op" + }, + "results": [ + { + "latencies": [ + 64, + 64, + 64, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 67, + 67, + 67, + 67, + 67, + 67, + 67, + 67, + 67, + 68, + 68, + 69, + 71, + 127 + ], + "percentiles": { + "mean": 67.52, + "median": 66.0, + "minimum": 64, + "p90": 68, + "p99": 127 + }, + "run": { + "dim": 128, + "loops_per_measurement": 50, + "num_doc_vectors": 32, + "num_measurements": 50, + "num_query_vectors": 8, + "operation": "chamfer" + } + }, + { + "latencies": [ + 130, + 130, + 130, + 130, + 130, + 130, + 130, + 130, + 130, + 130, + 131, + 131, + 131, + 131, + 132, + 132, + 133, + 133, + 135, + 136, + 136, + 137, + 138, + 138, + 138, + 138, + 138, + 138, + 138, + 138, + 138, + 138, + 138, + 138, + 138, + 138, + 138, + 138, + 139, + 139, + 139, + 139, + 139, + 140, + 140, + 140, + 141, + 143, + 147, + 161 + ], + "percentiles": { + "mean": 136.26, + "median": 138.0, + "minimum": 130, + "p90": 140, + "p99": 161 + }, + "run": { + "dim": 384, + "loops_per_measurement": 2, + "num_doc_vectors": 128, + "num_measurements": 50, + "num_query_vectors": 32, + "operation": "chamfer" + } + }, + { + "latencies": [ + 62, + 62, + 62, + 62, + 
62, + 63, + 63, + 63, + 63, + 63, + 63, + 63, + 63, + 63, + 63, + 64, + 64, + 65, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 67, + 67, + 67, + 67, + 67, + 67, + 67, + 68, + 68, + 69, + 71, + 72, + 78, + 106 + ], + "percentiles": { + "mean": 66.44, + "median": 66.0, + "minimum": 62, + "p90": 69, + "p99": 106 + }, + "run": { + "dim": 128, + "loops_per_measurement": 50, + "num_doc_vectors": 32, + "num_measurements": 50, + "num_query_vectors": 8, + "operation": "max_sim" + } + }, + { + "latencies": [ + 130, + 131, + 131, + 131, + 131, + 131, + 131, + 131, + 131, + 131, + 131, + 131, + 131, + 131, + 131, + 131, + 131, + 131, + 131, + 131, + 131, + 131, + 131, + 132, + 132, + 132, + 132, + 132, + 133, + 133, + 134, + 134, + 135, + 135, + 135, + 136, + 136, + 137, + 139, + 139, + 140, + 142, + 142, + 143, + 144, + 145, + 145, + 147, + 155, + 158 + ], + "percentiles": { + "mean": 135.18, + "median": 132.0, + "minimum": 130, + "p90": 145, + "p99": 158 + }, + "run": { + "dim": 384, + "loops_per_measurement": 2, + "num_doc_vectors": 128, + "num_measurements": 50, + "num_query_vectors": 32, + "operation": "max_sim" + } + } + ] + }, + { + "input": { + "content": { + "element_type": "float16", + "implementation": "reference", + "runs": [ + { + "dim": 128, + "loops_per_measurement": 50, + "num_doc_vectors": 32, + "num_measurements": 50, + "num_query_vectors": 8, + "operation": "chamfer" + }, + { + "dim": 384, + "loops_per_measurement": 2, + "num_doc_vectors": 128, + "num_measurements": 50, + "num_query_vectors": 32, + "operation": "max_sim" + } + ] + }, + "type": "multi-vector-op" + }, + "results": [ + { + "latencies": [ + 73, + 73, + 73, + 73, + 73, + 73, + 73, + 73, + 73, + 73, + 73, + 73, + 73, + 73, + 73, + 73, + 73, + 74, + 74, + 74, + 74, + 74, + 74, + 75, + 75, + 76, + 76, + 76, + 76, + 76, + 76, + 77, + 77, + 77, + 77, + 77, + 77, + 77, + 77, + 77, + 78, + 78, + 78, + 79, + 80, + 80, + 80, + 84, + 87, + 92 
+ ], + "percentiles": { + "mean": 76.0, + "median": 75.5, + "minimum": 73, + "p90": 80, + "p99": 92 + }, + "run": { + "dim": 128, + "loops_per_measurement": 50, + "num_doc_vectors": 32, + "num_measurements": 50, + "num_query_vectors": 8, + "operation": "chamfer" + } + }, + { + "latencies": [ + 135, + 135, + 135, + 135, + 135, + 135, + 135, + 135, + 136, + 136, + 137, + 138, + 140, + 141, + 141, + 141, + 141, + 141, + 142, + 142, + 142, + 142, + 142, + 142, + 142, + 142, + 142, + 142, + 142, + 142, + 142, + 142, + 142, + 142, + 143, + 143, + 143, + 144, + 144, + 145, + 145, + 145, + 147, + 150, + 151, + 151, + 153, + 154, + 158, + 158 + ], + "percentiles": { + "mean": 142.36, + "median": 142.0, + "minimum": 135, + "p90": 151, + "p99": 158 + }, + "run": { + "dim": 384, + "loops_per_measurement": 2, + "num_doc_vectors": 128, + "num_measurements": 50, + "num_query_vectors": 32, + "operation": "max_sim" + } + } + ] + } +] \ No newline at end of file From 54a21ec8f274006c433fcddf111cd2580aa184e1 Mon Sep 17 00:00:00 2001 From: Suryansh Gupta Date: Thu, 7 May 2026 02:38:23 +0530 Subject: [PATCH 02/13] Move some repetetive code to macros and add more benchmark cases --- .../examples/multi-vector.json | 65 ++++- diskann-benchmark-multi-vector/src/lib.rs | 228 +++++++----------- 2 files changed, 141 insertions(+), 152 deletions(-) diff --git a/diskann-benchmark-multi-vector/examples/multi-vector.json b/diskann-benchmark-multi-vector/examples/multi-vector.json index 2626e5047..553a6a9d8 100644 --- a/diskann-benchmark-multi-vector/examples/multi-vector.json +++ b/diskann-benchmark-multi-vector/examples/multi-vector.json @@ -13,7 +13,7 @@ { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 }, { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 50, "num_measurements": 50 }, { "operation": "chamfer", "num_query_vectors": 32, 
"num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 }, - { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 20 }, + { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 50 }, { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 }, { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 32, "dim": 512, "loops_per_measurement": 50, "num_measurements": 50 }, @@ -23,7 +23,7 @@ { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 }, { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 50, "num_measurements": 50 }, { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 }, - { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 20 }, + { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 50 }, { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 }, { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 32, "dim": 512, "loops_per_measurement": 50, "num_measurements": 50 } ] @@ -35,10 +35,25 @@ "element_type": "float16", "implementation": "optimized", "runs": [ + { "operation": "chamfer", "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 }, { "operation": "chamfer", "num_query_vectors": 16, "num_doc_vectors": 64, "dim": 256, 
"loops_per_measurement": 100, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 20, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 50, "num_measurements": 50 }, { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 32, "dim": 512, "loops_per_measurement": 50, "num_measurements": 50 }, + + { "operation": "max_sim", "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 }, { "operation": "max_sim", "num_query_vectors": 16, "num_doc_vectors": 64, "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 }, - { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 } + { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 20, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 50, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 
1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 32, "dim": 512, "loops_per_measurement": 50, "num_measurements": 50 } ] } }, @@ -48,10 +63,25 @@ "element_type": "float32", "implementation": "reference", "runs": [ - { "operation": "chamfer", "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 50, "num_measurements": 50 }, - { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 2, "num_measurements": 50 }, - { "operation": "max_sim", "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 50, "num_measurements": 50 }, - { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 2, "num_measurements": 50 } + { "operation": "chamfer", "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 16, "num_doc_vectors": 64, "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 20, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 50, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 
10, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 32, "dim": 512, "loops_per_measurement": 50, "num_measurements": 50 }, + + { "operation": "max_sim", "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 16, "num_doc_vectors": 64, "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 20, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 50, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 32, "dim": 512, "loops_per_measurement": 50, "num_measurements": 50 } ] } }, @@ -61,8 +91,25 @@ "element_type": "float16", "implementation": "reference", "runs": [ - { "operation": "chamfer", "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 50, "num_measurements": 50 }, - { 
"operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 2, "num_measurements": 50 } + { "operation": "chamfer", "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 16, "num_doc_vectors": 64, "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 20, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 50, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 32, "dim": 512, "loops_per_measurement": 50, "num_measurements": 50 }, + + { "operation": "max_sim", "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 16, "num_doc_vectors": 64, "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 20, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 200, 
"num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 50, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 32, "dim": 512, "loops_per_measurement": 50, "num_measurements": 50 } ] } } diff --git a/diskann-benchmark-multi-vector/src/lib.rs b/diskann-benchmark-multi-vector/src/lib.rs index 7cadf4f29..ea6a09715 100644 --- a/diskann-benchmark-multi-vector/src/lib.rs +++ b/diskann-benchmark-multi-vector/src/lib.rs @@ -293,25 +293,19 @@ impl std::fmt::Display for CheckResult { //////////////////////////// fn register_benchmarks_impl(dispatcher: &mut diskann_benchmark_runner::registry::Benchmarks) { + macro_rules! register { + ($impl:ident, $t:ty, $tag:literal) => { + dispatcher.register_regression($tag, Kernel::<$impl, $t>::new()); + }; + } + // Optimized (architecture-dispatched QueryComputer). - dispatcher.register_regression( - "multi-vector-op-f32-optimized", - Kernel::::new(), - ); - dispatcher.register_regression( - "multi-vector-op-f16-optimized", - Kernel::::new(), - ); + register!(Optimized, f32, "multi-vector-op-f32-optimized"); + register!(Optimized, f16, "multi-vector-op-f16-optimized"); // Reference (Chamfer / MaxSim fallback path). 
- dispatcher.register_regression( - "multi-vector-op-f32-reference", - Kernel::::new(), - ); - dispatcher.register_regression( - "multi-vector-op-f16-reference", - Kernel::::new(), - ); + register!(Reference, f32, "multi-vector-op-f32-reference"); + register!(Reference, f16, "multi-vector-op-f16-reference"); } ////////////// @@ -340,81 +334,52 @@ impl Kernel { } #[derive(Debug, Error)] -#[error("implementation {0} is not registered for this benchmark")] +#[error("this kernel handles a different implementation than {0}")] pub(crate) struct ImplementationMismatch(Implementation); -impl DispatchRule for Optimized { - type Error = ImplementationMismatch; - - fn try_match(from: &Implementation) -> Result { - if *from == Implementation::Optimized { - Ok(MatchScore(0)) - } else { - Err(FailureScore(1)) - } - } - - fn convert(from: Implementation) -> Result { - if from == Implementation::Optimized { - Ok(Optimized) - } else { - Err(ImplementationMismatch(from)) - } - } +macro_rules! impl_dispatch_rule { + ($marker:ident, $variant:ident, $description:literal) => { + impl DispatchRule for $marker { + type Error = ImplementationMismatch; - fn description( - f: &mut std::fmt::Formatter<'_>, - from: Option<&Implementation>, - ) -> std::fmt::Result { - match from { - None => write!(f, "QueryComputer (architecture-dispatched)"), - Some(impl_) => { - if Self::try_match(impl_).is_ok() { - write!(f, "matched {}", impl_) + fn try_match(from: &Implementation) -> Result { + if *from == Implementation::$variant { + Ok(MatchScore(0)) } else { - write!(f, "expected {}, got {}", Implementation::Optimized, impl_) + Err(FailureScore(1)) } } - } - } -} - -impl DispatchRule for Reference { - type Error = ImplementationMismatch; - fn try_match(from: &Implementation) -> Result { - if *from == Implementation::Reference { - Ok(MatchScore(0)) - } else { - Err(FailureScore(1)) - } - } - - fn convert(from: Implementation) -> Result { - if from == Implementation::Reference { - Ok(Reference) - } else { 
- Err(ImplementationMismatch(from)) - } - } - - fn description( - f: &mut std::fmt::Formatter<'_>, - from: Option<&Implementation>, - ) -> std::fmt::Result { - match from { - None => write!(f, "Chamfer / MaxSim fallback"), - Some(impl_) => { - if Self::try_match(impl_).is_ok() { - write!(f, "matched {}", impl_) + fn convert(from: Implementation) -> Result { + if from == Implementation::$variant { + Ok($marker) } else { - write!(f, "expected {}, got {}", Implementation::Reference, impl_) + Err(ImplementationMismatch(from)) + } + } + + fn description( + f: &mut std::fmt::Formatter<'_>, + from: Option<&Implementation>, + ) -> std::fmt::Result { + match from { + None => write!(f, $description), + Some(impl_) => { + if Self::try_match(impl_).is_ok() { + write!(f, "matched {}", impl_) + } else { + write!(f, "expected {}, got {}", Implementation::$variant, impl_) + } + } } } } - } + }; } +impl_dispatch_rule!(Optimized, Optimized, "QueryComputer (architecture-dispatched)"); +impl_dispatch_rule!(Reference, Reference, "Chamfer / MaxSim fallback"); + impl Benchmark for Kernel where datatype::Type: DispatchRule, @@ -446,7 +411,9 @@ where _: diskann_benchmark_runner::Checkpoint<'_>, mut output: &mut dyn diskann_benchmark_runner::Output, ) -> anyhow::Result { - let _ = I::convert(input.implementation)?; + // The dispatcher only invokes `run` after `try_match` has already accepted + // the input, so a failure here would indicate a dispatcher bug. + I::convert(input.implementation).expect("try_match accepted the input"); writeln!(output, "{}", input)?; let results = self.run_benchmark(input)?; writeln!(output, "\n\n{}", DisplayWrapper(&*results))?; @@ -717,6 +684,9 @@ where let mut results = Vec::with_capacity(input.runs.len()); for run in input.runs.iter() { let data = Data::::new(run); + // `QueryComputer` performs query-side precomputation that is intentionally + // amortized across many `chamfer` / `max_sim` calls; construct it once per + // shape, outside the timed loop. 
let computer = as NewFromMatRef>::new_from(data.query(run)); let doc = data.doc(run); @@ -748,20 +718,23 @@ where let mut results = Vec::with_capacity(input.runs.len()); for run in input.runs.iter() { let data = Data::::new(run); - let query = data.query(run); let doc = data.doc(run); + // Hoist out of the timed loop to mirror the optimized path's + // per-shape precomputation. + let query: diskann_quantization::multi_vector::distance::QueryMatRef<'_, _> = + data.query(run).into(); let result = match run.operation { Operation::Chamfer => run_loops(run, || { - let v = Chamfer::evaluate(query.into(), doc); + let v = Chamfer::evaluate(query, doc); std::hint::black_box(v); }), Operation::MaxSim => { let mut scores = vec![0.0f32; run.num_query_vectors.get()]; + let mut max_sim = MaxSim::new(&mut scores).unwrap(); run_loops(run, || { - let mut max_sim = MaxSim::new(&mut scores).unwrap(); - let _ = max_sim.evaluate(query.into(), doc); - std::hint::black_box(&mut scores); + let _ = max_sim.evaluate(query, doc); + std::hint::black_box(max_sim.scores_mut()); }) } }; @@ -770,47 +743,42 @@ where Ok(results) } -impl RunBenchmark for Kernel { - fn run_benchmark(&self, input: &MultiVectorOp) -> Result, anyhow::Error> { - run_optimized::(input) - } -} - -impl RunBenchmark for Kernel { - fn run_benchmark(&self, input: &MultiVectorOp) -> Result, anyhow::Error> { - run_optimized::(input) - } -} - -impl RunBenchmark for Kernel { - fn run_benchmark(&self, input: &MultiVectorOp) -> Result, anyhow::Error> { - run_reference::(input) - } -} - -impl RunBenchmark for Kernel { - fn run_benchmark(&self, input: &MultiVectorOp) -> Result, anyhow::Error> { - run_reference::(input) - } -} - /// Element-type-erasing constructor for [`QueryComputer`]. trait NewFromMatRef { fn new_from(query: MatRef<'_, Standard>) -> QueryComputer; } -impl NewFromMatRef for QueryComputer { - fn new_from(query: MatRef<'_, Standard>) -> QueryComputer { - QueryComputer::::new(query) - } -} +macro_rules! 
impl_kernel_for { + ($t:ty) => { + impl NewFromMatRef<$t> for QueryComputer<$t> { + fn new_from(query: MatRef<'_, Standard<$t>>) -> QueryComputer<$t> { + QueryComputer::<$t>::new(query) + } + } -impl NewFromMatRef for QueryComputer { - fn new_from(query: MatRef<'_, Standard>) -> QueryComputer { - QueryComputer::::new(query) - } + impl RunBenchmark for Kernel { + fn run_benchmark( + &self, + input: &MultiVectorOp, + ) -> Result, anyhow::Error> { + run_optimized::<$t>(input) + } + } + + impl RunBenchmark for Kernel { + fn run_benchmark( + &self, + input: &MultiVectorOp, + ) -> Result, anyhow::Error> { + run_reference::<$t>(input) + } + } + }; } +impl_kernel_for!(f32); +impl_kernel_for!(f16); + /////////// // Tests // /////////// @@ -963,30 +931,4 @@ mod tests { assert!(matches!(result, PassFail::Fail(_))); } - - /// Sanity-check that the optimized kernel and the reference path produce - /// numerically equivalent Chamfer scores on a small fixture. - #[test] - fn optimized_chamfer_matches_reference_f32() { - let run = Run { - operation: Operation::Chamfer, - num_query_vectors: NonZeroUsize::new(5).unwrap(), - num_doc_vectors: NonZeroUsize::new(7).unwrap(), - dim: NonZeroUsize::new(16).unwrap(), - loops_per_measurement: NonZeroUsize::new(1).unwrap(), - num_measurements: NonZeroUsize::new(1).unwrap(), - }; - - let data = Data::::new(&run); - let query = data.query(&run); - let doc = data.doc(&run); - - let optimized = QueryComputer::::new(query).chamfer(doc); - let reference = Chamfer::evaluate(query.into(), doc); - - assert!( - (optimized - reference).abs() < 1e-4, - "optimized={optimized}, reference={reference}", - ); - } } From f3a5d9fb33cc2dbb0864c88f8bd90bbb65e26dca Mon Sep 17 00:00:00 2001 From: Suryansh Gupta Date: Thu, 7 May 2026 02:38:43 +0530 Subject: [PATCH 03/13] Move some repetetive code to macros and add more benchmark cases --- diskann-benchmark-multi-vector/src/lib.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git 
a/diskann-benchmark-multi-vector/src/lib.rs b/diskann-benchmark-multi-vector/src/lib.rs index ea6a09715..df08d93dd 100644 --- a/diskann-benchmark-multi-vector/src/lib.rs +++ b/diskann-benchmark-multi-vector/src/lib.rs @@ -377,7 +377,11 @@ macro_rules! impl_dispatch_rule { }; } -impl_dispatch_rule!(Optimized, Optimized, "QueryComputer (architecture-dispatched)"); +impl_dispatch_rule!( + Optimized, + Optimized, + "QueryComputer (architecture-dispatched)" +); impl_dispatch_rule!(Reference, Reference, "Chamfer / MaxSim fallback"); impl Benchmark for Kernel From 8efdbcd5e79bf48068dfc8e1b4d6a6cdfadd35f0 Mon Sep 17 00:00:00 2001 From: Suryansh Gupta Date: Thu, 7 May 2026 02:39:33 +0530 Subject: [PATCH 04/13] Move some repetetive code to macros and add more benchmark cases --- results.json | 2150 -------------------------------------------------- 1 file changed, 2150 deletions(-) delete mode 100644 results.json diff --git a/results.json b/results.json deleted file mode 100644 index f061f6750..000000000 --- a/results.json +++ /dev/null @@ -1,2150 +0,0 @@ -[ - { - "input": { - "content": { - "element_type": "float32", - "implementation": "optimized", - "runs": [ - { - "dim": 128, - "loops_per_measurement": 500, - "num_doc_vectors": 32, - "num_measurements": 50, - "num_query_vectors": 8, - "operation": "chamfer" - }, - { - "dim": 256, - "loops_per_measurement": 100, - "num_doc_vectors": 64, - "num_measurements": 50, - "num_query_vectors": 16, - "operation": "chamfer" - }, - { - "dim": 384, - "loops_per_measurement": 20, - "num_doc_vectors": 128, - "num_measurements": 50, - "num_query_vectors": 32, - "operation": "chamfer" - }, - { - "dim": 256, - "loops_per_measurement": 200, - "num_doc_vectors": 16, - "num_measurements": 50, - "num_query_vectors": 32, - "operation": "chamfer" - }, - { - "dim": 264, - "loops_per_measurement": 50, - "num_doc_vectors": 32, - "num_measurements": 50, - "num_query_vectors": 64, - "operation": "chamfer" - }, - { - "dim": 128, - 
"loops_per_measurement": 10, - "num_doc_vectors": 1250, - "num_measurements": 50, - "num_query_vectors": 32, - "operation": "chamfer" - }, - { - "dim": 512, - "loops_per_measurement": 2, - "num_doc_vectors": 1250, - "num_measurements": 20, - "num_query_vectors": 64, - "operation": "chamfer" - }, - { - "dim": 128, - "loops_per_measurement": 200, - "num_doc_vectors": 32, - "num_measurements": 50, - "num_query_vectors": 64, - "operation": "chamfer" - }, - { - "dim": 512, - "loops_per_measurement": 50, - "num_doc_vectors": 32, - "num_measurements": 50, - "num_query_vectors": 32, - "operation": "chamfer" - }, - { - "dim": 128, - "loops_per_measurement": 500, - "num_doc_vectors": 32, - "num_measurements": 50, - "num_query_vectors": 8, - "operation": "max_sim" - }, - { - "dim": 256, - "loops_per_measurement": 100, - "num_doc_vectors": 64, - "num_measurements": 50, - "num_query_vectors": 16, - "operation": "max_sim" - }, - { - "dim": 384, - "loops_per_measurement": 20, - "num_doc_vectors": 128, - "num_measurements": 50, - "num_query_vectors": 32, - "operation": "max_sim" - }, - { - "dim": 256, - "loops_per_measurement": 200, - "num_doc_vectors": 16, - "num_measurements": 50, - "num_query_vectors": 32, - "operation": "max_sim" - }, - { - "dim": 264, - "loops_per_measurement": 50, - "num_doc_vectors": 32, - "num_measurements": 50, - "num_query_vectors": 64, - "operation": "max_sim" - }, - { - "dim": 128, - "loops_per_measurement": 10, - "num_doc_vectors": 1250, - "num_measurements": 50, - "num_query_vectors": 32, - "operation": "max_sim" - }, - { - "dim": 512, - "loops_per_measurement": 2, - "num_doc_vectors": 1250, - "num_measurements": 20, - "num_query_vectors": 64, - "operation": "max_sim" - }, - { - "dim": 128, - "loops_per_measurement": 200, - "num_doc_vectors": 32, - "num_measurements": 50, - "num_query_vectors": 64, - "operation": "max_sim" - }, - { - "dim": 512, - "loops_per_measurement": 50, - "num_doc_vectors": 32, - "num_measurements": 50, - "num_query_vectors": 
32, - "operation": "max_sim" - } - ] - }, - "type": "multi-vector-op" - }, - "results": [ - { - "latencies": [ - 777, - 777, - 778, - 780, - 780, - 781, - 804, - 838, - 838, - 838, - 838, - 839, - 839, - 839, - 840, - 842, - 845, - 850, - 899, - 926, - 927, - 931, - 932, - 937, - 939, - 956, - 978, - 1034, - 1035, - 1036, - 1053, - 1064, - 1065, - 1147, - 1164, - 1165, - 1165, - 1166, - 1173, - 1221, - 1323, - 1333, - 1350, - 1352, - 1353, - 1353, - 1357, - 1393, - 1529, - 1537 - ], - "percentiles": { - "mean": 1030.32, - "median": 947.5, - "minimum": 777, - "p90": 1353, - "p99": 1537 - }, - "run": { - "dim": 128, - "loops_per_measurement": 500, - "num_doc_vectors": 32, - "num_measurements": 50, - "num_query_vectors": 8, - "operation": "chamfer" - } - }, - { - "latencies": [ - 1029, - 1029, - 1030, - 1030, - 1030, - 1030, - 1030, - 1031, - 1032, - 1034, - 1035, - 1038, - 1050, - 1058, - 1070, - 1112, - 1112, - 1112, - 1112, - 1112, - 1112, - 1112, - 1113, - 1117, - 1119, - 1120, - 1123, - 1145, - 1146, - 1146, - 1146, - 1148, - 1152, - 1167, - 1192, - 1192, - 1192, - 1192, - 1193, - 1207, - 1235, - 1251, - 1254, - 1256, - 1257, - 1261, - 1293, - 1330, - 1330, - 1344 - ], - "percentiles": { - "mean": 1139.22, - "median": 1119.5, - "minimum": 1029, - "p90": 1261, - "p99": 1344 - }, - "run": { - "dim": 256, - "loops_per_measurement": 100, - "num_doc_vectors": 64, - "num_measurements": 50, - "num_query_vectors": 16, - "operation": "chamfer" - } - }, - { - "latencies": [ - 1210, - 1210, - 1210, - 1210, - 1210, - 1210, - 1210, - 1210, - 1210, - 1211, - 1212, - 1212, - 1212, - 1212, - 1213, - 1213, - 1213, - 1213, - 1213, - 1214, - 1217, - 1217, - 1220, - 1223, - 1225, - 1226, - 1227, - 1229, - 1231, - 1235, - 1235, - 1239, - 1239, - 1240, - 1244, - 1249, - 1252, - 1259, - 1264, - 1270, - 1281, - 1294, - 1299, - 1306, - 1312, - 1315, - 1332, - 1341, - 1383, - 1484 - ], - "percentiles": { - "mean": 1246.32, - "median": 1225.5, - "minimum": 1210, - "p90": 1315, - "p99": 
1484 - }, - "run": { - "dim": 384, - "loops_per_measurement": 20, - "num_doc_vectors": 128, - "num_measurements": 50, - "num_query_vectors": 32, - "operation": "chamfer" - } - }, - { - "latencies": [ - 958, - 958, - 958, - 958, - 958, - 960, - 960, - 960, - 961, - 961, - 961, - 961, - 961, - 961, - 961, - 961, - 961, - 961, - 961, - 961, - 961, - 961, - 961, - 961, - 961, - 961, - 961, - 962, - 962, - 963, - 964, - 964, - 965, - 965, - 965, - 966, - 966, - 973, - 974, - 974, - 981, - 981, - 983, - 985, - 987, - 987, - 987, - 990, - 999, - 999 - ], - "percentiles": { - "mean": 967.42, - "median": 961.0, - "minimum": 958, - "p90": 987, - "p99": 999 - }, - "run": { - "dim": 256, - "loops_per_measurement": 200, - "num_doc_vectors": 16, - "num_measurements": 50, - "num_query_vectors": 32, - "operation": "chamfer" - } - }, - { - "latencies": [ - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1018, - 1018, - 1018, - 1018, - 1018, - 1018, - 1018, - 1018, - 1018, - 1019, - 1019, - 1019, - 1019, - 1020, - 1020, - 1020, - 1020, - 1020, - 1020, - 1021, - 1022, - 1023, - 1023, - 1026, - 1029, - 1031, - 1032, - 1033, - 1034, - 1035, - 1036, - 1037, - 1041, - 1044, - 1044, - 1045, - 1046, - 1065 - ], - "percentiles": { - "mean": 1024.58, - "median": 1019.5, - "minimum": 1017, - "p90": 1044, - "p99": 1065 - }, - "run": { - "dim": 264, - "loops_per_measurement": 50, - "num_doc_vectors": 32, - "num_measurements": 50, - "num_query_vectors": 64, - "operation": "chamfer" - } - }, - { - "latencies": [ - 1854, - 1855, - 1855, - 1855, - 1855, - 1855, - 1856, - 1856, - 1856, - 1857, - 1857, - 1857, - 1857, - 1857, - 1857, - 1858, - 1858, - 1858, - 1858, - 1858, - 1858, - 1858, - 1859, - 1860, - 1861, - 1861, - 1863, - 1866, - 1869, - 1870, - 1871, - 1871, - 1871, - 1872, - 1874, - 1875, - 1881, - 1883, - 1885, - 1885, - 1890, - 1892, - 1892, - 1892, - 1892, - 1899, - 1906, - 1909, - 1909, - 1916 - ], - "percentiles": { - "mean": 1870.38, - 
"median": 1861.0, - "minimum": 1854, - "p90": 1899, - "p99": 1916 - }, - "run": { - "dim": 128, - "loops_per_measurement": 10, - "num_doc_vectors": 1250, - "num_measurements": 50, - "num_query_vectors": 32, - "operation": "chamfer" - } - }, - { - "latencies": [ - 3180, - 3180, - 3180, - 3180, - 3180, - 3181, - 3181, - 3181, - 3181, - 3183, - 3185, - 3187, - 3205, - 3206, - 3207, - 3208, - 3211, - 3218, - 3220, - 3268 - ], - "percentiles": { - "mean": 3196.1, - "median": 3184.0, - "minimum": 3180, - "p90": 3220, - "p99": 3268 - }, - "run": { - "dim": 512, - "loops_per_measurement": 2, - "num_doc_vectors": 1250, - "num_measurements": 20, - "num_query_vectors": 64, - "operation": "chamfer" - } - }, - { - "latencies": [ - 1784, - 1784, - 1784, - 1784, - 1784, - 1784, - 1785, - 1785, - 1790, - 1791, - 1791, - 1792, - 1792, - 1792, - 1792, - 1792, - 1792, - 1792, - 1795, - 1795, - 1796, - 1796, - 1796, - 1796, - 1798, - 1800, - 1803, - 1805, - 1814, - 1815, - 1817, - 1818, - 1821, - 1826, - 1840, - 1845, - 1856, - 1858, - 1878, - 1879, - 1879, - 1884, - 1888, - 1890, - 1893, - 1905, - 1907, - 1912, - 1918, - 1950 - ], - "percentiles": { - "mean": 1825.26, - "median": 1799.0, - "minimum": 1784, - "p90": 1905, - "p99": 1950 - }, - "run": { - "dim": 128, - "loops_per_measurement": 200, - "num_doc_vectors": 32, - "num_measurements": 50, - "num_query_vectors": 64, - "operation": "chamfer" - } - }, - { - "latencies": [ - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1018, - 1018, - 1018, - 1018, - 1018, - 1019, - 1020, - 1020, - 1021, - 1021, - 1022, - 1022, - 1022, - 1023, - 1027, - 1030, - 1030, - 1035, - 1043, - 1043, - 1044, - 1045, - 1049, - 1049, - 1060 - ], - "percentiles": { - "mean": 1023.2, - "median": 1017.5, - "minimum": 1017, - "p90": 1044, - "p99": 1060 - }, - "run": { - "dim": 512, - "loops_per_measurement": 
50, - "num_doc_vectors": 32, - "num_measurements": 50, - "num_query_vectors": 32, - "operation": "chamfer" - } - }, - { - "latencies": [ - 567, - 567, - 567, - 567, - 567, - 567, - 567, - 567, - 567, - 567, - 567, - 567, - 567, - 567, - 567, - 569, - 569, - 569, - 569, - 569, - 569, - 570, - 570, - 570, - 570, - 570, - 570, - 570, - 570, - 570, - 571, - 571, - 571, - 571, - 571, - 571, - 571, - 571, - 571, - 571, - 571, - 571, - 571, - 571, - 574, - 578, - 578, - 594, - 595, - 598 - ], - "percentiles": { - "mean": 571.2, - "median": 570.0, - "minimum": 567, - "p90": 578, - "p99": 598 - }, - "run": { - "dim": 128, - "loops_per_measurement": 500, - "num_doc_vectors": 32, - "num_measurements": 50, - "num_query_vectors": 8, - "operation": "max_sim" - } - }, - { - "latencies": [ - 988, - 988, - 988, - 988, - 988, - 988, - 988, - 988, - 988, - 989, - 989, - 989, - 989, - 989, - 989, - 989, - 989, - 989, - 991, - 991, - 991, - 991, - 991, - 991, - 991, - 991, - 991, - 991, - 991, - 991, - 991, - 991, - 992, - 992, - 992, - 992, - 992, - 992, - 992, - 992, - 992, - 996, - 996, - 1004, - 1009, - 1013, - 1018, - 1020, - 1047, - 1057 - ], - "percentiles": { - "mean": 995.1, - "median": 991.0, - "minimum": 988, - "p90": 1013, - "p99": 1057 - }, - "run": { - "dim": 256, - "loops_per_measurement": 100, - "num_doc_vectors": 64, - "num_measurements": 50, - "num_query_vectors": 16, - "operation": "max_sim" - } - }, - { - "latencies": [ - 1210, - 1210, - 1210, - 1210, - 1210, - 1210, - 1210, - 1210, - 1211, - 1211, - 1211, - 1212, - 1213, - 1213, - 1213, - 1213, - 1213, - 1213, - 1213, - 1213, - 1213, - 1213, - 1214, - 1214, - 1214, - 1214, - 1214, - 1214, - 1214, - 1214, - 1214, - 1214, - 1216, - 1217, - 1217, - 1217, - 1218, - 1220, - 1222, - 1223, - 1224, - 1224, - 1225, - 1227, - 1238, - 1239, - 1239, - 1241, - 1242, - 1243 - ], - "percentiles": { - "mean": 1217.74, - "median": 1214.0, - "minimum": 1210, - "p90": 1239, - "p99": 1243 - }, - "run": { - "dim": 384, - 
"loops_per_measurement": 20, - "num_doc_vectors": 128, - "num_measurements": 50, - "num_query_vectors": 32, - "operation": "max_sim" - } - }, - { - "latencies": [ - 953, - 953, - 953, - 953, - 953, - 953, - 954, - 954, - 956, - 957, - 957, - 957, - 957, - 957, - 957, - 957, - 957, - 957, - 957, - 957, - 957, - 957, - 957, - 957, - 957, - 957, - 957, - 957, - 958, - 958, - 958, - 958, - 958, - 958, - 960, - 961, - 961, - 961, - 961, - 961, - 961, - 961, - 962, - 963, - 971, - 976, - 978, - 984, - 984, - 987 - ], - "percentiles": { - "mean": 960.1, - "median": 957.0, - "minimum": 953, - "p90": 976, - "p99": 987 - }, - "run": { - "dim": 256, - "loops_per_measurement": 200, - "num_doc_vectors": 16, - "num_measurements": 50, - "num_query_vectors": 32, - "operation": "max_sim" - } - }, - { - "latencies": [ - 1016, - 1016, - 1016, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1018, - 1018, - 1018, - 1018, - 1018, - 1018, - 1018, - 1018, - 1019, - 1019, - 1019, - 1019, - 1019, - 1019, - 1019, - 1021, - 1021, - 1023, - 1023, - 1025, - 1032, - 1044, - 1045, - 1045, - 1045, - 1047, - 1052, - 1058, - 1061 - ], - "percentiles": { - "mean": 1023.46, - "median": 1018.0, - "minimum": 1016, - "p90": 1045, - "p99": 1061 - }, - "run": { - "dim": 264, - "loops_per_measurement": 50, - "num_doc_vectors": 32, - "num_measurements": 50, - "num_query_vectors": 64, - "operation": "max_sim" - } - }, - { - "latencies": [ - 1858, - 1858, - 1860, - 1860, - 1860, - 1860, - 1860, - 1860, - 1860, - 1860, - 1860, - 1861, - 1861, - 1861, - 1861, - 1861, - 1861, - 1861, - 1862, - 1863, - 1863, - 1864, - 1865, - 1867, - 1868, - 1872, - 1873, - 1876, - 1878, - 1881, - 1882, - 1883, - 1888, - 1889, - 1889, - 1890, - 1890, - 1890, - 1891, - 1892, - 1905, - 1906, - 1908, - 1934, - 1962, - 1967, - 1974, - 1988, - 2004, - 2014 - ], - "percentiles": { - "mean": 1887.22, - "median": 1870.0, - "minimum": 1858, - 
"p90": 1967, - "p99": 2014 - }, - "run": { - "dim": 128, - "loops_per_measurement": 10, - "num_doc_vectors": 1250, - "num_measurements": 50, - "num_query_vectors": 32, - "operation": "max_sim" - } - }, - { - "latencies": [ - 3177, - 3177, - 3177, - 3179, - 3192, - 3201, - 3212, - 3222, - 3251, - 3251, - 3255, - 3256, - 3256, - 3321, - 3381, - 3399, - 3400, - 3419, - 3422, - 3445 - ], - "percentiles": { - "mean": 3279.65, - "median": 3253.0, - "minimum": 3177, - "p90": 3422, - "p99": 3445 - }, - "run": { - "dim": 512, - "loops_per_measurement": 2, - "num_doc_vectors": 1250, - "num_measurements": 20, - "num_query_vectors": 64, - "operation": "max_sim" - } - }, - { - "latencies": [ - 1783, - 1784, - 1787, - 1791, - 1791, - 1791, - 1813, - 1838, - 1853, - 1868, - 1871, - 1882, - 1882, - 1884, - 1890, - 1899, - 1899, - 1899, - 1900, - 1901, - 1905, - 1906, - 1908, - 1909, - 1911, - 1911, - 1911, - 1911, - 1914, - 1915, - 1915, - 1916, - 1916, - 1917, - 1919, - 1922, - 1922, - 1923, - 1923, - 1925, - 1927, - 1927, - 1928, - 1929, - 1929, - 1933, - 1937, - 1938, - 1940, - 1983 - ], - "percentiles": { - "mean": 1893.52, - "median": 1911.0, - "minimum": 1783, - "p90": 1933, - "p99": 1983 - }, - "run": { - "dim": 128, - "loops_per_measurement": 200, - "num_doc_vectors": 32, - "num_measurements": 50, - "num_query_vectors": 64, - "operation": "max_sim" - } - }, - { - "latencies": [ - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1020, - 1023, - 1023, - 1025, - 1028, - 1033, - 1033, - 1034, - 1037, - 1038, - 1040, - 1043, - 1044, - 1052, - 1052, - 1057, - 1060, - 1063, - 1078, - 1088, - 1088, - 1088, - 1088, - 1088, - 1088, - 1088, - 1088, - 1090, - 1090, - 1090, - 1092, - 1093, - 1093, - 1094, - 1094 - ], - "percentiles": { - "mean": 1049.56, - "median": 1039.0, - "minimum": 1017, - "p90": 1092, - "p99": 1094 - }, - "run": { - "dim": 512, - "loops_per_measurement": 50, - "num_doc_vectors": 32, - 
"num_measurements": 50, - "num_query_vectors": 32, - "operation": "max_sim" - } - } - ] - }, - { - "input": { - "content": { - "element_type": "float16", - "implementation": "optimized", - "runs": [ - { - "dim": 256, - "loops_per_measurement": 100, - "num_doc_vectors": 64, - "num_measurements": 50, - "num_query_vectors": 16, - "operation": "chamfer" - }, - { - "dim": 128, - "loops_per_measurement": 10, - "num_doc_vectors": 1250, - "num_measurements": 50, - "num_query_vectors": 32, - "operation": "chamfer" - }, - { - "dim": 256, - "loops_per_measurement": 100, - "num_doc_vectors": 64, - "num_measurements": 50, - "num_query_vectors": 16, - "operation": "max_sim" - }, - { - "dim": 128, - "loops_per_measurement": 10, - "num_doc_vectors": 1250, - "num_measurements": 50, - "num_query_vectors": 32, - "operation": "max_sim" - } - ] - }, - "type": "multi-vector-op" - }, - "results": [ - { - "latencies": [ - 1734, - 1734, - 1736, - 1736, - 1737, - 1737, - 1737, - 1738, - 1738, - 1738, - 1738, - 1739, - 1740, - 1740, - 1741, - 1744, - 1744, - 1751, - 1751, - 1753, - 1754, - 1754, - 1756, - 1759, - 1761, - 1764, - 1767, - 1767, - 1767, - 1768, - 1768, - 1769, - 1769, - 1773, - 1774, - 1775, - 1779, - 1787, - 1794, - 1808, - 1822, - 1825, - 1829, - 1829, - 1844, - 1846, - 1852, - 1859, - 1903, - 2194 - ], - "percentiles": { - "mean": 1780.44, - "median": 1762.5, - "minimum": 1734, - "p90": 1846, - "p99": 2194 - }, - "run": { - "dim": 256, - "loops_per_measurement": 100, - "num_doc_vectors": 64, - "num_measurements": 50, - "num_query_vectors": 16, - "operation": "chamfer" - } - }, - { - "latencies": [ - 2130, - 2130, - 2130, - 2131, - 2133, - 2133, - 2140, - 2142, - 2149, - 2151, - 2158, - 2160, - 2163, - 2164, - 2166, - 2167, - 2167, - 2168, - 2171, - 2173, - 2174, - 2176, - 2177, - 2178, - 2178, - 2181, - 2184, - 2189, - 2195, - 2195, - 2197, - 2198, - 2198, - 2201, - 2203, - 2207, - 2215, - 2217, - 2220, - 2229, - 2240, - 2242, - 2243, - 2249, - 2250, - 2291, - 2305, - 2438, 
- 2613, - 2643 - ], - "percentiles": { - "mean": 2209.04, - "median": 2179.5, - "minimum": 2130, - "p90": 2291, - "p99": 2643 - }, - "run": { - "dim": 128, - "loops_per_measurement": 10, - "num_doc_vectors": 1250, - "num_measurements": 50, - "num_query_vectors": 32, - "operation": "chamfer" - } - }, - { - "latencies": [ - 1731, - 1733, - 1737, - 1737, - 1737, - 1741, - 1741, - 1745, - 1745, - 1750, - 1750, - 1750, - 1750, - 1751, - 1754, - 1754, - 1755, - 1758, - 1758, - 1759, - 1761, - 1761, - 1766, - 1768, - 1770, - 1771, - 1771, - 1772, - 1773, - 1773, - 1775, - 1776, - 1776, - 1778, - 1785, - 1788, - 1789, - 1791, - 1795, - 1800, - 1804, - 1808, - 1814, - 1822, - 1832, - 1833, - 1834, - 1864, - 1867, - 1869 - ], - "percentiles": { - "mean": 1776.44, - "median": 1770.5, - "minimum": 1731, - "p90": 1833, - "p99": 1869 - }, - "run": { - "dim": 256, - "loops_per_measurement": 100, - "num_doc_vectors": 64, - "num_measurements": 50, - "num_query_vectors": 16, - "operation": "max_sim" - } - }, - { - "latencies": [ - 2127, - 2127, - 2129, - 2130, - 2132, - 2141, - 2142, - 2142, - 2147, - 2148, - 2149, - 2150, - 2154, - 2154, - 2159, - 2162, - 2166, - 2168, - 2170, - 2173, - 2177, - 2180, - 2180, - 2181, - 2181, - 2182, - 2183, - 2187, - 2196, - 2196, - 2199, - 2200, - 2204, - 2211, - 2213, - 2216, - 2224, - 2255, - 2256, - 2271, - 2354, - 2488, - 2493, - 2495, - 2498, - 2505, - 2525, - 2653, - 2657, - 3515 - ], - "percentiles": { - "mean": 2264.9, - "median": 2181.5, - "minimum": 2127, - "p90": 2505, - "p99": 3515 - }, - "run": { - "dim": 128, - "loops_per_measurement": 10, - "num_doc_vectors": 1250, - "num_measurements": 50, - "num_query_vectors": 32, - "operation": "max_sim" - } - } - ] - }, - { - "input": { - "content": { - "element_type": "float32", - "implementation": "reference", - "runs": [ - { - "dim": 128, - "loops_per_measurement": 50, - "num_doc_vectors": 32, - "num_measurements": 50, - "num_query_vectors": 8, - "operation": "chamfer" - }, - { - "dim": 384, 
- "loops_per_measurement": 2, - "num_doc_vectors": 128, - "num_measurements": 50, - "num_query_vectors": 32, - "operation": "chamfer" - }, - { - "dim": 128, - "loops_per_measurement": 50, - "num_doc_vectors": 32, - "num_measurements": 50, - "num_query_vectors": 8, - "operation": "max_sim" - }, - { - "dim": 384, - "loops_per_measurement": 2, - "num_doc_vectors": 128, - "num_measurements": 50, - "num_query_vectors": 32, - "operation": "max_sim" - } - ] - }, - "type": "multi-vector-op" - }, - "results": [ - { - "latencies": [ - 64, - 64, - 64, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 67, - 67, - 67, - 67, - 67, - 67, - 67, - 67, - 67, - 68, - 68, - 69, - 71, - 127 - ], - "percentiles": { - "mean": 67.52, - "median": 66.0, - "minimum": 64, - "p90": 68, - "p99": 127 - }, - "run": { - "dim": 128, - "loops_per_measurement": 50, - "num_doc_vectors": 32, - "num_measurements": 50, - "num_query_vectors": 8, - "operation": "chamfer" - } - }, - { - "latencies": [ - 130, - 130, - 130, - 130, - 130, - 130, - 130, - 130, - 130, - 130, - 131, - 131, - 131, - 131, - 132, - 132, - 133, - 133, - 135, - 136, - 136, - 137, - 138, - 138, - 138, - 138, - 138, - 138, - 138, - 138, - 138, - 138, - 138, - 138, - 138, - 138, - 138, - 138, - 139, - 139, - 139, - 139, - 139, - 140, - 140, - 140, - 141, - 143, - 147, - 161 - ], - "percentiles": { - "mean": 136.26, - "median": 138.0, - "minimum": 130, - "p90": 140, - "p99": 161 - }, - "run": { - "dim": 384, - "loops_per_measurement": 2, - "num_doc_vectors": 128, - "num_measurements": 50, - "num_query_vectors": 32, - "operation": "chamfer" - } - }, - { - "latencies": [ - 62, - 62, - 62, - 62, - 62, - 63, - 63, - 63, - 63, - 63, - 63, - 63, - 63, - 63, - 63, - 64, - 64, - 65, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 67, - 
67, - 67, - 67, - 67, - 67, - 67, - 68, - 68, - 69, - 71, - 72, - 78, - 106 - ], - "percentiles": { - "mean": 66.44, - "median": 66.0, - "minimum": 62, - "p90": 69, - "p99": 106 - }, - "run": { - "dim": 128, - "loops_per_measurement": 50, - "num_doc_vectors": 32, - "num_measurements": 50, - "num_query_vectors": 8, - "operation": "max_sim" - } - }, - { - "latencies": [ - 130, - 131, - 131, - 131, - 131, - 131, - 131, - 131, - 131, - 131, - 131, - 131, - 131, - 131, - 131, - 131, - 131, - 131, - 131, - 131, - 131, - 131, - 131, - 132, - 132, - 132, - 132, - 132, - 133, - 133, - 134, - 134, - 135, - 135, - 135, - 136, - 136, - 137, - 139, - 139, - 140, - 142, - 142, - 143, - 144, - 145, - 145, - 147, - 155, - 158 - ], - "percentiles": { - "mean": 135.18, - "median": 132.0, - "minimum": 130, - "p90": 145, - "p99": 158 - }, - "run": { - "dim": 384, - "loops_per_measurement": 2, - "num_doc_vectors": 128, - "num_measurements": 50, - "num_query_vectors": 32, - "operation": "max_sim" - } - } - ] - }, - { - "input": { - "content": { - "element_type": "float16", - "implementation": "reference", - "runs": [ - { - "dim": 128, - "loops_per_measurement": 50, - "num_doc_vectors": 32, - "num_measurements": 50, - "num_query_vectors": 8, - "operation": "chamfer" - }, - { - "dim": 384, - "loops_per_measurement": 2, - "num_doc_vectors": 128, - "num_measurements": 50, - "num_query_vectors": 32, - "operation": "max_sim" - } - ] - }, - "type": "multi-vector-op" - }, - "results": [ - { - "latencies": [ - 73, - 73, - 73, - 73, - 73, - 73, - 73, - 73, - 73, - 73, - 73, - 73, - 73, - 73, - 73, - 73, - 73, - 74, - 74, - 74, - 74, - 74, - 74, - 75, - 75, - 76, - 76, - 76, - 76, - 76, - 76, - 77, - 77, - 77, - 77, - 77, - 77, - 77, - 77, - 77, - 78, - 78, - 78, - 79, - 80, - 80, - 80, - 84, - 87, - 92 - ], - "percentiles": { - "mean": 76.0, - "median": 75.5, - "minimum": 73, - "p90": 80, - "p99": 92 - }, - "run": { - "dim": 128, - "loops_per_measurement": 50, - "num_doc_vectors": 32, - 
"num_measurements": 50, - "num_query_vectors": 8, - "operation": "chamfer" - } - }, - { - "latencies": [ - 135, - 135, - 135, - 135, - 135, - 135, - 135, - 135, - 136, - 136, - 137, - 138, - 140, - 141, - 141, - 141, - 141, - 141, - 142, - 142, - 142, - 142, - 142, - 142, - 142, - 142, - 142, - 142, - 142, - 142, - 142, - 142, - 142, - 142, - 143, - 143, - 143, - 144, - 144, - 145, - 145, - 145, - 147, - 150, - 151, - 151, - 153, - 154, - 158, - 158 - ], - "percentiles": { - "mean": 142.36, - "median": 142.0, - "minimum": 135, - "p90": 151, - "p99": 158 - }, - "run": { - "dim": 384, - "loops_per_measurement": 2, - "num_doc_vectors": 128, - "num_measurements": 50, - "num_query_vectors": 32, - "operation": "max_sim" - } - } - ] - } -] \ No newline at end of file From 3a89c3750bef66322e5e5c3f90e91d43e941a74b Mon Sep 17 00:00:00 2001 From: Suryansh Gupta Date: Thu, 7 May 2026 12:09:00 +0530 Subject: [PATCH 05/13] Add Cargo.lock --- Cargo.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index c7b68684e..e179d3320 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -699,7 +699,7 @@ dependencies = [ [[package]] name = "diskann-benchmark-multi-vector" -version = "0.50.1" +version = "0.51.0" dependencies = [ "anyhow", "diskann-benchmark-runner", From 96d17b30378159ffdfc04b5afbeb0bf225992de2 Mon Sep 17 00:00:00 2001 From: Suryansh Gupta Date: Fri, 8 May 2026 01:48:37 +0530 Subject: [PATCH 06/13] Remove unused scalar benchmark config file --- .../graph_index_scalar_oai_large.json | 115 ------------------ 1 file changed, 115 deletions(-) delete mode 100644 diskann-benchmark/perf_test_inputs/graph_index_scalar_oai_large.json diff --git a/diskann-benchmark/perf_test_inputs/graph_index_scalar_oai_large.json b/diskann-benchmark/perf_test_inputs/graph_index_scalar_oai_large.json deleted file mode 100644 index 09752477a..000000000 --- a/diskann-benchmark/perf_test_inputs/graph_index_scalar_oai_large.json +++ /dev/null @@ -1,115 +0,0 @@ -{ - 
"search_directories": [ - "/mnt/nvme/s" - ], - "jobs": [ - { - "type": "graph-index-build-sq", - "content": { - "build": { - "data_type": "float16", - "data": "SentenceChunk_OAILarge_1M_normalized_1000000.bin", - "distance": "squared_l2", - "max_degree": 32, - "l_build": 100, - "alpha": 1.2, - "backedge_ratio": 1.0, - "num_threads": 8, - "multi_insert":null, - "search_phase": { - "queries": "SentenceChunk_OAILarge_query_normalized_6809.bin", - "groundtruth": "SentenceChunk-1M-gt-6k-recall-at2000", - "reps": 2, - "num_threads": [ - 8 - ], - "runs": [ - { - "search_n": 10, - "search_l": [ - 50 - ], - "recall_k": 10 - } - ] - } - }, - "num_bits": 1, - "standard_deviations": 2, - "use_fp_for_search": true - } - }, - { - "type": "graph-index-build-sq", - "content": { - "build": { - "data_type": "float16", - "data": "SentenceChunk_OAILarge_1M_normalized_1000000.bin", - "distance": "squared_l2", - "max_degree": 32, - "l_build": 100, - "alpha": 1.2, - "backedge_ratio": 1.0, - "num_threads": 8, - "multi_insert":null, - "search_phase": { - "queries": "SentenceChunk_OAILarge_query_normalized_6809.bin", - "groundtruth": "SentenceChunk-1M-gt-6k-recall-at2000", - "reps": 2, - "num_threads": [ - 8 - ], - "runs": [ - { - "search_n": 10, - "search_l": [ - 50 - ], - "recall_k": 10 - } - ] - } - }, - "num_bits": 4, - "standard_deviations": 2, - "use_fp_for_search": true - } - }, - { - "type": "graph-index-build-sq", - "content": { - "build": { - "data_type": "float16", - "data": "SentenceChunk_OAILarge_1M_normalized_1000000.bin", - "distance": "squared_l2", - "max_degree": 32, - "l_build": 100, - "alpha": 1.2, - "backedge_ratio": 1.0, - "num_threads": 8, - "multi_insert":null, - "search_phase": { - "queries": "SentenceChunk_OAILarge_query_normalized_6809.bin", - "groundtruth": "SentenceChunk-1M-gt-6k-recall-at2000", - "reps": 1, - "num_threads": [ - 8 - ], - "runs": [ - { - "search_n": 10, - "search_l": [ - 50 - ], - "recall_k": 10 - } - ] - } - }, - "num_bits": 8, - 
"standard_deviations": 2, - "use_fp_for_search": true - } - } - ] - } \ No newline at end of file From 6b33719c7b082fb6142d0b372c0c227c501fdc8c Mon Sep 17 00:00:00 2001 From: Suryansh Gupta Date: Fri, 8 May 2026 01:51:13 +0530 Subject: [PATCH 07/13] Revert "Remove unused scalar benchmark config file" This reverts commit 96d17b30378159ffdfc04b5afbeb0bf225992de2. --- .../graph_index_scalar_oai_large.json | 115 ++++++++++++++++++ 1 file changed, 115 insertions(+) create mode 100644 diskann-benchmark/perf_test_inputs/graph_index_scalar_oai_large.json diff --git a/diskann-benchmark/perf_test_inputs/graph_index_scalar_oai_large.json b/diskann-benchmark/perf_test_inputs/graph_index_scalar_oai_large.json new file mode 100644 index 000000000..09752477a --- /dev/null +++ b/diskann-benchmark/perf_test_inputs/graph_index_scalar_oai_large.json @@ -0,0 +1,115 @@ +{ + "search_directories": [ + "/mnt/nvme/s" + ], + "jobs": [ + { + "type": "graph-index-build-sq", + "content": { + "build": { + "data_type": "float16", + "data": "SentenceChunk_OAILarge_1M_normalized_1000000.bin", + "distance": "squared_l2", + "max_degree": 32, + "l_build": 100, + "alpha": 1.2, + "backedge_ratio": 1.0, + "num_threads": 8, + "multi_insert":null, + "search_phase": { + "queries": "SentenceChunk_OAILarge_query_normalized_6809.bin", + "groundtruth": "SentenceChunk-1M-gt-6k-recall-at2000", + "reps": 2, + "num_threads": [ + 8 + ], + "runs": [ + { + "search_n": 10, + "search_l": [ + 50 + ], + "recall_k": 10 + } + ] + } + }, + "num_bits": 1, + "standard_deviations": 2, + "use_fp_for_search": true + } + }, + { + "type": "graph-index-build-sq", + "content": { + "build": { + "data_type": "float16", + "data": "SentenceChunk_OAILarge_1M_normalized_1000000.bin", + "distance": "squared_l2", + "max_degree": 32, + "l_build": 100, + "alpha": 1.2, + "backedge_ratio": 1.0, + "num_threads": 8, + "multi_insert":null, + "search_phase": { + "queries": "SentenceChunk_OAILarge_query_normalized_6809.bin", + "groundtruth": 
"SentenceChunk-1M-gt-6k-recall-at2000", + "reps": 2, + "num_threads": [ + 8 + ], + "runs": [ + { + "search_n": 10, + "search_l": [ + 50 + ], + "recall_k": 10 + } + ] + } + }, + "num_bits": 4, + "standard_deviations": 2, + "use_fp_for_search": true + } + }, + { + "type": "graph-index-build-sq", + "content": { + "build": { + "data_type": "float16", + "data": "SentenceChunk_OAILarge_1M_normalized_1000000.bin", + "distance": "squared_l2", + "max_degree": 32, + "l_build": 100, + "alpha": 1.2, + "backedge_ratio": 1.0, + "num_threads": 8, + "multi_insert":null, + "search_phase": { + "queries": "SentenceChunk_OAILarge_query_normalized_6809.bin", + "groundtruth": "SentenceChunk-1M-gt-6k-recall-at2000", + "reps": 1, + "num_threads": [ + 8 + ], + "runs": [ + { + "search_n": 10, + "search_l": [ + 50 + ], + "recall_k": 10 + } + ] + } + }, + "num_bits": 8, + "standard_deviations": 2, + "use_fp_for_search": true + } + } + ] + } \ No newline at end of file From d06df7ee59ebb69009837dbdbc5bcd8bcaedfc84 Mon Sep 17 00:00:00 2001 From: Suryansh Gupta Date: Tue, 12 May 2026 20:47:04 +0530 Subject: [PATCH 08/13] Fold the new crate to existing diskann-benchmark crate --- Cargo.lock | 17 - Cargo.toml | 1 - diskann-benchmark-multi-vector/Cargo.toml | 30 - diskann-benchmark-multi-vector/README.md | 136 --- diskann-benchmark-multi-vector/src/bin.rs | 96 -- diskann-benchmark-multi-vector/src/lib.rs | 938 ------------------ diskann-benchmark/Cargo.toml | 3 + .../example/multi-vector-test.json | 0 .../example}/multi-vector.json | 0 .../multi-vector-tolerance.json | 0 diskann-benchmark/src/backend/mod.rs | 2 + diskann-benchmark/src/backend/multi_vector.rs | 806 +++++++++++++++ diskann-benchmark/src/inputs/mod.rs | 2 + diskann-benchmark/src/inputs/multi_vector.rs | 190 ++++ diskann-benchmark/src/main.rs | 86 ++ .../src/multi_vector/matrix.rs | 44 + diskann-quantization/src/multi_vector/mod.rs | 4 +- 17 files changed, 1135 insertions(+), 1220 deletions(-) delete mode 100644 
diskann-benchmark-multi-vector/Cargo.toml delete mode 100644 diskann-benchmark-multi-vector/README.md delete mode 100644 diskann-benchmark-multi-vector/src/bin.rs delete mode 100644 diskann-benchmark-multi-vector/src/lib.rs rename diskann-benchmark-multi-vector/examples/test.json => diskann-benchmark/example/multi-vector-test.json (100%) rename {diskann-benchmark-multi-vector/examples => diskann-benchmark/example}/multi-vector.json (100%) rename diskann-benchmark-multi-vector/examples/tolerance.json => diskann-benchmark/perf_test_inputs/multi-vector-tolerance.json (100%) create mode 100644 diskann-benchmark/src/backend/multi_vector.rs create mode 100644 diskann-benchmark/src/inputs/multi_vector.rs diff --git a/Cargo.lock b/Cargo.lock index e179d3320..1713f4b87 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -697,23 +697,6 @@ dependencies = [ "tokio", ] -[[package]] -name = "diskann-benchmark-multi-vector" -version = "0.51.0" -dependencies = [ - "anyhow", - "diskann-benchmark-runner", - "diskann-quantization", - "diskann-utils", - "diskann-vector", - "half", - "rand 0.9.4", - "serde", - "serde_json", - "tempfile", - "thiserror 2.0.17", -] - [[package]] name = "diskann-benchmark-runner" version = "0.51.0" diff --git a/Cargo.toml b/Cargo.toml index cce02b501..6353773c9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,7 +21,6 @@ members = [ "diskann-benchmark-runner", "diskann-benchmark-core", "diskann-benchmark-simd", - "diskann-benchmark-multi-vector", "diskann-benchmark", "diskann-tools", "vectorset", diff --git a/diskann-benchmark-multi-vector/Cargo.toml b/diskann-benchmark-multi-vector/Cargo.toml deleted file mode 100644 index f8eb937e1..000000000 --- a/diskann-benchmark-multi-vector/Cargo.toml +++ /dev/null @@ -1,30 +0,0 @@ -[package] -name = "diskann-benchmark-multi-vector" -version.workspace = true -description.workspace = true -authors.workspace = true -documentation.workspace = true -license.workspace = true -edition.workspace = true - -[[bin]] -name = 
"benchmark-multi-vector" -path = "src/bin.rs" - -[dependencies] -anyhow.workspace = true -diskann-utils = { workspace = true, default-features = false } -half = { workspace = true, features = ["rand_distr"] } -diskann-benchmark-runner = { workspace = true } -diskann-quantization = { workspace = true } -diskann-vector = { workspace = true } -rand.workspace = true -serde = { workspace = true, features = ["derive"] } -serde_json.workspace = true -thiserror.workspace = true - -[lints] -workspace = true - -[dev-dependencies] -tempfile.workspace = true diff --git a/diskann-benchmark-multi-vector/README.md b/diskann-benchmark-multi-vector/README.md deleted file mode 100644 index 014a393a1..000000000 --- a/diskann-benchmark-multi-vector/README.md +++ /dev/null @@ -1,136 +0,0 @@ -# diskann-benchmark-multi-vector - -Benchmarks and regression detection for the **multi-vector distance -operations** exposed by `diskann-quantization` — `Chamfer` and `MaxSim` — -across `f32` and `f16` element types. - -## Layout - -- `src/lib.rs` — benchmark library: input/tolerance schemas, kernel - dispatch, regression checker. -- `src/bin.rs` — `benchmark-multi-vector` CLI entry point. -- `examples/multi-vector.json` — full benchmark matrix covering both - operations across the registered kernels and a representative range of - shapes. -- `examples/test.json` — minimal smoke configuration consumed by the - integration tests. -- `examples/tolerance.json` — default regression thresholds. 
- -## Registered kernels - -The crate registers four kernels — one per `(element_type, implementation)` -pair: - -| Tag | Element | Implementation | -| -------------------------------- | ------- | -------------------- | -| `multi-vector-op-f32-optimized` | `f32` | `QueryComputer` | -| `multi-vector-op-f16-optimized` | `f16` | `QueryComputer` | -| `multi-vector-op-f32-reference` | `f32` | `Chamfer` / `MaxSim` | -| `multi-vector-op-f16-reference` | `f16` | `Chamfer` / `MaxSim` | - -The **optimized** path constructs a `QueryComputer` once per shape (which -internally selects the best available SIMD kernel for the host) and calls -`chamfer` / `max_sim` inside the timed loop. The **reference** path drives -the `Chamfer` / `MaxSim` fallback used by the `multi_vector` unit tests — -useful both as a numerical ground truth and as a baseline to measure SIMD -speedups against. - -## Time normalization - -Per-measurement latency is normalized to **nanoseconds per inner-product -call**, abbreviated `ns/IP`: - -``` -ns/IP = min_latency_µs * 1000 / (Q * D * loops_per_measurement) -``` - -Two important properties: - -- **Independent of `Q`, `D`, and `loops_per_measurement`.** Reshaping the - benchmark or scaling the loop budget leaves the metric unchanged, so - cache-residency effects and SIMD utilization show up directly. -- **Approximately linear in `Dim`.** Each inner-product call is itself an - O(`Dim`) operation, so `ns/IP` grows with `Dim` — that is why the table - headers read `ns/IP @ Dim`. Compare across rows with the same `Dim`; to - compare across different `Dim`s, divide further by `Dim` to recover ns - per scalar multiply. - -This is the right metric for the two things this crate cares about: -detecting per-shape regressions (the `Dim` factor cancels) and comparing -optimized vs. reference at a fixed shape. 
- -## Usage - -All examples below assume you are inside the crate directory and use a -small shell function for brevity: - -```bash -bench() { cargo run --release -p diskann-benchmark-multi-vector --bin benchmark-multi-vector -- "$@"; } -``` - -### Run benchmarks - -`run` executes every job in the input file and writes per-measurement -latencies plus percentiles to the output file: - -```bash -bench run --input-file examples/multi-vector.json --output-file before.json -``` - -### Regression check workflow - -The check workflow is **two-phase**: validate the tolerance file once, then -compare two recorded result files. - -**Phase 1 — preflight.** No benchmarks are executed. The verifier confirms -that every entry in `tolerance.json` matches at least one job in the input -file, and that every job is matched by exactly one entry. Run it whenever -you edit `tolerance.json`: - -```bash -bench check verify \ - --tolerances examples/tolerance.json \ - --input-file examples/multi-vector.json -``` - -**Phase 2 — comparison.** Record results before and after a code change, -then compare. The command exits non-zero if any run regresses past its -tolerance: - -```bash -# On the baseline commit -bench run --input-file examples/multi-vector.json --output-file before.json - -# On the change commit -bench run --input-file examples/multi-vector.json --output-file after.json - -# Compare -bench check run \ - --tolerances examples/tolerance.json \ - --input-file examples/multi-vector.json \ - --before before.json --after after.json \ - --output-file checks.json -``` - -A run **fails** when its post-change `ns/IP` minimum exceeds the -baseline minimum by more than `min_time_regression` (default `0.05` = -5%). Improvements (negative change) always pass. - -### How tolerances are matched to jobs - -Each entry in `tolerance.json` has the shape `{ input, tolerance }`. 
The -`input` block acts as a **partial template** against the jobs in the input -file: any field present must match; missing fields are wildcards. - -The shipped `tolerance.json` uses an empty `"content": {}`, which matches -every `multi-vector-op` job — so a single 5% threshold applies to all four -kernels. To apply different thresholds per implementation, add more -specific entries, e.g.: - -```json -{ "input": { "type": "multi-vector-op", "content": { "implementation": "reference" } }, - "tolerance": { "type": "multi-vector-tolerance", "content": { "min_time_regression": 0.10 } } } -``` - -`check verify` will reject the file if entries overlap or leave any job -unmatched. diff --git a/diskann-benchmark-multi-vector/src/bin.rs b/diskann-benchmark-multi-vector/src/bin.rs deleted file mode 100644 index d595533e7..000000000 --- a/diskann-benchmark-multi-vector/src/bin.rs +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. - * Licensed under the MIT license. - */ - -use diskann_benchmark_multi_vector::{register, MultiVectorOp}; -use diskann_benchmark_runner::{output, registry, App, Output}; - -pub fn main() -> anyhow::Result<()> { - // Create the pocket bench application. - let app = App::parse(); - main_inner(&app, &mut output::default()) -} - -fn main_inner(app: &App, output: &mut dyn Output) -> anyhow::Result<()> { - // Register inputs and benchmarks. - let mut inputs = registry::Inputs::new(); - inputs.register::()?; - - let mut benchmarks = registry::Benchmarks::new(); - register(&mut benchmarks); - - // Here we go! 
- app.run(&inputs, &benchmarks, output) -} - -/////////// -// Tests // -/////////// - -#[cfg(test)] -mod tests { - use super::*; - - use std::path::{Path, PathBuf}; - - use diskann_benchmark_runner::app::{Check, Commands}; - - fn run_integration_test(input_file: &Path, output_file: &Path) { - let commands = Commands::Run { - input_file: input_file.to_str().unwrap().into(), - output_file: output_file.to_str().unwrap().into(), - dry_run: false, - allow_debug: true, - }; - - let app = App::from_commands(commands); - - let mut output = output::Memory::new(); - main_inner(&app, &mut output).unwrap(); - println!( - "output = {}", - String::from_utf8(output.into_inner()).unwrap() - ); - - assert!(output_file.exists()); - } - - fn run_check_test(input_file: &Path, tolerances: &Path) -> String { - let commands = Commands::Check(Check::Verify { - tolerances: tolerances.to_str().unwrap().into(), - input_file: input_file.to_str().unwrap().into(), - }); - - let app = App::from_commands(commands); - - let mut output = output::Memory::new(); - main_inner(&app, &mut output).unwrap(); - String::from_utf8(output.into_inner()).unwrap() - } - - #[test] - fn integration_test() { - let input_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) - .join("examples") - .join("test.json"); - - let tempdir = tempfile::tempdir().unwrap(); - let output_path = tempdir.path().join("output.json"); - - run_integration_test(&input_path, &output_path); - } - - #[test] - fn check_verify() { - let input_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) - .join("examples") - .join("test.json"); - let tolerance_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) - .join("examples") - .join("tolerance.json"); - - let stdout = run_check_test(&input_path, &tolerance_path); - println!("stdout = {}", stdout); - } -} diff --git a/diskann-benchmark-multi-vector/src/lib.rs b/diskann-benchmark-multi-vector/src/lib.rs deleted file mode 100644 index df08d93dd..000000000 --- a/diskann-benchmark-multi-vector/src/lib.rs +++ 
/dev/null @@ -1,938 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. - * Licensed under the MIT license. - */ - -//! Multi-vector distance benchmarks with regression detection. - -use std::{io::Write, num::NonZeroUsize}; - -use diskann_quantization::multi_vector::{Chamfer, MatRef, MaxSim, QueryComputer, Standard}; -use diskann_vector::distance::InnerProduct; -use diskann_vector::{DistanceFunctionMut, PureDistanceFunction}; -use half::f16; -use rand::{ - distr::{Distribution, StandardUniform}, - rngs::StdRng, - SeedableRng, -}; -use serde::{Deserialize, Serialize}; -use thiserror::Error; - -use diskann_benchmark_runner::{ - benchmark::{PassFail, Regression}, - dispatcher::{Description, DispatchRule, FailureScore, MatchScore}, - utils::{ - datatype::{self, DataType}, - num::{relative_change, NonNegativeFinite}, - percentiles, MicroSeconds, - }, - Any, Benchmark, CheckDeserialization, Checker, Input, -}; - -//////////////// -// Public API // -//////////////// - -/// Register all multi-vector benchmarks with the runner's dispatcher. -pub fn register(dispatcher: &mut diskann_benchmark_runner::registry::Benchmarks) { - register_benchmarks_impl(dispatcher) -} - -/////////// -// Utils // -/////////// - -#[derive(Debug, Clone, Copy)] -struct DisplayWrapper<'a, T: ?Sized>(&'a T); - -impl std::ops::Deref for DisplayWrapper<'_, T> { - type Target = T; - fn deref(&self) -> &T { - self.0 - } -} - -//////////// -// Inputs // -//////////// - -/// The two distance operations exposed by [`QueryComputer`]. -#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)] -#[serde(rename_all = "snake_case")] -pub enum Operation { - Chamfer, - MaxSim, -} - -impl std::fmt::Display for Operation { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let st = match self { - Self::Chamfer => "chamfer", - Self::MaxSim => "max_sim", - }; - write!(f, "{}", st) - } -} - -/// Which implementation tier to benchmark. 
-#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)] -#[serde(rename_all = "kebab-case")] -enum Implementation { - Optimized, - Reference, -} - -impl std::fmt::Display for Implementation { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let st = match self { - Self::Optimized => "optimized", - Self::Reference => "reference", - }; - write!(f, "{}", st) - } -} - -/// One benchmark configuration: a single (operation, shape) measurement. -#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] -struct Run { - operation: Operation, - num_query_vectors: NonZeroUsize, - num_doc_vectors: NonZeroUsize, - dim: NonZeroUsize, - loops_per_measurement: NonZeroUsize, - num_measurements: NonZeroUsize, -} - -/// A complete multi-vector benchmark job. -#[derive(Debug, Serialize, Deserialize)] -pub struct MultiVectorOp { - element_type: DataType, - implementation: Implementation, - runs: Vec, -} - -impl CheckDeserialization for MultiVectorOp { - fn check_deserialization(&mut self, _checker: &mut Checker) -> Result<(), anyhow::Error> { - Ok(()) - } -} - -macro_rules! write_field { - ($f:ident, $field:tt, $($expr:tt)*) => { - writeln!($f, "{:>18}: {}", $field, $($expr)*) - } -} - -impl MultiVectorOp { - fn summarize_fields(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write_field!(f, "element type", self.element_type)?; - write_field!(f, "implementation", self.implementation)?; - write_field!(f, "number of runs", self.runs.len())?; - Ok(()) - } -} - -impl std::fmt::Display for MultiVectorOp { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - writeln!(f, "Multi-Vector Operation\n")?; - write_field!(f, "tag", Self::tag())?; - self.summarize_fields(f) - } -} - -impl Input for MultiVectorOp { - fn tag() -> &'static str { - "multi-vector-op" - } - - fn try_deserialize( - serialized: &serde_json::Value, - checker: &mut Checker, - ) -> anyhow::Result { - checker.any(Self::deserialize(serialized)?) 
- } - - fn example() -> anyhow::Result { - const NUM_QUERY_VECTORS: NonZeroUsize = NonZeroUsize::new(32).unwrap(); - const NUM_DOC_VECTORS: NonZeroUsize = NonZeroUsize::new(64).unwrap(); - const DIM: NonZeroUsize = NonZeroUsize::new(128).unwrap(); - const LOOPS_PER_MEASUREMENT: NonZeroUsize = NonZeroUsize::new(200).unwrap(); - const NUM_MEASUREMENTS: NonZeroUsize = NonZeroUsize::new(100).unwrap(); - - let runs = vec![ - Run { - operation: Operation::Chamfer, - num_query_vectors: NUM_QUERY_VECTORS, - num_doc_vectors: NUM_DOC_VECTORS, - dim: DIM, - loops_per_measurement: LOOPS_PER_MEASUREMENT, - num_measurements: NUM_MEASUREMENTS, - }, - Run { - operation: Operation::MaxSim, - num_query_vectors: NUM_QUERY_VECTORS, - num_doc_vectors: NUM_DOC_VECTORS, - dim: DIM, - loops_per_measurement: LOOPS_PER_MEASUREMENT, - num_measurements: NUM_MEASUREMENTS, - }, - ]; - - Ok(serde_json::to_value(&Self { - element_type: DataType::Float32, - implementation: Implementation::Optimized, - runs, - })?) - } -} - -////////////////////// -// Regression Check // -////////////////////// - -/// Tolerance thresholds for multi-vector benchmark regression detection. -/// -/// Each field specifies the maximum allowed relative increase in the corresponding metric. -/// For example, a value of `0.05` means a 5% increase is tolerated. -#[derive(Debug, Clone, Copy, Serialize, Deserialize)] -struct MultiVectorTolerance { - min_time_regression: NonNegativeFinite, -} - -impl CheckDeserialization for MultiVectorTolerance { - fn check_deserialization(&mut self, _checker: &mut Checker) -> Result<(), anyhow::Error> { - Ok(()) - } -} - -impl Input for MultiVectorTolerance { - fn tag() -> &'static str { - "multi-vector-tolerance" - } - - fn try_deserialize( - serialized: &serde_json::Value, - checker: &mut Checker, - ) -> anyhow::Result { - checker.any(Self::deserialize(serialized)?) 
- } - - fn example() -> anyhow::Result { - const EXAMPLE: NonNegativeFinite = match NonNegativeFinite::new(0.05) { - Ok(v) => v, - Err(_) => panic!("use a non-negative finite please"), - }; - - Ok(serde_json::to_value(MultiVectorTolerance { - min_time_regression: EXAMPLE, - })?) - } -} - -/// Per-run comparison result showing before/after percentile differences. -#[derive(Debug, Serialize)] -struct Comparison { - run: Run, - tolerance: MultiVectorTolerance, - before_min: f64, - after_min: f64, -} - -/// Aggregated result of the regression check across all runs. -#[derive(Debug, Serialize)] -struct CheckResult { - checks: Vec, -} - -impl std::fmt::Display for CheckResult { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let header = [ - "Operation", - "Q", - "D", - "Dim", - "Min Before (ns/IP @ Dim)", - "Min After (ns/IP @ Dim)", - "Change (%)", - "Remark", - ]; - - let mut table = diskann_benchmark_runner::utils::fmt::Table::new(header, self.checks.len()); - - for (i, c) in self.checks.iter().enumerate() { - let mut row = table.row(i); - let change = relative_change(c.before_min, c.after_min); - - row.insert(c.run.operation, 0); - row.insert(c.run.num_query_vectors, 1); - row.insert(c.run.num_doc_vectors, 2); - row.insert(c.run.dim, 3); - row.insert(format!("{:.3}", c.before_min), 4); - row.insert(format!("{:.3}", c.after_min), 5); - match change { - Ok(change) => { - row.insert(format!("{:.3} %", change * 100.0), 6); - if change > c.tolerance.min_time_regression.get() { - row.insert("FAIL", 7); - } - } - Err(err) => { - row.insert("invalid", 6); - row.insert(err, 7); - } - } - } - - table.fmt(f) - } -} - -//////////////////////////// -// Benchmark Registration // -//////////////////////////// - -fn register_benchmarks_impl(dispatcher: &mut diskann_benchmark_runner::registry::Benchmarks) { - macro_rules! 
register { - ($impl:ident, $t:ty, $tag:literal) => { - dispatcher.register_regression($tag, Kernel::<$impl, $t>::new()); - }; - } - - // Optimized (architecture-dispatched QueryComputer). - register!(Optimized, f32, "multi-vector-op-f32-optimized"); - register!(Optimized, f16, "multi-vector-op-f16-optimized"); - - // Reference (Chamfer / MaxSim fallback path). - register!(Reference, f32, "multi-vector-op-f32-reference"); - register!(Reference, f16, "multi-vector-op-f16-reference"); -} - -////////////// -// Dispatch // -////////////// - -/// Dispatch marker for the [`QueryComputer`] implementation. -#[derive(Debug)] -struct Optimized; - -/// Dispatch marker for the [`Chamfer`] / [`MaxSim`] fallback. -#[derive(Debug)] -struct Reference; - -/// A multi-vector benchmark. -struct Kernel { - _type: std::marker::PhantomData<(I, T)>, -} - -impl Kernel { - fn new() -> Self { - Self { - _type: std::marker::PhantomData, - } - } -} - -#[derive(Debug, Error)] -#[error("this kernel handles a different implementation than {0}")] -pub(crate) struct ImplementationMismatch(Implementation); - -macro_rules! 
impl_dispatch_rule { - ($marker:ident, $variant:ident, $description:literal) => { - impl DispatchRule for $marker { - type Error = ImplementationMismatch; - - fn try_match(from: &Implementation) -> Result { - if *from == Implementation::$variant { - Ok(MatchScore(0)) - } else { - Err(FailureScore(1)) - } - } - - fn convert(from: Implementation) -> Result { - if from == Implementation::$variant { - Ok($marker) - } else { - Err(ImplementationMismatch(from)) - } - } - - fn description( - f: &mut std::fmt::Formatter<'_>, - from: Option<&Implementation>, - ) -> std::fmt::Result { - match from { - None => write!(f, $description), - Some(impl_) => { - if Self::try_match(impl_).is_ok() { - write!(f, "matched {}", impl_) - } else { - write!(f, "expected {}, got {}", Implementation::$variant, impl_) - } - } - } - } - } - }; -} - -impl_dispatch_rule!( - Optimized, - Optimized, - "QueryComputer (architecture-dispatched)" -); -impl_dispatch_rule!(Reference, Reference, "Chamfer / MaxSim fallback"); - -impl Benchmark for Kernel -where - datatype::Type: DispatchRule, - I: DispatchRule + 'static, - Kernel: RunBenchmark, - T: 'static, -{ - type Input = MultiVectorOp; - type Output = Vec; - - fn try_match(&self, from: &MultiVectorOp) -> Result { - let mut failscore: Option = None; - if datatype::Type::::try_match(&from.element_type).is_err() { - *failscore.get_or_insert(0) += 10; - } - if let Err(FailureScore(score)) = I::try_match(&from.implementation) { - *failscore.get_or_insert(0) += 2 + score; - } - - match failscore { - None => Ok(MatchScore(0)), - Some(score) => Err(FailureScore(score)), - } - } - - fn run( - &self, - input: &MultiVectorOp, - _: diskann_benchmark_runner::Checkpoint<'_>, - mut output: &mut dyn diskann_benchmark_runner::Output, - ) -> anyhow::Result { - // The dispatcher only invokes `run` after `try_match` has already accepted - // the input, so a failure here would indicate a dispatcher bug. 
- I::convert(input.implementation).expect("try_match accepted the input"); - writeln!(output, "{}", input)?; - let results = self.run_benchmark(input)?; - writeln!(output, "\n\n{}", DisplayWrapper(&*results))?; - Ok(results) - } - - fn description( - &self, - f: &mut std::fmt::Formatter<'_>, - input: Option<&MultiVectorOp>, - ) -> std::fmt::Result { - match input { - None => { - writeln!( - f, - "- Element Type: {}", - Description::>::new() - )?; - writeln!( - f, - "- Implementation: {}", - Description::::new() - )?; - } - Some(input) => { - if let Err(err) = datatype::Type::::try_match_verbose(&input.element_type) { - writeln!(f, "\n - Mismatched element type: {}", err)?; - } - if let Err(err) = I::try_match_verbose(&input.implementation) { - writeln!(f, "\n - Mismatched implementation: {}", err)?; - } - } - } - Ok(()) - } -} - -impl Regression for Kernel -where - datatype::Type: DispatchRule, - I: DispatchRule + 'static, - Kernel: RunBenchmark, - T: 'static, -{ - type Tolerances = MultiVectorTolerance; - type Pass = CheckResult; - type Fail = CheckResult; - - fn check( - &self, - tolerance: &MultiVectorTolerance, - _input: &MultiVectorOp, - before: &Vec, - after: &Vec, - ) -> anyhow::Result> { - anyhow::ensure!( - before.len() == after.len(), - "before has {} runs but after has {}", - before.len(), - after.len(), - ); - - let mut passed = true; - let checks: Vec = std::iter::zip(before.iter(), after.iter()) - .enumerate() - .map(|(i, (b, a))| { - anyhow::ensure!(b.run == a.run, "run {i} mismatched"); - - let computations_per_latency = b.computations_per_latency() as f64; - - let before_min = b.percentiles.minimum.as_f64() * 1000.0 / computations_per_latency; - let after_min = a.percentiles.minimum.as_f64() * 1000.0 / computations_per_latency; - - let comparison = Comparison { - run: b.run.clone(), - tolerance: *tolerance, - before_min, - after_min, - }; - - match relative_change(before_min, after_min) { - Ok(change) => { - if change > 
tolerance.min_time_regression.get() { - passed = false; - } - } - Err(_) => passed = false, - }; - - Ok(comparison) - }) - .collect::>>()?; - - let check = CheckResult { checks }; - - if passed { - Ok(PassFail::Pass(check)) - } else { - Ok(PassFail::Fail(check)) - } - } -} - -/////////////// -// Benchmark // -/////////////// - -trait RunBenchmark { - fn run_benchmark(&self, input: &MultiVectorOp) -> Result, anyhow::Error>; -} - -#[derive(Debug, Serialize, Deserialize)] -struct RunResult { - /// The configuration for this run. - run: Run, - /// Per-measurement latencies (over `loops_per_measurement` calls). - latencies: Vec, - /// Latency percentiles. - percentiles: percentiles::Percentiles, -} - -impl RunResult { - fn computations_per_latency(&self) -> usize { - self.run.num_query_vectors.get() - * self.run.num_doc_vectors.get() - * self.run.loops_per_measurement.get() - } -} - -impl std::fmt::Display for DisplayWrapper<'_, [RunResult]> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - if self.is_empty() { - return Ok(()); - } - - // ns/IP is normalized as `min_latency_us * 1000 / (Q * D * loops)` and is - // approximately linear in `dim`. Compare across rows with the same `Dim`; - // divide further by `Dim` to recover ns per scalar multiply. 
- writeln!( - f, - "ns/IP = time per (query, doc) inner-product call (~ linear in Dim)" - )?; - - let header = [ - "Operation", - "Q", - "D", - "Dim", - "Min Time (ns/IP @ Dim)", - "Mean Time (ns/IP @ Dim)", - "Loops", - "Measurements", - ]; - - let mut table = diskann_benchmark_runner::utils::fmt::Table::new(header, self.len()); - - self.iter().enumerate().for_each(|(row, r)| { - let mut row = table.row(row); - - let min_latency = r - .latencies - .iter() - .min() - .copied() - .unwrap_or(MicroSeconds::new(u64::MAX)); - let mean_latency = r.percentiles.mean; - - let computations_per_latency = r.computations_per_latency() as f64; - - // Convert time from micro-seconds to nano-seconds per inner-product call - // (one (query, doc) pair, ~ linear in dim). - let min_time = min_latency.as_f64() / computations_per_latency * 1000.0; - let mean_time = mean_latency / computations_per_latency * 1000.0; - - row.insert(r.run.operation, 0); - row.insert(r.run.num_query_vectors, 1); - row.insert(r.run.num_doc_vectors, 2); - row.insert(r.run.dim, 3); - row.insert(format!("{:.3}", min_time), 4); - row.insert(format!("{:.3}", mean_time), 5); - row.insert(r.run.loops_per_measurement, 6); - row.insert(r.run.num_measurements, 7); - }); - - table.fmt(f) - } -} - -fn run_loops(run: &Run, mut body: F) -> RunResult -where - F: FnMut(), -{ - let mut latencies = Vec::with_capacity(run.num_measurements.get()); - - for _ in 0..run.num_measurements.get() { - let start = std::time::Instant::now(); - for _ in 0..run.loops_per_measurement.get() { - body(); - } - latencies.push(start.elapsed().into()); - } - - let percentiles = percentiles::compute_percentiles(&mut latencies).unwrap(); - RunResult { - run: run.clone(), - latencies, - percentiles, - } -} - -/////////////////// -// Data fixtures // -/////////////////// - -const RNG_SEED: u64 = 0x12345; - -struct Data { - query_data: Box<[T]>, - doc_data: Box<[T]>, -} - -impl Data -where - StandardUniform: Distribution, -{ - fn new(run: &Run) -> Self 
{ - let mut rng = StdRng::seed_from_u64(RNG_SEED); - let query_data: Box<[T]> = (0..run.num_query_vectors.get() * run.dim.get()) - .map(|_| StandardUniform.sample(&mut rng)) - .collect(); - let doc_data: Box<[T]> = (0..run.num_doc_vectors.get() * run.dim.get()) - .map(|_| StandardUniform.sample(&mut rng)) - .collect(); - - Self { - query_data, - doc_data, - } - } - - fn query(&self, run: &Run) -> MatRef<'_, Standard> { - MatRef::new( - Standard::new(run.num_query_vectors.get(), run.dim.get()).unwrap(), - &self.query_data, - ) - .unwrap() - } - - fn doc(&self, run: &Run) -> MatRef<'_, Standard> { - MatRef::new( - Standard::new(run.num_doc_vectors.get(), run.dim.get()).unwrap(), - &self.doc_data, - ) - .unwrap() - } -} - -///////////////////// -// Implementations // -///////////////////// - -fn run_optimized(input: &MultiVectorOp) -> anyhow::Result> -where - T: Copy, - StandardUniform: Distribution, - QueryComputer: NewFromMatRef, -{ - let mut results = Vec::with_capacity(input.runs.len()); - for run in input.runs.iter() { - let data = Data::::new(run); - // `QueryComputer` performs query-side precomputation that is intentionally - // amortized across many `chamfer` / `max_sim` calls; construct it once per - // shape, outside the timed loop. - let computer = as NewFromMatRef>::new_from(data.query(run)); - let doc = data.doc(run); - - let result = match run.operation { - Operation::Chamfer => run_loops(run, || { - let v = computer.chamfer(doc); - std::hint::black_box(v); - }), - Operation::MaxSim => { - let mut scores = vec![0.0f32; run.num_query_vectors.get()]; - run_loops(run, || { - computer.max_sim(doc, &mut scores); - std::hint::black_box(&mut scores); - }) - } - }; - results.push(result); - } - Ok(results) -} - -/// Drive the [`Chamfer`] / [`MaxSim`] fallback path. 
-fn run_reference(input: &MultiVectorOp) -> anyhow::Result> -where - T: Copy, - StandardUniform: Distribution, - InnerProduct: for<'a, 'b> PureDistanceFunction<&'a [T], &'b [T], f32>, -{ - let mut results = Vec::with_capacity(input.runs.len()); - for run in input.runs.iter() { - let data = Data::::new(run); - let doc = data.doc(run); - // Hoist out of the timed loop to mirror the optimized path's - // per-shape precomputation. - let query: diskann_quantization::multi_vector::distance::QueryMatRef<'_, _> = - data.query(run).into(); - - let result = match run.operation { - Operation::Chamfer => run_loops(run, || { - let v = Chamfer::evaluate(query, doc); - std::hint::black_box(v); - }), - Operation::MaxSim => { - let mut scores = vec![0.0f32; run.num_query_vectors.get()]; - let mut max_sim = MaxSim::new(&mut scores).unwrap(); - run_loops(run, || { - let _ = max_sim.evaluate(query, doc); - std::hint::black_box(max_sim.scores_mut()); - }) - } - }; - results.push(result); - } - Ok(results) -} - -/// Element-type-erasing constructor for [`QueryComputer`]. -trait NewFromMatRef { - fn new_from(query: MatRef<'_, Standard>) -> QueryComputer; -} - -macro_rules! 
impl_kernel_for { - ($t:ty) => { - impl NewFromMatRef<$t> for QueryComputer<$t> { - fn new_from(query: MatRef<'_, Standard<$t>>) -> QueryComputer<$t> { - QueryComputer::<$t>::new(query) - } - } - - impl RunBenchmark for Kernel { - fn run_benchmark( - &self, - input: &MultiVectorOp, - ) -> Result, anyhow::Error> { - run_optimized::<$t>(input) - } - } - - impl RunBenchmark for Kernel { - fn run_benchmark( - &self, - input: &MultiVectorOp, - ) -> Result, anyhow::Error> { - run_reference::<$t>(input) - } - } - }; -} - -impl_kernel_for!(f32); -impl_kernel_for!(f16); - -/////////// -// Tests // -/////////// - -#[cfg(test)] -mod tests { - use super::*; - - use diskann_benchmark_runner::{ - benchmark::{PassFail, Regression}, - utils::percentiles::compute_percentiles, - }; - - fn tiny_run(operation: Operation) -> Run { - Run { - operation, - num_query_vectors: NonZeroUsize::new(2).unwrap(), - num_doc_vectors: NonZeroUsize::new(2).unwrap(), - dim: NonZeroUsize::new(4).unwrap(), - loops_per_measurement: NonZeroUsize::new(1).unwrap(), - num_measurements: NonZeroUsize::new(1).unwrap(), - } - } - - fn tiny_op() -> MultiVectorOp { - MultiVectorOp { - element_type: DataType::Float32, - implementation: Implementation::Optimized, - runs: vec![tiny_run(Operation::Chamfer)], - } - } - - fn tiny_result(operation: Operation, minimum: u64) -> RunResult { - let run = tiny_run(operation); - let minimum = MicroSeconds::new(minimum); - let mut latencies = vec![minimum]; - let percentiles = compute_percentiles(&mut latencies).unwrap(); - RunResult { - run, - latencies, - percentiles, - } - } - - fn tolerance(limit: f64) -> MultiVectorTolerance { - MultiVectorTolerance { - min_time_regression: NonNegativeFinite::new(limit).unwrap(), - } - } - - #[test] - fn check_rejects_mismatched_runs() { - let kernel = Kernel::::new(); - - let err = kernel - .check( - &tolerance(0.0), - &tiny_op(), - &vec![tiny_result(Operation::Chamfer, 100)], - &vec![tiny_result(Operation::MaxSim, 100)], - ) - 
.unwrap_err(); - - assert_eq!(err.to_string(), "run 0 mismatched"); - } - - #[test] - fn check_allows_negative_relative_change() { - let kernel = Kernel::::new(); - - let result = kernel - .check( - &tolerance(0.0), - &tiny_op(), - &vec![tiny_result(Operation::Chamfer, 100)], - &vec![tiny_result(Operation::Chamfer, 95)], - ) - .unwrap(); - - assert!(matches!(result, PassFail::Pass(_))); - } - - #[test] - fn check_passes_on_tolerance_boundary() { - let kernel = Kernel::::new(); - - let result = kernel - .check( - &tolerance(0.05), - &tiny_op(), - &vec![tiny_result(Operation::Chamfer, 100)], - &vec![tiny_result(Operation::Chamfer, 105)], - ) - .unwrap(); - - assert!(matches!(result, PassFail::Pass(_))); - } - - #[test] - fn check_fails_above_tolerance_boundary() { - let kernel = Kernel::::new(); - - let result = kernel - .check( - &tolerance(0.05), - &tiny_op(), - &vec![tiny_result(Operation::Chamfer, 100)], - &vec![tiny_result(Operation::Chamfer, 106)], - ) - .unwrap(); - - assert!(matches!(result, PassFail::Fail(_))); - } - - #[test] - fn check_result_display_includes_failure_details() { - let check = CheckResult { - checks: vec![Comparison { - run: tiny_run(Operation::Chamfer), - tolerance: tolerance(0.05), - before_min: 100.0, - after_min: 106.0, - }], - }; - - let rendered = check.to_string(); - assert!(rendered.contains("Operation"), "rendered = {rendered}"); - assert!(rendered.contains("chamfer"), "rendered = {rendered}"); - assert!(rendered.contains("100.000"), "rendered = {rendered}"); - assert!(rendered.contains("106.000"), "rendered = {rendered}"); - assert!(rendered.contains("6.000 %"), "rendered = {rendered}"); - assert!(rendered.contains("FAIL"), "rendered = {rendered}"); - } - - /// A "before" value of 0 means the measurement was too fast to obtain a - /// reliable signal, so we *could* be letting a regression through. We - /// require at least a non-zero value. 
- #[test] - fn zero_values_rejected() { - let kernel = Kernel::::new(); - - let result = kernel - .check( - &tolerance(0.05), - &tiny_op(), - &vec![tiny_result(Operation::Chamfer, 0)], - &vec![tiny_result(Operation::Chamfer, 0)], - ) - .unwrap(); - - assert!(matches!(result, PassFail::Fail(_))); - } -} diff --git a/diskann-benchmark/Cargo.toml b/diskann-benchmark/Cargo.toml index bebaf4b8e..efd058ffb 100644 --- a/diskann-benchmark/Cargo.toml +++ b/diskann-benchmark/Cargo.toml @@ -63,6 +63,9 @@ scalar-quantization = [] # Enable minmax-quantization based algorithms minmax-quantization = [] +# Enable multi-vector distance benchmarks (Chamfer / MaxSim) +multi-vector = [] + # Enable Disk Index benchmarks disk-index = [ "diskann-disk/perf_test", diff --git a/diskann-benchmark-multi-vector/examples/test.json b/diskann-benchmark/example/multi-vector-test.json similarity index 100% rename from diskann-benchmark-multi-vector/examples/test.json rename to diskann-benchmark/example/multi-vector-test.json diff --git a/diskann-benchmark-multi-vector/examples/multi-vector.json b/diskann-benchmark/example/multi-vector.json similarity index 100% rename from diskann-benchmark-multi-vector/examples/multi-vector.json rename to diskann-benchmark/example/multi-vector.json diff --git a/diskann-benchmark-multi-vector/examples/tolerance.json b/diskann-benchmark/perf_test_inputs/multi-vector-tolerance.json similarity index 100% rename from diskann-benchmark-multi-vector/examples/tolerance.json rename to diskann-benchmark/perf_test_inputs/multi-vector-tolerance.json diff --git a/diskann-benchmark/src/backend/mod.rs b/diskann-benchmark/src/backend/mod.rs index 24fe91d7e..0d1c61345 100644 --- a/diskann-benchmark/src/backend/mod.rs +++ b/diskann-benchmark/src/backend/mod.rs @@ -7,10 +7,12 @@ mod disk_index; mod exhaustive; mod filters; mod index; +mod multi_vector; pub(crate) fn register_benchmarks(registry: &mut diskann_benchmark_runner::registry::Benchmarks) { 
exhaustive::register_benchmarks(registry); disk_index::register_benchmarks(registry); index::register_benchmarks(registry); filters::register_benchmarks(registry); + multi_vector::register_benchmarks(registry); } diff --git a/diskann-benchmark/src/backend/multi_vector.rs b/diskann-benchmark/src/backend/multi_vector.rs new file mode 100644 index 000000000..cfdb77f33 --- /dev/null +++ b/diskann-benchmark/src/backend/multi_vector.rs @@ -0,0 +1,806 @@ +/* + * Copyright (c) Microsoft Corporation. + * Licensed under the MIT license. + */ + +//! Multi-vector distance benchmarks (Chamfer / MaxSim) with regression detection. + +use diskann_benchmark_runner::registry::Benchmarks; + +// Create a stub-module if the "multi-vector" feature is disabled. +crate::utils::stub_impl!("multi-vector", inputs::multi_vector::MultiVectorOp); + +pub(super) fn register_benchmarks(benchmarks: &mut Benchmarks) { + #[cfg(feature = "multi-vector")] + { + use half::f16; + + // Optimized (architecture-dispatched QueryComputer). + benchmarks.register_regression( + "multi-vector-op-f32-optimized", + imp::Kernel::::new(), + ); + benchmarks.register_regression( + "multi-vector-op-f16-optimized", + imp::Kernel::::new(), + ); + + // Reference (Chamfer / MaxSim fallback path). 
+ benchmarks.register_regression( + "multi-vector-op-f32-reference", + imp::Kernel::::new(), + ); + benchmarks.register_regression( + "multi-vector-op-f16-reference", + imp::Kernel::::new(), + ); + } + + // Stub implementation + #[cfg(not(feature = "multi-vector"))] + imp::register("multi-vector-op", benchmarks); +} + +#[cfg(feature = "multi-vector")] +mod imp { + use std::io::Write; + + use diskann_benchmark_runner::{ + benchmark::{PassFail, Regression}, + dispatcher::{DispatchRule, FailureScore, MatchScore}, + utils::{datatype, num::relative_change, percentiles, MicroSeconds}, + Benchmark, + }; + use diskann_quantization::multi_vector::{ + Chamfer, Init, Mat, MatRef, MaxSim, QueryComputer, Standard, + }; + use diskann_vector::distance::InnerProduct; + use diskann_vector::{DistanceFunctionMut, PureDistanceFunction}; + use half::f16; + use rand::{ + distr::{Distribution, StandardUniform}, + rngs::StdRng, + SeedableRng, + }; + use serde::{Deserialize, Serialize}; + + use crate::inputs::multi_vector::{ + Implementation, MultiVectorOp, MultiVectorTolerance, Operation, Run, + }; + + /////////// + // Utils // + /////////// + + #[derive(Debug, Clone, Copy)] + pub(super) struct DisplayWrapper<'a, T: ?Sized>(pub(super) &'a T); + + impl std::ops::Deref for DisplayWrapper<'_, T> { + type Target = T; + fn deref(&self) -> &T { + self.0 + } + } + + ////////////// + // Dispatch // + ////////////// + + /// Dispatch marker for the [`QueryComputer`] implementation. + #[derive(Debug)] + pub(super) struct Optimized; + + /// Dispatch marker for the [`Chamfer`] / [`MaxSim`] fallback. + #[derive(Debug)] + pub(super) struct Reference; + + /// A multi-vector benchmark. 
+ pub(super) struct Kernel { + _type: std::marker::PhantomData<(I, T)>, + } + + impl Kernel { + pub(super) fn new() -> Self { + Self { + _type: std::marker::PhantomData, + } + } + } + + /// Pairs the standard `TryFrom` conversion with the static + /// description info needed for friendly diagnostics in `Benchmark::description`. + pub(super) trait ImplementationMatcher: + TryFrom + 'static + { + /// Human-readable description of which implementation this marker handles. + const DESCRIPTION: &'static str; + /// The implementation variant this marker expects (for mismatch diagnostics). + const EXPECTED: Implementation; + } + + impl TryFrom for Optimized { + type Error = FailureScore; + fn try_from(i: Implementation) -> Result { + match i { + Implementation::Optimized => Ok(Self), + _ => Err(FailureScore(1)), + } + } + } + + impl ImplementationMatcher for Optimized { + const DESCRIPTION: &'static str = "QueryComputer (architecture-dispatched)"; + const EXPECTED: Implementation = Implementation::Optimized; + } + + impl TryFrom for Reference { + type Error = FailureScore; + fn try_from(i: Implementation) -> Result { + match i { + Implementation::Reference => Ok(Self), + _ => Err(FailureScore(1)), + } + } + } + + impl ImplementationMatcher for Reference { + const DESCRIPTION: &'static str = "Chamfer / MaxSim fallback"; + const EXPECTED: Implementation = Implementation::Reference; + } + + impl Benchmark for Kernel + where + datatype::Type: DispatchRule, + I: ImplementationMatcher, + Kernel: RunBenchmark, + T: 'static, + { + type Input = MultiVectorOp; + type Output = Vec; + + fn try_match(&self, from: &MultiVectorOp) -> Result { + let mut failscore: Option = None; + if datatype::Type::::try_match(&from.element_type).is_err() { + *failscore.get_or_insert(0) += 10; + } + if let Err(FailureScore(score)) = I::try_from(from.implementation) { + *failscore.get_or_insert(0) += 2 + score; + } + + match failscore { + None => Ok(MatchScore(0)), + Some(score) => 
Err(FailureScore(score)), + } + } + + fn run( + &self, + input: &MultiVectorOp, + _: diskann_benchmark_runner::Checkpoint<'_>, + mut output: &mut dyn diskann_benchmark_runner::Output, + ) -> anyhow::Result { + // The dispatcher only invokes `run` after `try_match` has already accepted + // the input, so a failure here would indicate a dispatcher bug. + I::try_from(input.implementation).expect("try_match accepted the input"); + writeln!(output, "{}", input)?; + let results = self.run_benchmark(input)?; + writeln!(output, "\n\n{}", DisplayWrapper(&*results))?; + Ok(results) + } + + fn description( + &self, + f: &mut std::fmt::Formatter<'_>, + input: Option<&MultiVectorOp>, + ) -> std::fmt::Result { + match input { + None => { + writeln!( + f, + "- Element Type: {}", + diskann_benchmark_runner::dispatcher::Description::< + datatype::DataType, + datatype::Type, + >::new() + )?; + writeln!(f, "- Implementation: {}", I::DESCRIPTION)?; + } + Some(input) => { + if let Err(err) = datatype::Type::::try_match_verbose(&input.element_type) { + writeln!(f, "\n - Mismatched element type: {}", err)?; + } + if I::try_from(input.implementation).is_err() { + writeln!( + f, + "\n - Mismatched implementation: expected {}, got {}", + I::EXPECTED, + input.implementation + )?; + } + } + } + Ok(()) + } + } + + impl Regression for Kernel + where + datatype::Type: DispatchRule, + I: ImplementationMatcher, + Kernel: RunBenchmark, + T: 'static, + { + type Tolerances = MultiVectorTolerance; + type Pass = CheckResult; + type Fail = CheckResult; + + fn check( + &self, + tolerance: &MultiVectorTolerance, + _input: &MultiVectorOp, + before: &Vec, + after: &Vec, + ) -> anyhow::Result> { + anyhow::ensure!( + before.len() == after.len(), + "before has {} runs but after has {}", + before.len(), + after.len(), + ); + + let mut passed = true; + let checks: Vec = std::iter::zip(before.iter(), after.iter()) + .enumerate() + .map(|(i, (b, a))| { + anyhow::ensure!(b.run == a.run, "run {i} mismatched"); + + 
let computations_per_latency = b.computations_per_latency() as f64; + + let before_min = + b.percentiles.minimum.as_f64() * 1000.0 / computations_per_latency; + let after_min = + a.percentiles.minimum.as_f64() * 1000.0 / computations_per_latency; + + let comparison = Comparison { + run: b.run.clone(), + tolerance: *tolerance, + before_min, + after_min, + }; + + match relative_change(before_min, after_min) { + Ok(change) => { + if change > tolerance.min_time_regression.get() { + passed = false; + } + } + Err(_) => passed = false, + }; + + Ok(comparison) + }) + .collect::>>()?; + + let check = CheckResult { checks }; + + if passed { + Ok(PassFail::Pass(check)) + } else { + Ok(PassFail::Fail(check)) + } + } + } + + ////////////////////// + // Regression Check // + ////////////////////// + + /// Per-run comparison result showing before/after percentile differences. + #[derive(Debug, Serialize)] + pub(super) struct Comparison { + run: Run, + tolerance: MultiVectorTolerance, + before_min: f64, + after_min: f64, + } + + /// Aggregated result of the regression check across all runs. 
+ #[derive(Debug, Serialize)] + pub(super) struct CheckResult { + checks: Vec, + } + + impl std::fmt::Display for CheckResult { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let header = [ + "Operation", + "Q", + "D", + "Dim", + "Min Before (ns/IP @ Dim)", + "Min After (ns/IP @ Dim)", + "Change (%)", + "Remark", + ]; + + let mut table = + diskann_benchmark_runner::utils::fmt::Table::new(header, self.checks.len()); + + for (i, c) in self.checks.iter().enumerate() { + let mut row = table.row(i); + let change = relative_change(c.before_min, c.after_min); + + row.insert(c.run.operation, 0); + row.insert(c.run.num_query_vectors, 1); + row.insert(c.run.num_doc_vectors, 2); + row.insert(c.run.dim, 3); + row.insert(format!("{:.3}", c.before_min), 4); + row.insert(format!("{:.3}", c.after_min), 5); + match change { + Ok(change) => { + row.insert(format!("{:.3} %", change * 100.0), 6); + if change > c.tolerance.min_time_regression.get() { + row.insert("FAIL", 7); + } + } + Err(err) => { + row.insert("invalid", 6); + row.insert(err, 7); + } + } + } + + table.fmt(f) + } + } + + /////////////// + // Benchmark // + /////////////// + + pub(super) trait RunBenchmark { + fn run_benchmark(&self, input: &MultiVectorOp) -> Result, anyhow::Error>; + } + + #[derive(Debug, Serialize, Deserialize)] + pub(super) struct RunResult { + /// The configuration for this run. + run: Run, + /// Per-measurement latencies (over `loops_per_measurement` calls). + latencies: Vec, + /// Latency percentiles. 
+ percentiles: percentiles::Percentiles, + } + + impl RunResult { + fn computations_per_latency(&self) -> usize { + self.run.num_query_vectors.get() + * self.run.num_doc_vectors.get() + * self.run.loops_per_measurement.get() + } + } + + impl std::fmt::Display for DisplayWrapper<'_, [RunResult]> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if self.is_empty() { + return Ok(()); + } + + // ns/IP is normalized as `min_latency_us * 1000 / (Q * D * loops)` and is + // approximately linear in `dim`. Compare across rows with the same `Dim`; + // divide further by `Dim` to recover ns per scalar multiply. + writeln!( + f, + "ns/IP = time per (query, doc) inner-product call (~ linear in Dim)" + )?; + + let header = [ + "Operation", + "Q", + "D", + "Dim", + "Min Time (ns/IP @ Dim)", + "Mean Time (ns/IP @ Dim)", + "Loops", + "Measurements", + ]; + + let mut table = diskann_benchmark_runner::utils::fmt::Table::new(header, self.len()); + + self.iter().enumerate().for_each(|(row, r)| { + let mut row = table.row(row); + + let min_latency = r + .latencies + .iter() + .min() + .copied() + .unwrap_or(MicroSeconds::new(u64::MAX)); + let mean_latency = r.percentiles.mean; + + let computations_per_latency = r.computations_per_latency() as f64; + + // Convert time from micro-seconds to nano-seconds per inner-product call + // (one (query, doc) pair, ~ linear in dim). 
+ let min_time = min_latency.as_f64() / computations_per_latency * 1000.0; + let mean_time = mean_latency / computations_per_latency * 1000.0; + + row.insert(r.run.operation, 0); + row.insert(r.run.num_query_vectors, 1); + row.insert(r.run.num_doc_vectors, 2); + row.insert(r.run.dim, 3); + row.insert(format!("{:.3}", min_time), 4); + row.insert(format!("{:.3}", mean_time), 5); + row.insert(r.run.loops_per_measurement, 6); + row.insert(r.run.num_measurements, 7); + }); + + table.fmt(f) + } + } + + fn run_loops(run: &Run, mut body: F) -> RunResult + where + F: FnMut(), + { + let mut latencies = Vec::with_capacity(run.num_measurements.get()); + + for _ in 0..run.num_measurements.get() { + let start = std::time::Instant::now(); + for _ in 0..run.loops_per_measurement.get() { + body(); + } + latencies.push(start.elapsed().into()); + } + + let percentiles = percentiles::compute_percentiles(&mut latencies).unwrap(); + RunResult { + run: run.clone(), + latencies, + percentiles, + } + } + + /////////////////// + // Data fixtures // + /////////////////// + + const RNG_SEED: u64 = 0x12345; + + struct Data { + queries: Mat>, + docs: Mat>, + } + + impl Data + where + StandardUniform: Distribution, + { + fn new(run: &Run) -> Self { + let mut rng = StdRng::seed_from_u64(RNG_SEED); + let queries = Mat::new( + Standard::new(run.num_query_vectors.get(), run.dim.get()).unwrap(), + Init(|| StandardUniform.sample(&mut rng)), + ) + .unwrap(); + let docs = Mat::new( + Standard::new(run.num_doc_vectors.get(), run.dim.get()).unwrap(), + Init(|| StandardUniform.sample(&mut rng)), + ) + .unwrap(); + Self { queries, docs } + } + } + + ////////////////////// + // Distance kernels // + ////////////////////// + + /// Object-safe abstraction over a per-shape distance executor. 
+ /// + /// The two implementations ([`OptimizedDistance`] and [`ReferenceDistance`]) share the + /// same hot-loop nest in [`run_with_distance`]; dispatching through `&dyn Distance` + /// keeps `run_loops` from being monomorphised over the implementation axis. + trait Distance { + fn chamfer(&self, doc: MatRef<'_, Standard>) -> f32; + fn max_sim(&self, doc: MatRef<'_, Standard>, scores: &mut [f32]); + } + + /// Distance executor that drives [`QueryComputer`] (architecture-dispatched SIMD). + struct OptimizedDistance(QueryComputer); + + impl Distance for OptimizedDistance { + fn chamfer(&self, doc: MatRef<'_, Standard>) -> f32 { + self.0.chamfer(doc) + } + fn max_sim(&self, doc: MatRef<'_, Standard>, scores: &mut [f32]) { + self.0.max_sim(doc, scores); + } + } + + /// Distance executor that drives the [`Chamfer`] / [`MaxSim`] fallback path. + struct ReferenceDistance<'a, T: Copy>( + diskann_quantization::multi_vector::distance::QueryMatRef<'a, Standard>, + ); + + impl Distance for ReferenceDistance<'_, T> + where + InnerProduct: for<'q, 'd> PureDistanceFunction<&'q [T], &'d [T], f32>, + { + fn chamfer(&self, doc: MatRef<'_, Standard>) -> f32 { + Chamfer::evaluate(self.0, doc) + } + fn max_sim(&self, doc: MatRef<'_, Standard>, scores: &mut [f32]) { + // `MaxSim::new` is a non-empty check + pointer wrap, so constructing it per + // iteration is free — no need to hoist it out of the loop. + let mut max_sim = MaxSim::new(scores).unwrap(); + let _ = max_sim.evaluate(self.0, doc); + } + } + + ///////////////////// + // Implementations // + ///////////////////// + + /// Shared loop nest. The trait-object dispatch happens once per outer iteration of + /// `run_loops`; the work inside each `chamfer` / `max_sim` call is O(Q*D*dim), so the + /// vtable hop is in the noise. 
+ fn run_with_distance( + run: &Run, + doc: MatRef<'_, Standard>, + dist: &dyn Distance, + ) -> RunResult { + match run.operation { + Operation::Chamfer => run_loops(run, || { + let v = dist.chamfer(doc); + std::hint::black_box(v); + }), + Operation::MaxSim => { + let mut scores = vec![0.0f32; run.num_query_vectors.get()]; + run_loops(run, || { + dist.max_sim(doc, &mut scores); + std::hint::black_box(&mut scores); + }) + } + } + } + + fn run_optimized(input: &MultiVectorOp) -> anyhow::Result> + where + T: Copy, + StandardUniform: Distribution, + QueryComputer: NewFromMatRef, + OptimizedDistance: Distance, + { + let mut results = Vec::with_capacity(input.runs.len()); + for run in input.runs.iter() { + let data = Data::::new(run); + // `QueryComputer` performs query-side precomputation that is intentionally + // amortized across many `chamfer` / `max_sim` calls; construct it once per + // shape, outside the timed loop. + let dist = OptimizedDistance( as NewFromMatRef>::new_from( + data.queries.as_view(), + )); + results.push(run_with_distance(run, data.docs.as_view(), &dist)); + } + Ok(results) + } + + /// Drive the [`Chamfer`] / [`MaxSim`] fallback path. + fn run_reference(input: &MultiVectorOp) -> anyhow::Result> + where + T: Copy, + StandardUniform: Distribution, + InnerProduct: for<'a, 'b> PureDistanceFunction<&'a [T], &'b [T], f32>, + for<'a> ReferenceDistance<'a, T>: Distance, + { + let mut results = Vec::with_capacity(input.runs.len()); + for run in input.runs.iter() { + let data = Data::::new(run); + let dist = ReferenceDistance(data.queries.as_view().into()); + results.push(run_with_distance(run, data.docs.as_view(), &dist)); + } + Ok(results) + } + + /// Element-type-erasing constructor for [`QueryComputer`]. + /// + /// `QueryComputer::::new` is defined as an inherent method on the concrete + /// `QueryComputer` / `QueryComputer` types (not a generic), so we need + /// this shim trait to let generic code (e.g. `run_optimized`) call it. 
+ trait NewFromMatRef { + fn new_from(query: MatRef<'_, Standard>) -> QueryComputer; + } + + impl NewFromMatRef for QueryComputer { + fn new_from(query: MatRef<'_, Standard>) -> QueryComputer { + QueryComputer::::new(query) + } + } + + impl NewFromMatRef for QueryComputer { + fn new_from(query: MatRef<'_, Standard>) -> QueryComputer { + QueryComputer::::new(query) + } + } + + impl RunBenchmark for Kernel + where + T: Copy + 'static, + StandardUniform: Distribution, + QueryComputer: NewFromMatRef, + OptimizedDistance: Distance, + { + fn run_benchmark(&self, input: &MultiVectorOp) -> anyhow::Result> { + run_optimized::(input) + } + } + + impl RunBenchmark for Kernel + where + T: Copy + 'static, + StandardUniform: Distribution, + InnerProduct: for<'a, 'b> PureDistanceFunction<&'a [T], &'b [T], f32>, + for<'a> ReferenceDistance<'a, T>: Distance, + { + fn run_benchmark(&self, input: &MultiVectorOp) -> anyhow::Result> { + run_reference::(input) + } + } + + /////////// + // Tests // + /////////// + + #[cfg(test)] + mod tests { + use std::num::NonZeroUsize; + + use diskann_benchmark_runner::{ + benchmark::{PassFail, Regression}, + utils::{datatype::DataType, num::NonNegativeFinite, percentiles::compute_percentiles}, + }; + + use super::*; + + fn tiny_run(operation: Operation) -> Run { + Run { + operation, + num_query_vectors: NonZeroUsize::new(2).unwrap(), + num_doc_vectors: NonZeroUsize::new(2).unwrap(), + dim: NonZeroUsize::new(4).unwrap(), + loops_per_measurement: NonZeroUsize::new(1).unwrap(), + num_measurements: NonZeroUsize::new(1).unwrap(), + } + } + + fn tiny_op() -> MultiVectorOp { + MultiVectorOp { + element_type: DataType::Float32, + implementation: Implementation::Optimized, + runs: vec![tiny_run(Operation::Chamfer)], + } + } + + fn tiny_result(operation: Operation, minimum: u64) -> RunResult { + let run = tiny_run(operation); + let minimum = MicroSeconds::new(minimum); + let mut latencies = vec![minimum]; + let percentiles = compute_percentiles(&mut 
latencies).unwrap(); + RunResult { + run, + latencies, + percentiles, + } + } + + fn tolerance(limit: f64) -> MultiVectorTolerance { + MultiVectorTolerance { + min_time_regression: NonNegativeFinite::new(limit).unwrap(), + } + } + + #[test] + fn check_rejects_mismatched_runs() { + let kernel = Kernel::::new(); + + let err = kernel + .check( + &tolerance(0.0), + &tiny_op(), + &vec![tiny_result(Operation::Chamfer, 100)], + &vec![tiny_result(Operation::MaxSim, 100)], + ) + .unwrap_err(); + + assert_eq!(err.to_string(), "run 0 mismatched"); + } + + #[test] + fn check_allows_negative_relative_change() { + let kernel = Kernel::::new(); + + let result = kernel + .check( + &tolerance(0.0), + &tiny_op(), + &vec![tiny_result(Operation::Chamfer, 100)], + &vec![tiny_result(Operation::Chamfer, 95)], + ) + .unwrap(); + + assert!(matches!(result, PassFail::Pass(_))); + } + + #[test] + fn check_passes_on_tolerance_boundary() { + let kernel = Kernel::::new(); + + let result = kernel + .check( + &tolerance(0.05), + &tiny_op(), + &vec![tiny_result(Operation::Chamfer, 100)], + &vec![tiny_result(Operation::Chamfer, 105)], + ) + .unwrap(); + + assert!(matches!(result, PassFail::Pass(_))); + } + + #[test] + fn check_fails_above_tolerance_boundary() { + let kernel = Kernel::::new(); + + let result = kernel + .check( + &tolerance(0.05), + &tiny_op(), + &vec![tiny_result(Operation::Chamfer, 100)], + &vec![tiny_result(Operation::Chamfer, 106)], + ) + .unwrap(); + + assert!(matches!(result, PassFail::Fail(_))); + } + + #[test] + fn check_result_display_includes_failure_details() { + let check = CheckResult { + checks: vec![Comparison { + run: tiny_run(Operation::Chamfer), + tolerance: tolerance(0.05), + before_min: 100.0, + after_min: 106.0, + }], + }; + + let rendered = check.to_string(); + assert!(rendered.contains("Operation"), "rendered = {rendered}"); + assert!(rendered.contains("chamfer"), "rendered = {rendered}"); + assert!(rendered.contains("100.000"), "rendered = {rendered}"); + 
assert!(rendered.contains("106.000"), "rendered = {rendered}"); + assert!(rendered.contains("6.000 %"), "rendered = {rendered}"); + assert!(rendered.contains("FAIL"), "rendered = {rendered}"); + } + + /// A "before" value of 0 means the measurement was too fast to obtain a + /// reliable signal, so we *could* be letting a regression through. We + /// require at least a non-zero value. + #[test] + fn zero_values_rejected() { + let kernel = Kernel::::new(); + + let result = kernel + .check( + &tolerance(0.05), + &tiny_op(), + &vec![tiny_result(Operation::Chamfer, 0)], + &vec![tiny_result(Operation::Chamfer, 0)], + ) + .unwrap(); + + assert!(matches!(result, PassFail::Fail(_))); + } + } +} diff --git a/diskann-benchmark/src/inputs/mod.rs b/diskann-benchmark/src/inputs/mod.rs index 856412e2a..414a0b52e 100644 --- a/diskann-benchmark/src/inputs/mod.rs +++ b/diskann-benchmark/src/inputs/mod.rs @@ -7,6 +7,7 @@ pub(crate) mod disk; pub(crate) mod exhaustive; pub(crate) mod filters; pub(crate) mod graph_index; +pub(crate) mod multi_vector; pub(crate) mod save_and_load; pub(crate) fn register_inputs( @@ -16,6 +17,7 @@ pub(crate) fn register_inputs( exhaustive::register_inputs(registry)?; disk::register_inputs(registry)?; filters::register_inputs(registry)?; + multi_vector::register_inputs(registry)?; Ok(()) } diff --git a/diskann-benchmark/src/inputs/multi_vector.rs b/diskann-benchmark/src/inputs/multi_vector.rs new file mode 100644 index 000000000..8010162d6 --- /dev/null +++ b/diskann-benchmark/src/inputs/multi_vector.rs @@ -0,0 +1,190 @@ +/* + * Copyright (c) Microsoft Corporation. + * Licensed under the MIT license. 
+ */ + +use std::num::NonZeroUsize; + +use diskann_benchmark_runner::{ + utils::{datatype::DataType, num::NonNegativeFinite}, + CheckDeserialization, Checker, +}; +use serde::{Deserialize, Serialize}; + +use crate::inputs::{as_input, Example}; + +////////////// +// Registry // +////////////// + +as_input!(MultiVectorOp); +as_input!(MultiVectorTolerance); + +pub(super) fn register_inputs( + registry: &mut diskann_benchmark_runner::registry::Inputs, +) -> anyhow::Result<()> { + registry.register::()?; + registry.register::()?; + Ok(()) +} + +//////////////// +// Enum types // +//////////////// + +/// The two distance operations exposed by `QueryComputer`. +#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub(crate) enum Operation { + Chamfer, + MaxSim, +} + +impl std::fmt::Display for Operation { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let st = match self { + Self::Chamfer => "chamfer", + Self::MaxSim => "max_sim", + }; + write!(f, "{}", st) + } +} + +/// Which implementation tier to benchmark. +#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "kebab-case")] +pub(crate) enum Implementation { + Optimized, + Reference, +} + +impl std::fmt::Display for Implementation { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let st = match self { + Self::Optimized => "optimized", + Self::Reference => "reference", + }; + write!(f, "{}", st) + } +} + +/// One benchmark configuration: a single (operation, shape) measurement. 
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub(crate) struct Run { + pub(crate) operation: Operation, + pub(crate) num_query_vectors: NonZeroUsize, + pub(crate) num_doc_vectors: NonZeroUsize, + pub(crate) dim: NonZeroUsize, + pub(crate) loops_per_measurement: NonZeroUsize, + pub(crate) num_measurements: NonZeroUsize, +} + +/////////////////////// +// Multi-Vector Op // +/////////////////////// + +/// A complete multi-vector benchmark job. +#[derive(Debug, Serialize, Deserialize)] +pub(crate) struct MultiVectorOp { + pub(crate) element_type: DataType, + pub(crate) implementation: Implementation, + pub(crate) runs: Vec, +} + +impl MultiVectorOp { + pub(crate) const fn tag() -> &'static str { + "multi-vector-op" + } +} + +impl CheckDeserialization for MultiVectorOp { + fn check_deserialization(&mut self, _checker: &mut Checker) -> Result<(), anyhow::Error> { + Ok(()) + } +} + +impl Example for MultiVectorOp { + fn example() -> Self { + const NUM_QUERY_VECTORS: NonZeroUsize = NonZeroUsize::new(32).unwrap(); + const NUM_DOC_VECTORS: NonZeroUsize = NonZeroUsize::new(64).unwrap(); + const DIM: NonZeroUsize = NonZeroUsize::new(128).unwrap(); + const LOOPS_PER_MEASUREMENT: NonZeroUsize = NonZeroUsize::new(200).unwrap(); + const NUM_MEASUREMENTS: NonZeroUsize = NonZeroUsize::new(100).unwrap(); + + let runs = vec![ + Run { + operation: Operation::Chamfer, + num_query_vectors: NUM_QUERY_VECTORS, + num_doc_vectors: NUM_DOC_VECTORS, + dim: DIM, + loops_per_measurement: LOOPS_PER_MEASUREMENT, + num_measurements: NUM_MEASUREMENTS, + }, + Run { + operation: Operation::MaxSim, + num_query_vectors: NUM_QUERY_VECTORS, + num_doc_vectors: NUM_DOC_VECTORS, + dim: DIM, + loops_per_measurement: LOOPS_PER_MEASUREMENT, + num_measurements: NUM_MEASUREMENTS, + }, + ]; + + Self { + element_type: DataType::Float32, + implementation: Implementation::Optimized, + runs, + } + } +} + +macro_rules! 
write_field { + ($f:ident, $field:tt, $($expr:tt)*) => { + writeln!($f, "{:>18}: {}", $field, $($expr)*) + } +} + +impl std::fmt::Display for MultiVectorOp { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + writeln!(f, "Multi-Vector Operation\n")?; + write_field!(f, "tag", Self::tag())?; + write_field!(f, "element type", self.element_type)?; + write_field!(f, "implementation", self.implementation)?; + write_field!(f, "number of runs", self.runs.len())?; + Ok(()) + } +} + +///////////////////////////// +// Multi-Vector Tolerance // +///////////////////////////// + +/// Tolerance thresholds for multi-vector benchmark regression detection. +/// +/// Each field specifies the maximum allowed relative increase in the corresponding metric. +/// For example, a value of `0.05` means a 5% increase is tolerated. +#[derive(Debug, Clone, Copy, Serialize, Deserialize)] +pub(crate) struct MultiVectorTolerance { + pub(crate) min_time_regression: NonNegativeFinite, +} + +impl MultiVectorTolerance { + pub(crate) const fn tag() -> &'static str { + "multi-vector-tolerance" + } +} + +impl CheckDeserialization for MultiVectorTolerance { + fn check_deserialization(&mut self, _checker: &mut Checker) -> Result<(), anyhow::Error> { + Ok(()) + } +} + +impl Example for MultiVectorTolerance { + fn example() -> Self { + Self { + min_time_regression: NonNegativeFinite::new(0.05) + .expect("0.05 is a valid non-negative finite"), + } + } +} diff --git a/diskann-benchmark/src/main.rs b/diskann-benchmark/src/main.rs index 424e63bb7..c7276f2e1 100644 --- a/diskann-benchmark/src/main.rs +++ b/diskann-benchmark/src/main.rs @@ -776,6 +776,92 @@ mod tests { assert!(!output_path.exists()); } + /////////////////// + // Multi-Vector // + /////////////////// + + #[test] + fn multi_vector_integration() { + let path = example_directory().join("multi-vector-test.json"); + let tempdir = tempfile::tempdir().unwrap(); + let output_path = tempdir.path().join("output.json"); + 
assert!(!output_path.exists()); + + let modified_input_path = tempdir.path().join("input.json"); + + let mut raw = value_from_file(&path); + prefix_search_directories(&mut raw, &root_directory()); + save_to_file(&modified_input_path, &raw); + + run_multi_vector_integration(&modified_input_path, &output_path) + } + + #[cfg(feature = "multi-vector")] + fn run_multi_vector_integration(input_path: &std::path::Path, output_path: &std::path::Path) { + let command = Commands::Run { + input_file: input_path.to_owned(), + output_file: output_path.to_owned(), + dry_run: false, + allow_debug: true, + }; + + let cli = Cli::from_commands(command, true); + let mut output = Memory::new(); + + cli.run(&mut output).unwrap(); + println!( + "output = {}", + String::from_utf8(output.into_inner()).unwrap() + ); + + // Check that the results file is generated. + assert!(output_path.exists()); + } + + #[cfg(not(feature = "multi-vector"))] + fn run_multi_vector_integration(input_path: &std::path::Path, output_path: &std::path::Path) { + let command = Commands::Run { + input_file: input_path.to_owned(), + output_file: output_path.to_owned(), + dry_run: false, + allow_debug: true, + }; + let cli = Cli::from_commands(command, true); + let mut output = Memory::new(); + + let err = cli.run(&mut output).unwrap_err(); + println!("err = {:?}", err); + + let output = String::from_utf8(output.into_inner()).unwrap(); + assert!(output.contains("\"multi-vector\" feature")); + println!("output = {}", output); + + // The output file should not have been created because we failed the test. 
+ assert!(!output_path.exists()); + } + + #[test] + #[cfg(feature = "multi-vector")] + fn multi_vector_check_verify() { + let input_path = example_directory().join("multi-vector-test.json"); + let tolerance_path = project_directory() + .join("perf_test_inputs") + .join("multi-vector-tolerance.json"); + + let command = Commands::Check(diskann_benchmark_runner::app::Check::Verify { + tolerances: tolerance_path, + input_file: input_path, + }); + + let cli = Cli::from_commands(command, true); + let mut output = Memory::new(); + cli.run(&mut output).unwrap(); + println!( + "output = {}", + String::from_utf8(output.into_inner()).unwrap() + ); + } + #[test] fn quiet_suppresses_check_target_warning() { let cli = Cli::from_commands(Commands::Skeleton, true); diff --git a/diskann-quantization/src/multi_vector/matrix.rs b/diskann-quantization/src/multi_vector/matrix.rs index 70629d44c..bcbafaaa3 100644 --- a/diskann-quantization/src/multi_vector/matrix.rs +++ b/diskann-quantization/src/multi_vector/matrix.rs @@ -244,6 +244,18 @@ pub unsafe trait NewOwned: ReprOwned { #[derive(Debug, Clone, Copy)] pub struct Defaulted; +/// An initializer argument to [`NewOwned`] that invokes the wrapped closure for each +/// element. +/// +/// # Example +/// ``` +/// use diskann_quantization::multi_vector::{Init, Mat, Standard}; +/// let mut n = 0; +/// let mat = Mat::new(Standard::::new(1, 4).unwrap(), Init(|| { n += 1; n })).unwrap(); +/// assert_eq!(mat.as_slice(), &[1, 2, 3, 4]); +/// ``` +pub struct Init(pub F); + /// Create a new [`Mat`] cloned from a view. pub trait NewCloned: ReprOwned { /// Clone the contents behind `v`, returning a new owning [`Mat`]. @@ -514,6 +526,22 @@ where } } +// SAFETY: The implementation uses guarantees from `Box` to ensure that the pointer +// initialized by it is non-null and properly aligned to the underlying type. 
+unsafe impl NewOwned> for Standard +where + T: Copy, + F: FnMut() -> T, +{ + type Error = crate::error::Infallible; + fn new_owned(self, mut init: Init) -> Result, Self::Error> { + let b: Box<[T]> = (0..self.num_elements()).map(|_| (init.0)()).collect(); + + // SAFETY: By construction, `b` has length `self.num_elements()`. + Ok(unsafe { self.box_to_mat(b) }) + } +} + // SAFETY: This checks that the slice has the correct length, which is all that is // required for [`Repr`]. unsafe impl NewRef for Standard @@ -1767,6 +1795,22 @@ mod tests { } } + #[test] + fn test_standard_new_owned_with_init() { + let mut counter: i32 = 0; + let m = Mat::new( + Standard::::new(2, 3).unwrap(), + Init(|| { + let v = counter; + counter += 1; + v + }), + ) + .unwrap(); + + assert_eq!(m.as_slice(), &[0, 1, 2, 3, 4, 5]); + } + #[test] fn matref_new_slice_length_error() { let repr = Standard::::new(3, 4).unwrap(); diff --git a/diskann-quantization/src/multi_vector/mod.rs b/diskann-quantization/src/multi_vector/mod.rs index 3670b1aaf..1d765bacc 100644 --- a/diskann-quantization/src/multi_vector/mod.rs +++ b/diskann-quantization/src/multi_vector/mod.rs @@ -74,6 +74,6 @@ pub(crate) mod matrix; pub use block_transposed::{BlockTransposed, BlockTransposedMut, BlockTransposedRef}; pub use distance::{Chamfer, MaxSim, MaxSimError, QueryComputer, QueryMatRef}; pub use matrix::{ - Defaulted, LayoutError, Mat, MatMut, MatRef, NewCloned, NewMut, NewOwned, NewRef, Overflow, - Repr, ReprMut, ReprOwned, SliceError, Standard, + Defaulted, Init, LayoutError, Mat, MatMut, MatRef, NewCloned, NewMut, NewOwned, NewRef, + Overflow, Repr, ReprMut, ReprOwned, SliceError, Standard, }; From a64279e7172bbaaf3dd3be6cca8b7a05746fb2ae Mon Sep 17 00:00:00 2001 From: Suryansh Gupta Date: Thu, 14 May 2026 20:36:54 +0530 Subject: [PATCH 09/13] Revamp the benchmark to be also kernel-research seam --- diskann-benchmark/Cargo.toml | 2 +- .../example/multi-vector-test.json | 47 - diskann-benchmark/example/multi-vector.json | 
92 +- .../perf_test_inputs/multi-vector.json | 149 ++++ diskann-benchmark/src/backend/multi_vector.rs | 806 ------------------ .../src/backend/multi_vector/driver.rs | 279 ++++++ .../backend/multi_vector/experimental/mod.rs | 23 + .../multi_vector/experimental/template.rs | 254 ++++++ .../backend/multi_vector/library_kernels.rs | 510 +++++++++++ .../src/backend/multi_vector/mod.rs | 233 +++++ diskann-benchmark/src/inputs/multi_vector.rs | 63 +- diskann-benchmark/src/main.rs | 4 +- .../src/multi_vector/distance/kernels/f16.rs | 2 +- .../multi_vector/distance/kernels/f32/mod.rs | 6 +- .../distance/kernels/f32/scalar.rs | 2 +- .../multi_vector/distance/kernels/f32/v3.rs | 2 +- .../multi_vector/distance/kernels/layouts.rs | 37 +- .../src/multi_vector/distance/kernels/mod.rs | 57 +- .../distance/kernels/tiled_reduce.rs | 12 +- .../src/multi_vector/distance/mod.rs | 4 +- .../distance/query_computer/f16.rs | 15 +- .../distance/query_computer/f32.rs | 17 +- .../distance/query_computer/mod.rs | 65 +- .../src/multi_vector/matrix.rs | 60 +- diskann-quantization/src/multi_vector/mod.rs | 4 +- 25 files changed, 1692 insertions(+), 1053 deletions(-) delete mode 100644 diskann-benchmark/example/multi-vector-test.json create mode 100644 diskann-benchmark/perf_test_inputs/multi-vector.json delete mode 100644 diskann-benchmark/src/backend/multi_vector.rs create mode 100644 diskann-benchmark/src/backend/multi_vector/driver.rs create mode 100644 diskann-benchmark/src/backend/multi_vector/experimental/mod.rs create mode 100644 diskann-benchmark/src/backend/multi_vector/experimental/template.rs create mode 100644 diskann-benchmark/src/backend/multi_vector/library_kernels.rs create mode 100644 diskann-benchmark/src/backend/multi_vector/mod.rs diff --git a/diskann-benchmark/Cargo.toml b/diskann-benchmark/Cargo.toml index efd058ffb..ecc3a53dd 100644 --- a/diskann-benchmark/Cargo.toml +++ b/diskann-benchmark/Cargo.toml @@ -63,7 +63,7 @@ scalar-quantization = [] # Enable minmax-quantization 
based algorithms minmax-quantization = [] -# Enable multi-vector distance benchmarks (Chamfer / MaxSim) +# Enable multi-vector MaxSim distance benchmarks multi-vector = [] # Enable Disk Index benchmarks diff --git a/diskann-benchmark/example/multi-vector-test.json b/diskann-benchmark/example/multi-vector-test.json deleted file mode 100644 index 28e9b9d64..000000000 --- a/diskann-benchmark/example/multi-vector-test.json +++ /dev/null @@ -1,47 +0,0 @@ -{ - "search_directories": [], - "jobs": [ - { - "type": "multi-vector-op", - "content": { - "element_type": "float32", - "implementation": "optimized", - "runs": [ - { "operation": "chamfer", "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 2, "num_measurements": 1 }, - { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 2, "num_measurements": 1 } - ] - } - }, - { - "type": "multi-vector-op", - "content": { - "element_type": "float16", - "implementation": "optimized", - "runs": [ - { "operation": "chamfer", "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 2, "num_measurements": 1 } - ] - } - }, - { - "type": "multi-vector-op", - "content": { - "element_type": "float32", - "implementation": "reference", - "runs": [ - { "operation": "chamfer", "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 2, "num_measurements": 1 }, - { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 2, "num_measurements": 1 } - ] - } - }, - { - "type": "multi-vector-op", - "content": { - "element_type": "float16", - "implementation": "reference", - "runs": [ - { "operation": "max_sim", "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 2, "num_measurements": 1 } - ] - } - } - ] -} diff --git a/diskann-benchmark/example/multi-vector.json b/diskann-benchmark/example/multi-vector.json index 
553a6a9d8..7a4e59539 100644 --- a/diskann-benchmark/example/multi-vector.json +++ b/diskann-benchmark/example/multi-vector.json @@ -5,55 +5,20 @@ "type": "multi-vector-op", "content": { "element_type": "float32", - "implementation": "optimized", + "arch": "auto", "runs": [ - { "operation": "chamfer", "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 }, - { "operation": "chamfer", "num_query_vectors": 16, "num_doc_vectors": 64, "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 }, - { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 20, "num_measurements": 50 }, - { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 }, - { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 50, "num_measurements": 50 }, - { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 }, - { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 50 }, - { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 }, - { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 32, "dim": 512, "loops_per_measurement": 50, "num_measurements": 50 }, - - { "operation": "max_sim", "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 }, - { "operation": "max_sim", "num_query_vectors": 16, "num_doc_vectors": 64, "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 }, - { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 20, "num_measurements": 50 
}, - { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 }, - { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 50, "num_measurements": 50 }, - { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 }, - { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 50 }, - { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 }, - { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 32, "dim": 512, "loops_per_measurement": 50, "num_measurements": 50 } + { "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 2, "num_measurements": 1 }, + { "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 2, "num_measurements": 1 } ] } }, { "type": "multi-vector-op", "content": { - "element_type": "float16", - "implementation": "optimized", + "element_type": "float32", + "arch": "scalar", "runs": [ - { "operation": "chamfer", "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 }, - { "operation": "chamfer", "num_query_vectors": 16, "num_doc_vectors": 64, "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 }, - { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 20, "num_measurements": 50 }, - { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 }, - { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 50, "num_measurements": 50 }, - { 
"operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 }, - { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 50 }, - { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 }, - { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 32, "dim": 512, "loops_per_measurement": 50, "num_measurements": 50 }, - - { "operation": "max_sim", "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 }, - { "operation": "max_sim", "num_query_vectors": 16, "num_doc_vectors": 64, "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 }, - { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 20, "num_measurements": 50 }, - { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 }, - { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 50, "num_measurements": 50 }, - { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 }, - { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 50 }, - { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 }, - { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 32, "dim": 512, "loops_per_measurement": 50, "num_measurements": 50 } + { "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 2, "num_measurements": 1 } ] } }, @@ -61,27 
+26,10 @@ "type": "multi-vector-op", "content": { "element_type": "float32", - "implementation": "reference", + "arch": "reference", "runs": [ - { "operation": "chamfer", "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 }, - { "operation": "chamfer", "num_query_vectors": 16, "num_doc_vectors": 64, "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 }, - { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 20, "num_measurements": 50 }, - { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 }, - { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 50, "num_measurements": 50 }, - { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 }, - { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 50 }, - { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 }, - { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 32, "dim": 512, "loops_per_measurement": 50, "num_measurements": 50 }, - - { "operation": "max_sim", "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 }, - { "operation": "max_sim", "num_query_vectors": 16, "num_doc_vectors": 64, "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 }, - { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 20, "num_measurements": 50 }, - { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 200, 
"num_measurements": 50 }, - { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 50, "num_measurements": 50 }, - { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 }, - { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 50 }, - { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 }, - { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 32, "dim": 512, "loops_per_measurement": 50, "num_measurements": 50 } + { "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 2, "num_measurements": 1 }, + { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 2, "num_measurements": 1 } ] } }, @@ -89,27 +37,9 @@ "type": "multi-vector-op", "content": { "element_type": "float16", - "implementation": "reference", + "arch": "auto", "runs": [ - { "operation": "chamfer", "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 }, - { "operation": "chamfer", "num_query_vectors": 16, "num_doc_vectors": 64, "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 }, - { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 20, "num_measurements": 50 }, - { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 }, - { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 50, "num_measurements": 50 }, - { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 }, - { 
"operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 50 }, - { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 }, - { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 32, "dim": 512, "loops_per_measurement": 50, "num_measurements": 50 }, - - { "operation": "max_sim", "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 }, - { "operation": "max_sim", "num_query_vectors": 16, "num_doc_vectors": 64, "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 }, - { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 20, "num_measurements": 50 }, - { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 }, - { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 50, "num_measurements": 50 }, - { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 }, - { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 50 }, - { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 }, - { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 32, "dim": 512, "loops_per_measurement": 50, "num_measurements": 50 } + { "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 2, "num_measurements": 1 } ] } } diff --git a/diskann-benchmark/perf_test_inputs/multi-vector.json b/diskann-benchmark/perf_test_inputs/multi-vector.json new file mode 100644 index 
000000000..57922fe10 --- /dev/null +++ b/diskann-benchmark/perf_test_inputs/multi-vector.json @@ -0,0 +1,149 @@ +{ + "search_directories": [], + "jobs": [ + { + "type": "multi-vector-op", + "content": { + "element_type": "float32", + "arch": "auto", + "runs": [ + { "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 }, + { "num_query_vectors": 16, "num_doc_vectors": 64, "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 20, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 50, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 32, "dim": 512, "loops_per_measurement": 50, "num_measurements": 50 } + ] + } + }, + { + "type": "multi-vector-op", + "content": { + "element_type": "float32", + "arch": "scalar", + "runs": [ + { "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 }, + { "num_query_vectors": 16, "num_doc_vectors": 64, "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 20, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 
264, "loops_per_measurement": 50, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 32, "dim": 512, "loops_per_measurement": 50, "num_measurements": 50 } + ] + } + }, + { + "type": "multi-vector-op", + "content": { + "element_type": "float32", + "arch": "x86-64-v3", + "runs": [ + { "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 }, + { "num_query_vectors": 16, "num_doc_vectors": 64, "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 20, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 50, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 32, "dim": 512, "loops_per_measurement": 50, "num_measurements": 50 } + ] + } + }, + { + "type": "multi-vector-op", + "content": { + "element_type": "float32", + "arch": "x86-64-v4", + "runs": [ + { "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 }, + { 
"num_query_vectors": 16, "num_doc_vectors": 64, "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 20, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 50, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 32, "dim": 512, "loops_per_measurement": 50, "num_measurements": 50 } + ] + } + }, + { + "type": "multi-vector-op", + "content": { + "element_type": "float32", + "arch": "reference", + "runs": [ + { "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 }, + { "num_query_vectors": 16, "num_doc_vectors": 64, "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 20, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 50, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 128, 
"loops_per_measurement": 200, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 32, "dim": 512, "loops_per_measurement": 50, "num_measurements": 50 } + ] + } + }, + { + "type": "multi-vector-op", + "content": { + "element_type": "float16", + "arch": "x86-64-v3", + "runs": [ + { "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 }, + { "num_query_vectors": 16, "num_doc_vectors": 64, "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 20, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 50, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 32, "dim": 512, "loops_per_measurement": 50, "num_measurements": 50 } + ] + } + }, + { + "type": "multi-vector-op", + "content": { + "element_type": "float16", + "arch": "x86-64-v4", + "runs": [ + { "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 }, + { "num_query_vectors": 16, "num_doc_vectors": 64, "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 20, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 }, + { 
"num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 50, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 32, "dim": 512, "loops_per_measurement": 50, "num_measurements": 50 } + ] + } + }, + { + "type": "multi-vector-op", + "content": { + "element_type": "float16", + "arch": "reference", + "runs": [ + { "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 }, + { "num_query_vectors": 16, "num_doc_vectors": 64, "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 20, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 50, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 32, "dim": 512, "loops_per_measurement": 50, "num_measurements": 50 } + ] + } + } + ] +} diff --git a/diskann-benchmark/src/backend/multi_vector.rs b/diskann-benchmark/src/backend/multi_vector.rs deleted file mode 100644 index cfdb77f33..000000000 --- 
a/diskann-benchmark/src/backend/multi_vector.rs +++ /dev/null @@ -1,806 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. - * Licensed under the MIT license. - */ - -//! Multi-vector distance benchmarks (Chamfer / MaxSim) with regression detection. - -use diskann_benchmark_runner::registry::Benchmarks; - -// Create a stub-module if the "multi-vector" feature is disabled. -crate::utils::stub_impl!("multi-vector", inputs::multi_vector::MultiVectorOp); - -pub(super) fn register_benchmarks(benchmarks: &mut Benchmarks) { - #[cfg(feature = "multi-vector")] - { - use half::f16; - - // Optimized (architecture-dispatched QueryComputer). - benchmarks.register_regression( - "multi-vector-op-f32-optimized", - imp::Kernel::::new(), - ); - benchmarks.register_regression( - "multi-vector-op-f16-optimized", - imp::Kernel::::new(), - ); - - // Reference (Chamfer / MaxSim fallback path). - benchmarks.register_regression( - "multi-vector-op-f32-reference", - imp::Kernel::::new(), - ); - benchmarks.register_regression( - "multi-vector-op-f16-reference", - imp::Kernel::::new(), - ); - } - - // Stub implementation - #[cfg(not(feature = "multi-vector"))] - imp::register("multi-vector-op", benchmarks); -} - -#[cfg(feature = "multi-vector")] -mod imp { - use std::io::Write; - - use diskann_benchmark_runner::{ - benchmark::{PassFail, Regression}, - dispatcher::{DispatchRule, FailureScore, MatchScore}, - utils::{datatype, num::relative_change, percentiles, MicroSeconds}, - Benchmark, - }; - use diskann_quantization::multi_vector::{ - Chamfer, Init, Mat, MatRef, MaxSim, QueryComputer, Standard, - }; - use diskann_vector::distance::InnerProduct; - use diskann_vector::{DistanceFunctionMut, PureDistanceFunction}; - use half::f16; - use rand::{ - distr::{Distribution, StandardUniform}, - rngs::StdRng, - SeedableRng, - }; - use serde::{Deserialize, Serialize}; - - use crate::inputs::multi_vector::{ - Implementation, MultiVectorOp, MultiVectorTolerance, Operation, Run, - }; - - /////////// - // 
Utils // - /////////// - - #[derive(Debug, Clone, Copy)] - pub(super) struct DisplayWrapper<'a, T: ?Sized>(pub(super) &'a T); - - impl std::ops::Deref for DisplayWrapper<'_, T> { - type Target = T; - fn deref(&self) -> &T { - self.0 - } - } - - ////////////// - // Dispatch // - ////////////// - - /// Dispatch marker for the [`QueryComputer`] implementation. - #[derive(Debug)] - pub(super) struct Optimized; - - /// Dispatch marker for the [`Chamfer`] / [`MaxSim`] fallback. - #[derive(Debug)] - pub(super) struct Reference; - - /// A multi-vector benchmark. - pub(super) struct Kernel { - _type: std::marker::PhantomData<(I, T)>, - } - - impl Kernel { - pub(super) fn new() -> Self { - Self { - _type: std::marker::PhantomData, - } - } - } - - /// Pairs the standard `TryFrom` conversion with the static - /// description info needed for friendly diagnostics in `Benchmark::description`. - pub(super) trait ImplementationMatcher: - TryFrom + 'static - { - /// Human-readable description of which implementation this marker handles. - const DESCRIPTION: &'static str; - /// The implementation variant this marker expects (for mismatch diagnostics). 
- const EXPECTED: Implementation; - } - - impl TryFrom for Optimized { - type Error = FailureScore; - fn try_from(i: Implementation) -> Result { - match i { - Implementation::Optimized => Ok(Self), - _ => Err(FailureScore(1)), - } - } - } - - impl ImplementationMatcher for Optimized { - const DESCRIPTION: &'static str = "QueryComputer (architecture-dispatched)"; - const EXPECTED: Implementation = Implementation::Optimized; - } - - impl TryFrom for Reference { - type Error = FailureScore; - fn try_from(i: Implementation) -> Result { - match i { - Implementation::Reference => Ok(Self), - _ => Err(FailureScore(1)), - } - } - } - - impl ImplementationMatcher for Reference { - const DESCRIPTION: &'static str = "Chamfer / MaxSim fallback"; - const EXPECTED: Implementation = Implementation::Reference; - } - - impl Benchmark for Kernel - where - datatype::Type: DispatchRule, - I: ImplementationMatcher, - Kernel: RunBenchmark, - T: 'static, - { - type Input = MultiVectorOp; - type Output = Vec; - - fn try_match(&self, from: &MultiVectorOp) -> Result { - let mut failscore: Option = None; - if datatype::Type::::try_match(&from.element_type).is_err() { - *failscore.get_or_insert(0) += 10; - } - if let Err(FailureScore(score)) = I::try_from(from.implementation) { - *failscore.get_or_insert(0) += 2 + score; - } - - match failscore { - None => Ok(MatchScore(0)), - Some(score) => Err(FailureScore(score)), - } - } - - fn run( - &self, - input: &MultiVectorOp, - _: diskann_benchmark_runner::Checkpoint<'_>, - mut output: &mut dyn diskann_benchmark_runner::Output, - ) -> anyhow::Result { - // The dispatcher only invokes `run` after `try_match` has already accepted - // the input, so a failure here would indicate a dispatcher bug. 
- I::try_from(input.implementation).expect("try_match accepted the input"); - writeln!(output, "{}", input)?; - let results = self.run_benchmark(input)?; - writeln!(output, "\n\n{}", DisplayWrapper(&*results))?; - Ok(results) - } - - fn description( - &self, - f: &mut std::fmt::Formatter<'_>, - input: Option<&MultiVectorOp>, - ) -> std::fmt::Result { - match input { - None => { - writeln!( - f, - "- Element Type: {}", - diskann_benchmark_runner::dispatcher::Description::< - datatype::DataType, - datatype::Type, - >::new() - )?; - writeln!(f, "- Implementation: {}", I::DESCRIPTION)?; - } - Some(input) => { - if let Err(err) = datatype::Type::::try_match_verbose(&input.element_type) { - writeln!(f, "\n - Mismatched element type: {}", err)?; - } - if I::try_from(input.implementation).is_err() { - writeln!( - f, - "\n - Mismatched implementation: expected {}, got {}", - I::EXPECTED, - input.implementation - )?; - } - } - } - Ok(()) - } - } - - impl Regression for Kernel - where - datatype::Type: DispatchRule, - I: ImplementationMatcher, - Kernel: RunBenchmark, - T: 'static, - { - type Tolerances = MultiVectorTolerance; - type Pass = CheckResult; - type Fail = CheckResult; - - fn check( - &self, - tolerance: &MultiVectorTolerance, - _input: &MultiVectorOp, - before: &Vec, - after: &Vec, - ) -> anyhow::Result> { - anyhow::ensure!( - before.len() == after.len(), - "before has {} runs but after has {}", - before.len(), - after.len(), - ); - - let mut passed = true; - let checks: Vec = std::iter::zip(before.iter(), after.iter()) - .enumerate() - .map(|(i, (b, a))| { - anyhow::ensure!(b.run == a.run, "run {i} mismatched"); - - let computations_per_latency = b.computations_per_latency() as f64; - - let before_min = - b.percentiles.minimum.as_f64() * 1000.0 / computations_per_latency; - let after_min = - a.percentiles.minimum.as_f64() * 1000.0 / computations_per_latency; - - let comparison = Comparison { - run: b.run.clone(), - tolerance: *tolerance, - before_min, - after_min, 
- }; - - match relative_change(before_min, after_min) { - Ok(change) => { - if change > tolerance.min_time_regression.get() { - passed = false; - } - } - Err(_) => passed = false, - }; - - Ok(comparison) - }) - .collect::>>()?; - - let check = CheckResult { checks }; - - if passed { - Ok(PassFail::Pass(check)) - } else { - Ok(PassFail::Fail(check)) - } - } - } - - ////////////////////// - // Regression Check // - ////////////////////// - - /// Per-run comparison result showing before/after percentile differences. - #[derive(Debug, Serialize)] - pub(super) struct Comparison { - run: Run, - tolerance: MultiVectorTolerance, - before_min: f64, - after_min: f64, - } - - /// Aggregated result of the regression check across all runs. - #[derive(Debug, Serialize)] - pub(super) struct CheckResult { - checks: Vec, - } - - impl std::fmt::Display for CheckResult { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let header = [ - "Operation", - "Q", - "D", - "Dim", - "Min Before (ns/IP @ Dim)", - "Min After (ns/IP @ Dim)", - "Change (%)", - "Remark", - ]; - - let mut table = - diskann_benchmark_runner::utils::fmt::Table::new(header, self.checks.len()); - - for (i, c) in self.checks.iter().enumerate() { - let mut row = table.row(i); - let change = relative_change(c.before_min, c.after_min); - - row.insert(c.run.operation, 0); - row.insert(c.run.num_query_vectors, 1); - row.insert(c.run.num_doc_vectors, 2); - row.insert(c.run.dim, 3); - row.insert(format!("{:.3}", c.before_min), 4); - row.insert(format!("{:.3}", c.after_min), 5); - match change { - Ok(change) => { - row.insert(format!("{:.3} %", change * 100.0), 6); - if change > c.tolerance.min_time_regression.get() { - row.insert("FAIL", 7); - } - } - Err(err) => { - row.insert("invalid", 6); - row.insert(err, 7); - } - } - } - - table.fmt(f) - } - } - - /////////////// - // Benchmark // - /////////////// - - pub(super) trait RunBenchmark { - fn run_benchmark(&self, input: &MultiVectorOp) -> Result, 
anyhow::Error>; - } - - #[derive(Debug, Serialize, Deserialize)] - pub(super) struct RunResult { - /// The configuration for this run. - run: Run, - /// Per-measurement latencies (over `loops_per_measurement` calls). - latencies: Vec, - /// Latency percentiles. - percentiles: percentiles::Percentiles, - } - - impl RunResult { - fn computations_per_latency(&self) -> usize { - self.run.num_query_vectors.get() - * self.run.num_doc_vectors.get() - * self.run.loops_per_measurement.get() - } - } - - impl std::fmt::Display for DisplayWrapper<'_, [RunResult]> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - if self.is_empty() { - return Ok(()); - } - - // ns/IP is normalized as `min_latency_us * 1000 / (Q * D * loops)` and is - // approximately linear in `dim`. Compare across rows with the same `Dim`; - // divide further by `Dim` to recover ns per scalar multiply. - writeln!( - f, - "ns/IP = time per (query, doc) inner-product call (~ linear in Dim)" - )?; - - let header = [ - "Operation", - "Q", - "D", - "Dim", - "Min Time (ns/IP @ Dim)", - "Mean Time (ns/IP @ Dim)", - "Loops", - "Measurements", - ]; - - let mut table = diskann_benchmark_runner::utils::fmt::Table::new(header, self.len()); - - self.iter().enumerate().for_each(|(row, r)| { - let mut row = table.row(row); - - let min_latency = r - .latencies - .iter() - .min() - .copied() - .unwrap_or(MicroSeconds::new(u64::MAX)); - let mean_latency = r.percentiles.mean; - - let computations_per_latency = r.computations_per_latency() as f64; - - // Convert time from micro-seconds to nano-seconds per inner-product call - // (one (query, doc) pair, ~ linear in dim). 
- let min_time = min_latency.as_f64() / computations_per_latency * 1000.0; - let mean_time = mean_latency / computations_per_latency * 1000.0; - - row.insert(r.run.operation, 0); - row.insert(r.run.num_query_vectors, 1); - row.insert(r.run.num_doc_vectors, 2); - row.insert(r.run.dim, 3); - row.insert(format!("{:.3}", min_time), 4); - row.insert(format!("{:.3}", mean_time), 5); - row.insert(r.run.loops_per_measurement, 6); - row.insert(r.run.num_measurements, 7); - }); - - table.fmt(f) - } - } - - fn run_loops(run: &Run, mut body: F) -> RunResult - where - F: FnMut(), - { - let mut latencies = Vec::with_capacity(run.num_measurements.get()); - - for _ in 0..run.num_measurements.get() { - let start = std::time::Instant::now(); - for _ in 0..run.loops_per_measurement.get() { - body(); - } - latencies.push(start.elapsed().into()); - } - - let percentiles = percentiles::compute_percentiles(&mut latencies).unwrap(); - RunResult { - run: run.clone(), - latencies, - percentiles, - } - } - - /////////////////// - // Data fixtures // - /////////////////// - - const RNG_SEED: u64 = 0x12345; - - struct Data { - queries: Mat>, - docs: Mat>, - } - - impl Data - where - StandardUniform: Distribution, - { - fn new(run: &Run) -> Self { - let mut rng = StdRng::seed_from_u64(RNG_SEED); - let queries = Mat::new( - Standard::new(run.num_query_vectors.get(), run.dim.get()).unwrap(), - Init(|| StandardUniform.sample(&mut rng)), - ) - .unwrap(); - let docs = Mat::new( - Standard::new(run.num_doc_vectors.get(), run.dim.get()).unwrap(), - Init(|| StandardUniform.sample(&mut rng)), - ) - .unwrap(); - Self { queries, docs } - } - } - - ////////////////////// - // Distance kernels // - ////////////////////// - - /// Object-safe abstraction over a per-shape distance executor. 
- /// - /// The two implementations ([`OptimizedDistance`] and [`ReferenceDistance`]) share the - /// same hot-loop nest in [`run_with_distance`]; dispatching through `&dyn Distance` - /// keeps `run_loops` from being monomorphised over the implementation axis. - trait Distance { - fn chamfer(&self, doc: MatRef<'_, Standard>) -> f32; - fn max_sim(&self, doc: MatRef<'_, Standard>, scores: &mut [f32]); - } - - /// Distance executor that drives [`QueryComputer`] (architecture-dispatched SIMD). - struct OptimizedDistance(QueryComputer); - - impl Distance for OptimizedDistance { - fn chamfer(&self, doc: MatRef<'_, Standard>) -> f32 { - self.0.chamfer(doc) - } - fn max_sim(&self, doc: MatRef<'_, Standard>, scores: &mut [f32]) { - self.0.max_sim(doc, scores); - } - } - - /// Distance executor that drives the [`Chamfer`] / [`MaxSim`] fallback path. - struct ReferenceDistance<'a, T: Copy>( - diskann_quantization::multi_vector::distance::QueryMatRef<'a, Standard>, - ); - - impl Distance for ReferenceDistance<'_, T> - where - InnerProduct: for<'q, 'd> PureDistanceFunction<&'q [T], &'d [T], f32>, - { - fn chamfer(&self, doc: MatRef<'_, Standard>) -> f32 { - Chamfer::evaluate(self.0, doc) - } - fn max_sim(&self, doc: MatRef<'_, Standard>, scores: &mut [f32]) { - // `MaxSim::new` is a non-empty check + pointer wrap, so constructing it per - // iteration is free — no need to hoist it out of the loop. - let mut max_sim = MaxSim::new(scores).unwrap(); - let _ = max_sim.evaluate(self.0, doc); - } - } - - ///////////////////// - // Implementations // - ///////////////////// - - /// Shared loop nest. The trait-object dispatch happens once per outer iteration of - /// `run_loops`; the work inside each `chamfer` / `max_sim` call is O(Q*D*dim), so the - /// vtable hop is in the noise. 
- fn run_with_distance( - run: &Run, - doc: MatRef<'_, Standard>, - dist: &dyn Distance, - ) -> RunResult { - match run.operation { - Operation::Chamfer => run_loops(run, || { - let v = dist.chamfer(doc); - std::hint::black_box(v); - }), - Operation::MaxSim => { - let mut scores = vec![0.0f32; run.num_query_vectors.get()]; - run_loops(run, || { - dist.max_sim(doc, &mut scores); - std::hint::black_box(&mut scores); - }) - } - } - } - - fn run_optimized(input: &MultiVectorOp) -> anyhow::Result> - where - T: Copy, - StandardUniform: Distribution, - QueryComputer: NewFromMatRef, - OptimizedDistance: Distance, - { - let mut results = Vec::with_capacity(input.runs.len()); - for run in input.runs.iter() { - let data = Data::::new(run); - // `QueryComputer` performs query-side precomputation that is intentionally - // amortized across many `chamfer` / `max_sim` calls; construct it once per - // shape, outside the timed loop. - let dist = OptimizedDistance( as NewFromMatRef>::new_from( - data.queries.as_view(), - )); - results.push(run_with_distance(run, data.docs.as_view(), &dist)); - } - Ok(results) - } - - /// Drive the [`Chamfer`] / [`MaxSim`] fallback path. - fn run_reference(input: &MultiVectorOp) -> anyhow::Result> - where - T: Copy, - StandardUniform: Distribution, - InnerProduct: for<'a, 'b> PureDistanceFunction<&'a [T], &'b [T], f32>, - for<'a> ReferenceDistance<'a, T>: Distance, - { - let mut results = Vec::with_capacity(input.runs.len()); - for run in input.runs.iter() { - let data = Data::::new(run); - let dist = ReferenceDistance(data.queries.as_view().into()); - results.push(run_with_distance(run, data.docs.as_view(), &dist)); - } - Ok(results) - } - - /// Element-type-erasing constructor for [`QueryComputer`]. - /// - /// `QueryComputer::::new` is defined as an inherent method on the concrete - /// `QueryComputer` / `QueryComputer` types (not a generic), so we need - /// this shim trait to let generic code (e.g. `run_optimized`) call it. 
- trait NewFromMatRef { - fn new_from(query: MatRef<'_, Standard>) -> QueryComputer; - } - - impl NewFromMatRef for QueryComputer { - fn new_from(query: MatRef<'_, Standard>) -> QueryComputer { - QueryComputer::::new(query) - } - } - - impl NewFromMatRef for QueryComputer { - fn new_from(query: MatRef<'_, Standard>) -> QueryComputer { - QueryComputer::::new(query) - } - } - - impl RunBenchmark for Kernel - where - T: Copy + 'static, - StandardUniform: Distribution, - QueryComputer: NewFromMatRef, - OptimizedDistance: Distance, - { - fn run_benchmark(&self, input: &MultiVectorOp) -> anyhow::Result> { - run_optimized::(input) - } - } - - impl RunBenchmark for Kernel - where - T: Copy + 'static, - StandardUniform: Distribution, - InnerProduct: for<'a, 'b> PureDistanceFunction<&'a [T], &'b [T], f32>, - for<'a> ReferenceDistance<'a, T>: Distance, - { - fn run_benchmark(&self, input: &MultiVectorOp) -> anyhow::Result> { - run_reference::(input) - } - } - - /////////// - // Tests // - /////////// - - #[cfg(test)] - mod tests { - use std::num::NonZeroUsize; - - use diskann_benchmark_runner::{ - benchmark::{PassFail, Regression}, - utils::{datatype::DataType, num::NonNegativeFinite, percentiles::compute_percentiles}, - }; - - use super::*; - - fn tiny_run(operation: Operation) -> Run { - Run { - operation, - num_query_vectors: NonZeroUsize::new(2).unwrap(), - num_doc_vectors: NonZeroUsize::new(2).unwrap(), - dim: NonZeroUsize::new(4).unwrap(), - loops_per_measurement: NonZeroUsize::new(1).unwrap(), - num_measurements: NonZeroUsize::new(1).unwrap(), - } - } - - fn tiny_op() -> MultiVectorOp { - MultiVectorOp { - element_type: DataType::Float32, - implementation: Implementation::Optimized, - runs: vec![tiny_run(Operation::Chamfer)], - } - } - - fn tiny_result(operation: Operation, minimum: u64) -> RunResult { - let run = tiny_run(operation); - let minimum = MicroSeconds::new(minimum); - let mut latencies = vec![minimum]; - let percentiles = compute_percentiles(&mut 
latencies).unwrap(); - RunResult { - run, - latencies, - percentiles, - } - } - - fn tolerance(limit: f64) -> MultiVectorTolerance { - MultiVectorTolerance { - min_time_regression: NonNegativeFinite::new(limit).unwrap(), - } - } - - #[test] - fn check_rejects_mismatched_runs() { - let kernel = Kernel::::new(); - - let err = kernel - .check( - &tolerance(0.0), - &tiny_op(), - &vec![tiny_result(Operation::Chamfer, 100)], - &vec![tiny_result(Operation::MaxSim, 100)], - ) - .unwrap_err(); - - assert_eq!(err.to_string(), "run 0 mismatched"); - } - - #[test] - fn check_allows_negative_relative_change() { - let kernel = Kernel::::new(); - - let result = kernel - .check( - &tolerance(0.0), - &tiny_op(), - &vec![tiny_result(Operation::Chamfer, 100)], - &vec![tiny_result(Operation::Chamfer, 95)], - ) - .unwrap(); - - assert!(matches!(result, PassFail::Pass(_))); - } - - #[test] - fn check_passes_on_tolerance_boundary() { - let kernel = Kernel::::new(); - - let result = kernel - .check( - &tolerance(0.05), - &tiny_op(), - &vec![tiny_result(Operation::Chamfer, 100)], - &vec![tiny_result(Operation::Chamfer, 105)], - ) - .unwrap(); - - assert!(matches!(result, PassFail::Pass(_))); - } - - #[test] - fn check_fails_above_tolerance_boundary() { - let kernel = Kernel::::new(); - - let result = kernel - .check( - &tolerance(0.05), - &tiny_op(), - &vec![tiny_result(Operation::Chamfer, 100)], - &vec![tiny_result(Operation::Chamfer, 106)], - ) - .unwrap(); - - assert!(matches!(result, PassFail::Fail(_))); - } - - #[test] - fn check_result_display_includes_failure_details() { - let check = CheckResult { - checks: vec![Comparison { - run: tiny_run(Operation::Chamfer), - tolerance: tolerance(0.05), - before_min: 100.0, - after_min: 106.0, - }], - }; - - let rendered = check.to_string(); - assert!(rendered.contains("Operation"), "rendered = {rendered}"); - assert!(rendered.contains("chamfer"), "rendered = {rendered}"); - assert!(rendered.contains("100.000"), "rendered = {rendered}"); - 
assert!(rendered.contains("106.000"), "rendered = {rendered}"); - assert!(rendered.contains("6.000 %"), "rendered = {rendered}"); - assert!(rendered.contains("FAIL"), "rendered = {rendered}"); - } - - /// A "before" value of 0 means the measurement was too fast to obtain a - /// reliable signal, so we *could* be letting a regression through. We - /// require at least a non-zero value. - #[test] - fn zero_values_rejected() { - let kernel = Kernel::::new(); - - let result = kernel - .check( - &tolerance(0.05), - &tiny_op(), - &vec![tiny_result(Operation::Chamfer, 0)], - &vec![tiny_result(Operation::Chamfer, 0)], - ) - .unwrap(); - - assert!(matches!(result, PassFail::Fail(_))); - } - } -} diff --git a/diskann-benchmark/src/backend/multi_vector/driver.rs b/diskann-benchmark/src/backend/multi_vector/driver.rs new file mode 100644 index 000000000..2f83eb22f --- /dev/null +++ b/diskann-benchmark/src/backend/multi_vector/driver.rs @@ -0,0 +1,279 @@ +/* + * Copyright (c) Microsoft Corporation. + * Licensed under the MIT license. + */ + +//! Shared benchmark infrastructure for multi-vector kernels. +//! +//! Houses the timing harness ([`run_loops`]), data fixtures ([`Data`]), result +//! types ([`RunResult`], [`Comparison`], [`CheckResult`]), and the trait-object +//! [`Distance`] boundary that both library and experimental kernels go +//! through. None of the contents are kernel-aware. 
+ +use diskann_benchmark_runner::utils::{ + fmt::Table, num::relative_change, percentiles, MicroSeconds, +}; +use diskann_quantization::multi_vector::distance::QueryMatRef; +use diskann_quantization::multi_vector::{Mat, MatRef, MaxSim, QueryComputer, Standard}; +use diskann_vector::distance::InnerProduct; +use diskann_vector::{DistanceFunctionMut, PureDistanceFunction}; +use rand::{ + distr::{Distribution, StandardUniform}, + rngs::StdRng, + SeedableRng, +}; +use serde::{Deserialize, Serialize}; + +use crate::inputs::multi_vector::{MultiVectorTolerance, Run}; + +/////////////////// +// Data fixtures // +/////////////////// + +/// Random query / doc fixture for a single benchmark run. +pub(super) struct Data { + pub(super) queries: Mat>, + pub(super) docs: Mat>, +} + +impl Data +where + StandardUniform: Distribution, +{ + pub(super) fn new(run: &Run) -> Self { + let mut rng = StdRng::seed_from_u64(0x12345); + let queries = Mat::from_fn( + Standard::new(run.num_query_vectors.get(), run.dim.get()).unwrap(), + || StandardUniform.sample(&mut rng), + ); + let docs = Mat::from_fn( + Standard::new(run.num_doc_vectors.get(), run.dim.get()).unwrap(), + || StandardUniform.sample(&mut rng), + ); + Self { queries, docs } + } +} + +////////////////////// +// Distance kernels // +////////////////////// + +/// Object-safe abstraction over a per-shape distance executor. +/// +/// `OptimizedDistance` wraps any [`QueryComputer`] — library-shipped +/// arch-pinned ones (via `from_arch`) AND experimental ones (via +/// `from_dyn`) — so the driver's hot loop dispatches through one vtable +/// hop regardless of which kernel produced the computer. +/// `ReferenceDistance` is the only path that doesn't go through +/// `QueryComputer` (it uses the `MaxSim` fallback directly). 
+pub(super) trait Distance { + fn max_sim(&self, doc: MatRef<'_, Standard>, scores: &mut [f32]); +} + +/// Distance executor wrapping a [`QueryComputer`] — covers all arch-pinned, +/// auto-dispatched, and experimental kernels. +pub(super) struct OptimizedDistance(pub(super) QueryComputer); + +impl Distance for OptimizedDistance { + fn max_sim(&self, doc: MatRef<'_, Standard>, scores: &mut [f32]) { + self.0.max_sim(doc, scores); + } +} + +/// Distance executor driving the [`MaxSim`] fallback path. +pub(super) struct ReferenceDistance<'a, T: Copy>(pub(super) QueryMatRef<'a, Standard>); + +impl Distance for ReferenceDistance<'_, T> +where + InnerProduct: for<'q, 'd> PureDistanceFunction<&'q [T], &'d [T], f32>, +{ + fn max_sim(&self, doc: MatRef<'_, Standard>, scores: &mut [f32]) { + // `MaxSim::new` is a non-empty check + pointer wrap, free per iteration. + let mut max_sim = MaxSim::new(scores).unwrap(); + let _ = max_sim.evaluate(self.0, doc); + } +} + +////////////////////// +// Timing harness // +////////////////////// + +fn run_loops(run: &Run, mut body: F) -> RunResult +where + F: FnMut(), +{ + let mut latencies = Vec::with_capacity(run.num_measurements.get()); + + for _ in 0..run.num_measurements.get() { + let start = std::time::Instant::now(); + for _ in 0..run.loops_per_measurement.get() { + body(); + } + latencies.push(start.elapsed().into()); + } + + let percentiles = percentiles::compute_percentiles(&mut latencies).unwrap(); + RunResult { + run: run.clone(), + latencies, + percentiles, + } +} + +/// Shared loop nest. The trait-object dispatch happens once per outer iteration +/// of `run_loops`; the work inside each `max_sim` call is O(Q·D·dim), so the +/// vtable hop is in the noise. 
+pub(super) fn run_with_distance( + run: &Run, + doc: MatRef<'_, Standard>, + dist: &dyn Distance, +) -> RunResult { + let mut scores = vec![0.0f32; run.num_query_vectors.get()]; + run_loops(run, || { + dist.max_sim(doc, &mut scores); + std::hint::black_box(&mut scores); + }) +} + +////////////////////// +// Result types // +////////////////////// + +#[derive(Debug, Clone, Copy)] +pub(super) struct DisplayWrapper<'a, T: ?Sized>(pub(super) &'a T); + +impl std::ops::Deref for DisplayWrapper<'_, T> { + type Target = T; + fn deref(&self) -> &T { + self.0 + } +} + +#[derive(Debug, Serialize, Deserialize)] +pub(super) struct RunResult { + /// The configuration for this run. + pub(super) run: Run, + /// Per-measurement latencies (over `loops_per_measurement` calls). + pub(super) latencies: Vec, + /// Latency percentiles. + pub(super) percentiles: percentiles::Percentiles, +} + +impl RunResult { + pub(super) fn computations_per_latency(&self) -> usize { + self.run.num_query_vectors.get() + * self.run.num_doc_vectors.get() + * self.run.loops_per_measurement.get() + } +} + +impl std::fmt::Display for DisplayWrapper<'_, [RunResult]> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if self.is_empty() { + return Ok(()); + } + + writeln!( + f, + "ns/IP = time per (query, doc) inner-product call (~ linear in Dim)" + )?; + + let header = [ + "Q", + "D", + "Dim", + "Min Time (ns/IP @ Dim)", + "Mean Time (ns/IP @ Dim)", + "Loops", + "Measurements", + ]; + + let mut table = Table::new(header, self.len()); + + self.iter().enumerate().for_each(|(row, r)| { + let mut row = table.row(row); + + let min_latency = r + .latencies + .iter() + .min() + .copied() + .unwrap_or(MicroSeconds::new(u64::MAX)); + let mean_latency = r.percentiles.mean; + + let computations_per_latency = r.computations_per_latency() as f64; + let min_time = min_latency.as_f64() / computations_per_latency * 1000.0; + let mean_time = mean_latency / computations_per_latency * 1000.0; + + 
row.insert(r.run.num_query_vectors, 0); + row.insert(r.run.num_doc_vectors, 1); + row.insert(r.run.dim, 2); + row.insert(format!("{:.3}", min_time), 3); + row.insert(format!("{:.3}", mean_time), 4); + row.insert(r.run.loops_per_measurement, 5); + row.insert(r.run.num_measurements, 6); + }); + + table.fmt(f) + } +} + +////////////////////// +// Regression Check // +////////////////////// + +/// Per-run comparison result showing before/after percentile differences. +#[derive(Debug, Serialize)] +pub(super) struct Comparison { + pub(super) run: Run, + pub(super) tolerance: MultiVectorTolerance, + pub(super) before_min: f64, + pub(super) after_min: f64, +} + +/// Aggregated result of the regression check across all runs. +#[derive(Debug, Serialize)] +pub(super) struct CheckResult { + pub(super) checks: Vec, +} + +impl std::fmt::Display for CheckResult { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let header = [ + "Q", + "D", + "Dim", + "Min Before (ns/IP @ Dim)", + "Min After (ns/IP @ Dim)", + "Change (%)", + "Remark", + ]; + + let mut table = Table::new(header, self.checks.len()); + + for (i, c) in self.checks.iter().enumerate() { + let mut row = table.row(i); + let change = relative_change(c.before_min, c.after_min); + + row.insert(c.run.num_query_vectors, 0); + row.insert(c.run.num_doc_vectors, 1); + row.insert(c.run.dim, 2); + row.insert(format!("{:.3}", c.before_min), 3); + row.insert(format!("{:.3}", c.after_min), 4); + match change { + Ok(change) => { + row.insert(format!("{:.3} %", change * 100.0), 5); + if change > c.tolerance.min_time_regression.get() { + row.insert("FAIL", 6); + } + } + Err(err) => { + row.insert("invalid", 5); + row.insert(err, 6); + } + } + } + + table.fmt(f) + } +} diff --git a/diskann-benchmark/src/backend/multi_vector/experimental/mod.rs b/diskann-benchmark/src/backend/multi_vector/experimental/mod.rs new file mode 100644 index 000000000..b0e106fc4 --- /dev/null +++ 
b/diskann-benchmark/src/backend/multi_vector/experimental/mod.rs @@ -0,0 +1,23 @@ +/* + * Copyright (c) Microsoft Corporation. + * Licensed under the MIT license. + */ + +//! Researcher-authored experimental multi-vector kernels. +//! +//! See [`template`] for the full kernel-author workflow (writing a `Kernel` +//! impl, adapting it via `DynQueryComputer`, wiring up dispatch and +//! registration, and validating under Miri). +//! +//! New experimental kernels live in their own module file in this directory. +//! Their registration goes in [`register`] below. + +use diskann_benchmark_runner::registry::Benchmarks; + +mod template; + +pub(super) fn register(_benchmarks: &mut Benchmarks) { + // No experimental kernels registered by default. + // Add `benchmarks.register_regression(...)` calls here when authoring + // new experimental kernels. +} diff --git a/diskann-benchmark/src/backend/multi_vector/experimental/template.rs b/diskann-benchmark/src/backend/multi_vector/experimental/template.rs new file mode 100644 index 000000000..f09f0c74e --- /dev/null +++ b/diskann-benchmark/src/backend/multi_vector/experimental/template.rs @@ -0,0 +1,254 @@ +/* + * Copyright (c) Microsoft Corporation. + * Licensed under the MIT license. + */ + +//! **Template for an experimental multi-vector kernel.** +//! +//! Copy this file (e.g. to `v4_wide.rs`), rename `Template*`, change the +//! `Kernel` impl to your target ISA, and add an `Arch` variant + a +//! `register_regression` call to wire it up. +//! +//! # The 6-step workflow +//! +//! 1. **Add an [`Arch`](crate::inputs::multi_vector::Arch) variant** for your +//! experimental kernel (e.g. `X86_64_V4_Wide`). The `#[non_exhaustive]` +//! attribute on `Arch` makes this a non-breaking addition. +//! 2. **Author the micro-kernel.** Implement +//! [`Kernel`](diskann_quantization::multi_vector::distance::kernels::Kernel) +//! on your struct (`unsafe impl Kernel` etc.), filling in +//! 
`full_panel` and `partial_panel` with your SIMD intrinsics. +//! 3. **Author the adapter.** Implement +//! [`DynQueryComputer`](diskann_quantization::multi_vector::distance::DynQueryComputer) +//! on a struct that owns the prepared query data; in `compute_max_sim`, +//! call +//! [`tiled_reduce`](diskann_quantization::multi_vector::distance::kernels::tiled_reduce) +//! with your kernel. +//! 4. **Add a marker + `DispatchRule`.** Mirror the pattern in +//! `library_kernels.rs` (e.g. `match_arch_x86_64!`) for your new variant. +//! 5. **Add a `RunBenchmark` impl + `register_regression` call.** Use +//! `Kernel::::new()` as the registered benchmark entry. +//! 6. **Validate under Miri.** See the section below. +//! +//! # Validating under Miri (REQUIRED) +//! +//! Experimental kernels rely on `unsafe fn full_panel` / `partial_panel` +//! with raw-pointer arithmetic. Pointer provenance, alignment, and +//! out-of-bounds bugs are easy to introduce and hard to catch by +//! inspection. **Run your kernel under Miri before assuming it's correct.** +//! +//! Rules: +//! +//! - Inside your `#[cfg(test)]` module, construct arch tokens via the +//! Miri-friendly variants: `Scalar::new()` (always Miri-safe) or +//! `V4::new_checked_miri()` (returns a token unconditionally under +//! `cfg(miri)` using AVX-512 emulation, so tests run even when Miri +//! can't do real CPU detection). `V3` and `Neon` only expose +//! `new_checked()` today — if you need them under Miri, follow +//! `V4::new_checked_miri()`'s pattern in `diskann-wide`. +//! - Any SIMD intrinsic Miri doesn't support must have a scalar fallback +//! gated by `#[cfg(miri)]`. +//! - Add at least one small-shape correctness test that runs your kernel +//! against a naive reference and is Miri-friendly. +//! - Run: `cargo +nightly miri test -p diskann-benchmark --features multi-vector +//! backend::multi_vector::experimental::`. Reduce +//! test-sweep size under Miri with `if cfg!(miri) { small } else { full }` +//! 
(see this file's test for the pattern). +//! +//! Miri won't catch performance bugs, but it'll catch UB — and UB in an +//! experimental kernel breaks the benchmark binary, not the kernel you're +//! trying to measure. +//! +//! # This template +//! +//! This file defines `TemplateKernel: Kernel` (uses `Scalar` so the +//! template is host-portable + Miri-friendly) and a `TemplateComputer` +//! adapter that pipes it through `tiled_reduce`. It is **not registered** as +//! a benchmark entry — see step 5 in the workflow. The included +//! `#[cfg(test)]` `template_matches_pinned_scalar` test exercises the API +//! surface end-to-end so this file catches public-API drift even though it +//! isn't wired into the benchmark dispatcher. + +#![allow(dead_code)] + +use diskann_quantization::multi_vector::distance::{ + kernels::{layouts, tiled_reduce, Kernel, TileBudget}, + DynQueryComputer, +}; +use diskann_quantization::multi_vector::{BlockTransposed, BlockTransposedRef, MatRef, Standard}; +use diskann_wide::arch::Scalar; + +/// Step 2: the micro-kernel struct. Rename and implement for your target arch. +pub(super) struct TemplateKernel; + +// SAFETY: `full_panel` / `partial_panel` only access `A_PANEL * k` / +// `B_PANEL * k` source elements and write `A_PANEL` destination f32s, +// matching `Kernel`'s safety contract. The simple scalar +// computation here is Miri-clean. +unsafe impl Kernel for TemplateKernel { + type Left = layouts::BlockTransposedLayout; + type Right = layouts::RowMajor; + const A_PANEL: usize = 8; + const B_PANEL: usize = 2; + + unsafe fn full_panel(_arch: Scalar, a: *const f32, b: *const f32, k: usize, r: *mut f32) { + // SAFETY: a covers A_PANEL * k contiguous block-transposed f32s, + // b covers B_PANEL * k contiguous row-major f32s, r covers A_PANEL f32s. 
+ unsafe { panel::<8, 2>(a, b, k, r) } + } + + unsafe fn partial_panel( + _arch: Scalar, + remainder: usize, + a: *const f32, + b: *const f32, + k: usize, + r: *mut f32, + ) { + debug_assert!(remainder == 1); + // SAFETY: as full_panel but with `b` covering `remainder * k` f32s. + unsafe { panel::<8, 1>(a, b, k, r) } + } +} + +/// Replace this with your SIMD intrinsics. The block-transposed A layout +/// stores `A_ROWS` contiguous f32s per dimension index `i`, so the q-th +/// query row at dimension i lives at `a[i * A_ROWS + q]`. The row-major B +/// layout stores doc d's k-th element at `b[d * k_dim + k]`. The scratch +/// `r` accumulates max IP per query row (library convention; the +/// `QueryComputer` veneer negates at the end). +/// +/// # Safety +/// - `a` covers `A_ROWS * k` block-transposed f32s. +/// - `b` covers `B_ROWS * k` row-major f32s. +/// - `r` covers `A_ROWS` writable f32s. +unsafe fn panel( + a: *const f32, + b: *const f32, + k: usize, + r: *mut f32, +) { + for q in 0..A_ROWS { + // SAFETY: q < A_ROWS. + let mut best = unsafe { *r.add(q) }; + + for d in 0..B_ROWS { + let mut ip: f32 = 0.0; + for i in 0..k { + // SAFETY: i < k, q < A_ROWS. + let a_val = unsafe { *a.add(i * A_ROWS + q) }; + // SAFETY: d < B_ROWS, b covers B_ROWS rows of k f32s each. + let b_val = unsafe { *b.add(d * k + i) }; + ip += a_val * b_val; + } + best = best.max(ip); + } + + // SAFETY: q < A_ROWS. + unsafe { *r.add(q) = best }; + } +} + +/// Step 3: the `DynQueryComputer` adapter. Owns the prepared query data +/// and routes `compute_max_sim` through `tiled_reduce` with the kernel. 
+#[derive(Debug)] +pub(super) struct TemplateComputer { + arch: Scalar, + prepared: BlockTransposed, +} + +impl TemplateComputer { + pub(super) fn new(query: MatRef<'_, Standard>) -> Self { + let prepared = BlockTransposed::::from_matrix_view(query.as_matrix_view()); + Self { + arch: Scalar::new(), + prepared, + } + } +} + +impl DynQueryComputer for TemplateComputer { + fn nrows(&self) -> usize { + self.prepared.nrows() + } + + fn compute_max_sim(&self, doc: MatRef<'_, Standard>, scores: &mut [f32]) { + let mut scratch = vec![f32::MIN; self.prepared.padded_nrows()]; + let prepared_ref: BlockTransposedRef<'_, f32, 8> = self.prepared.as_view(); + let ca = as layouts::DescribeLayout>::layout(&prepared_ref); + let cb = > as layouts::DescribeLayout>::layout(&doc); + + // SAFETY: prepared.as_ptr() covers padded_nrows * ncols block-transposed + // f32s; doc.as_slice() covers num_vectors * vector_dim row-major f32s; + // scratch length == padded_nrows; padded_nrows is a multiple of + // A_PANEL=8 by BlockTransposed construction. + unsafe { + tiled_reduce::( + self.arch, + &ca, + &cb, + self.prepared.as_ptr(), + self.prepared.padded_nrows(), + doc.as_slice().as_ptr(), + doc.num_vectors(), + doc.vector_dim(), + &mut scratch, + TileBudget::default(), + ); + } + + for (dst, &src) in scores.iter_mut().zip(&scratch[..self.prepared.nrows()]) { + *dst = -src; + } + } +} + +#[cfg(test)] +mod tests { + //! Miri-friendly correctness test for the template kernel. + //! + //! Validates that the template's adapter machinery produces the same + //! per-row scores as `QueryComputer::from_arch(Scalar)`. Iteration + //! count is reduced under Miri so `cargo +nightly miri test` finishes + //! in seconds, not minutes. 
+ use super::*; + use diskann_quantization::multi_vector::QueryComputer; + + fn make_data(nrows: usize, ncols: usize, shift: usize) -> Vec { + (0..nrows * ncols) + .map(|v| ((v + shift) % ncols) as f32) + .collect() + } + + #[test] + fn template_matches_pinned_scalar() { + let cases: &[(usize, usize, usize)] = if cfg!(miri) { + // Single small case under Miri to keep runtime reasonable. + &[(3, 4, 8)] + } else { + &[(1, 1, 4), (3, 5, 8), (8, 4, 16), (10, 6, 32)] + }; + + for &(nq, nd, dim) in cases { + let qd = make_data(nq, dim, dim / 2); + let dd = make_data(nd, dim, dim); + let query = MatRef::new(Standard::::new(nq, dim).unwrap(), &qd).unwrap(); + let doc = MatRef::new(Standard::::new(nd, dim).unwrap(), &dd).unwrap(); + + let pinned = QueryComputer::::from_arch(query, Scalar::new()); + let template = QueryComputer::::from_dyn(Box::new(TemplateComputer::new(query))); + + let mut pinned_scores = vec![0.0f32; nq]; + let mut template_scores = vec![0.0f32; nq]; + pinned.max_sim(doc, &mut pinned_scores); + template.max_sim(doc, &mut template_scores); + + for (i, (p, t)) in pinned_scores.iter().zip(template_scores.iter()).enumerate() { + assert!( + (p - t).abs() < 1e-10, + "shape ({nq},{nd},{dim}) row {i}: pinned={p} template={t}", + ); + } + } + } +} diff --git a/diskann-benchmark/src/backend/multi_vector/library_kernels.rs b/diskann-benchmark/src/backend/multi_vector/library_kernels.rs new file mode 100644 index 000000000..56d6c0db1 --- /dev/null +++ b/diskann-benchmark/src/backend/multi_vector/library_kernels.rs @@ -0,0 +1,510 @@ +/* + * Copyright (c) Microsoft Corporation. + * Licensed under the MIT license. + */ + +//! Library kernel registrations and arch-dispatch machinery. +//! +//! Mirrors the structure of `diskann-benchmark-simd`: a `Kernel` +//! PhantomData carrier carries the (arch × element type) pair through the +//! benchmark registry, [`DispatchRule`] maps the JSON-facing `Arch` +//! 
enum to a concrete arch token, and the `stamp!` / `match_arch!` macros +//! generate the repetitive `RunBenchmark` / `DispatchRule` impls. +//! +//! Library kernels registered here: +//! - `multi-vector-op-{f32,f16}-auto` — `QueryComputer::new` (auto-dispatch) +//! - `multi-vector-op-{f32,f16}-scalar` — `from_arch(Scalar)` +//! - `multi-vector-op-{f32,f16}-x86_64_V3` — `from_arch(V3)` (x86_64 only) +//! - `multi-vector-op-{f32,f16}-x86_64_V4` — `from_arch(V4)` (x86_64 only) +//! - `multi-vector-op-{f32,f16}-aarch64_neon` — `from_arch(Neon)` (aarch64 only) +//! - `multi-vector-op-{f32,f16}-reference` — `MaxSim` fallback + +use std::io::Write; +use std::marker::PhantomData; + +use diskann_benchmark_runner::{ + benchmark::{PassFail, Regression}, + dispatcher::{Description, DispatchRule, FailureScore, MatchScore}, + utils::{datatype, num::relative_change}, + Benchmark, Checkpoint, Output, +}; +use diskann_quantization::multi_vector::{MatRef, QueryComputer, Standard}; +use diskann_vector::distance::InnerProduct; +use diskann_vector::PureDistanceFunction; +#[cfg(target_arch = "aarch64")] +use diskann_wide::arch::aarch64::Neon; +#[cfg(target_arch = "x86_64")] +use diskann_wide::arch::x86_64::{V3, V4}; +use diskann_wide::arch::Scalar; +use diskann_wide::Architecture; +use rand::distr::{Distribution, StandardUniform}; + +use super::driver::{ + run_with_distance, CheckResult, Comparison, Data, DisplayWrapper, OptimizedDistance, + ReferenceDistance, RunResult, +}; +use crate::inputs::multi_vector::{Arch, MultiVectorOp, MultiVectorTolerance}; + +/// PhantomData carrier for one (arch, element-type) entry in the benchmark +/// registry. The arch parameter `A` is either a real arch token (`Scalar`, +/// `V3`, `V4`, `Neon`) or one of the marker types [`Auto`] / [`Reference`]. 
+pub(super) struct Kernel { + _type: PhantomData<(A, T)>, +} + +impl Kernel { + pub(super) fn new() -> Self { + Self { _type: PhantomData } + } +} + +/// Marker for the auto-dispatched (CPU-detected) kernel — `QueryComputer::new`. +#[derive(Debug, Clone, Copy)] +pub(super) struct Auto; + +/// Marker for the reference (`MaxSim` fallback) kernel. +#[derive(Debug, Clone, Copy)] +pub(super) struct Reference; + +/// Wrapper around an arch token (real or marker) that implements +/// [`DispatchRule`] for the JSON-facing [`Arch`] enum. +pub(super) struct Identity(pub(super) A); + +/// Returned by `Identity::::convert` when the host CPU doesn't support the +/// requested ISA. The dispatcher converts this into a friendly error message. +#[derive(Debug, Clone, Copy)] +pub(super) struct ArchNotSupported(pub(super) Arch); + +impl std::fmt::Display for ArchNotSupported { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{} not supported on this CPU", self.0) + } +} + +impl std::error::Error for ArchNotSupported {} + +////////////////////// +// Dispatch rules // +////////////////////// + +/// Generates a [`DispatchRule`] for a real arch token. `try_match` returns: +/// - `Ok(MatchScore(0))` when the input names this arch AND the host CPU supports it +/// - `Err(FailureScore(0))` when the input names this arch but the CPU doesn't support it +/// (this surfaces in the dispatcher's near-miss diagnostic) +/// - `Err(FailureScore(1))` when the input names a different arch +macro_rules! 
match_arch_x86_64 { + ($arch:path, $enum:ident) => { + #[cfg(target_arch = "x86_64")] + impl DispatchRule for Identity<$arch> { + type Error = ArchNotSupported; + fn try_match(from: &Arch) -> Result { + if *from != Arch::$enum { + return Err(FailureScore(1)); + } + if <$arch>::new_checked().is_some() { + Ok(MatchScore(0)) + } else { + Err(FailureScore(0)) + } + } + fn convert(from: Arch) -> Result { + <$arch>::new_checked() + .ok_or(ArchNotSupported(from)) + .map(Identity) + } + } + }; +} + +match_arch_x86_64!(V3, X86_64_V3); +match_arch_x86_64!(V4, X86_64_V4); + +#[cfg(target_arch = "aarch64")] +impl DispatchRule for Identity { + type Error = ArchNotSupported; + fn try_match(from: &Arch) -> Result { + if *from != Arch::Neon { + return Err(FailureScore(1)); + } + if Neon::new_checked().is_some() { + Ok(MatchScore(0)) + } else { + Err(FailureScore(0)) + } + } + fn convert(from: Arch) -> Result { + Neon::new_checked() + .ok_or(ArchNotSupported(from)) + .map(Identity) + } +} + +// Scalar is always available; no CPU check needed. +impl DispatchRule for Identity { + type Error = ArchNotSupported; + fn try_match(from: &Arch) -> Result { + if *from == Arch::Scalar { + Ok(MatchScore(0)) + } else { + Err(FailureScore(1)) + } + } + fn convert(_from: Arch) -> Result { + Ok(Identity(Scalar::new())) + } +} + +impl DispatchRule for Identity { + type Error = ArchNotSupported; + fn try_match(from: &Arch) -> Result { + if *from == Arch::Auto { + Ok(MatchScore(0)) + } else { + Err(FailureScore(1)) + } + } + fn convert(_from: Arch) -> Result { + Ok(Identity(Auto)) + } +} + +impl DispatchRule for Identity { + type Error = ArchNotSupported; + fn try_match(from: &Arch) -> Result { + if *from == Arch::Reference { + Ok(MatchScore(0)) + } else { + Err(FailureScore(1)) + } + } + fn convert(_from: Arch) -> Result { + Ok(Identity(Reference)) + } +} + +////////////////////// +// Benchmark trait // +////////////////////// + +/// Per-arch run trait. 
The `stamp!` macro generates impls for real arch tokens; +/// `Auto` and `Reference` get hand-written impls. +pub(super) trait RunBenchmark { + fn run_benchmark(&self, input: &MultiVectorOp) -> anyhow::Result>; +} + +impl Benchmark for Kernel +where + datatype::Type: DispatchRule, + Identity: DispatchRule, + Kernel: RunBenchmark, + A: 'static, + T: 'static, +{ + type Input = MultiVectorOp; + type Output = Vec; + + fn try_match(&self, from: &MultiVectorOp) -> Result { + let mut failscore: Option = None; + if datatype::Type::::try_match(&from.element_type).is_err() { + *failscore.get_or_insert(0) += 10; + } + match Identity::::try_match(&from.arch) { + Ok(MatchScore(_)) => (), + Err(FailureScore(score)) => { + *failscore.get_or_insert(0) += score; + } + } + match failscore { + None => Ok(MatchScore(0)), + Some(score) => Err(FailureScore(score)), + } + } + + fn run( + &self, + input: &MultiVectorOp, + _: Checkpoint<'_>, + mut output: &mut dyn Output, + ) -> anyhow::Result { + writeln!(output, "{}", input)?; + let results = self.run_benchmark(input)?; + writeln!(output, "\n\n{}", DisplayWrapper(&*results))?; + Ok(results) + } + + fn description( + &self, + f: &mut std::fmt::Formatter<'_>, + input: Option<&MultiVectorOp>, + ) -> std::fmt::Result { + match input { + None => { + writeln!( + f, + "- Element Type: {}", + Description::>::new() + )?; + writeln!(f, "- Arch: {}", Description::>::new())?; + } + Some(input) => { + if let Err(err) = datatype::Type::::try_match_verbose(&input.element_type) { + writeln!(f, "\n - Mismatched element type: {}", err)?; + } + if Identity::::try_match(&input.arch).is_err() { + writeln!(f, "\n - Wrong or unsupported arch: {}", input.arch)?; + } + } + } + Ok(()) + } +} + +impl Regression for Kernel +where + datatype::Type: DispatchRule, + Identity: DispatchRule, + Kernel: RunBenchmark, + A: 'static, + T: 'static, +{ + type Tolerances = MultiVectorTolerance; + type Pass = CheckResult; + type Fail = CheckResult; + + fn check( + &self, + 
tolerance: &MultiVectorTolerance, + _input: &MultiVectorOp, + before: &Vec, + after: &Vec, + ) -> anyhow::Result> { + anyhow::ensure!( + before.len() == after.len(), + "before has {} runs but after has {}", + before.len(), + after.len(), + ); + + let mut passed = true; + let checks: Vec = std::iter::zip(before.iter(), after.iter()) + .enumerate() + .map(|(i, (b, a))| { + anyhow::ensure!(b.run == a.run, "run {i} mismatched"); + + let computations_per_latency = b.computations_per_latency() as f64; + let before_min = b.percentiles.minimum.as_f64() * 1000.0 / computations_per_latency; + let after_min = a.percentiles.minimum.as_f64() * 1000.0 / computations_per_latency; + + let comparison = Comparison { + run: b.run.clone(), + tolerance: *tolerance, + before_min, + after_min, + }; + + match relative_change(before_min, after_min) { + Ok(change) => { + if change > tolerance.min_time_regression.get() { + passed = false; + } + } + Err(_) => passed = false, + }; + + Ok(comparison) + }) + .collect::>>()?; + + let check = CheckResult { checks }; + Ok(if passed { + PassFail::Pass(check) + } else { + PassFail::Fail(check) + }) + } +} + +////////////////////// +// RunBenchmark impls +////////////////////// + +/// Element-type-erasing constructor for [`QueryComputer`]. `QueryComputer`'s +/// `new` / `from_arch` are inherent methods on the concrete `QueryComputer` +/// and `QueryComputer` types, so generic code needs this shim. +pub(super) trait BuildArchQc { + /// Build a `QueryComputer` pinned to the host's auto-dispatched arch. + fn build_auto(query: MatRef<'_, Standard>) -> QueryComputer; +} + +impl BuildArchQc for f32 { + fn build_auto(query: MatRef<'_, Standard>) -> QueryComputer { + QueryComputer::::new(query) + } +} + +impl BuildArchQc for half::f16 { + fn build_auto(query: MatRef<'_, Standard>) -> QueryComputer { + QueryComputer::::new(query) + } +} + +/// Per-(arch, T) constructor for `QueryComputer::from_arch`. 
Same idea as +/// [`BuildArchQc::build_auto`] but pinned to a specific arch token. +pub(super) trait BuildPinnedQc { + fn build_pinned(query: MatRef<'_, Standard>, arch: A) -> QueryComputer; +} + +macro_rules! impl_build_pinned { + ($arch:path, $T:ty) => { + impl BuildPinnedQc<$arch, $T> for $T { + fn build_pinned(query: MatRef<'_, Standard<$T>>, arch: $arch) -> QueryComputer<$T> { + QueryComputer::<$T>::from_arch(query, arch) + } + } + }; +} + +impl_build_pinned!(Scalar, f32); +impl_build_pinned!(Scalar, half::f16); +#[cfg(target_arch = "x86_64")] +impl_build_pinned!(V3, f32); +#[cfg(target_arch = "x86_64")] +impl_build_pinned!(V3, half::f16); +#[cfg(target_arch = "x86_64")] +impl_build_pinned!(V4, f32); +#[cfg(target_arch = "x86_64")] +impl_build_pinned!(V4, half::f16); +#[cfg(target_arch = "aarch64")] +impl_build_pinned!(Neon, f32); +#[cfg(target_arch = "aarch64")] +impl_build_pinned!(Neon, half::f16); + +/// Stamp out `RunBenchmark<$arch>` for `Kernel<$arch, $T>` using +/// `QueryComputer::::from_arch($arch_token)`. +macro_rules! stamp { + ($arch:path, $T:ty) => { + impl RunBenchmark<$arch> for Kernel<$arch, $T> + where + StandardUniform: Distribution<$T>, + $T: BuildPinnedQc<$arch, $T>, + { + fn run_benchmark(&self, input: &MultiVectorOp) -> anyhow::Result> { + let arch = Identity::<$arch>::convert(input.arch)?.0; + let mut results = Vec::with_capacity(input.runs.len()); + for run in input.runs.iter() { + let data = Data::<$T>::new(run); + // `QueryComputer` performs query-side precomputation that is + // intentionally amortized across many `max_sim` calls; + // construct it once per shape, outside the timed loop. 
+ let qc = <$T as BuildPinnedQc<$arch, $T>>::build_pinned( + data.queries.as_view(), + arch, + ); + let dist = OptimizedDistance(qc); + results.push(run_with_distance(run, data.docs.as_view(), &dist)); + } + Ok(results) + } + } + }; + ($target_arch:literal, $arch:path, $T:ty) => { + #[cfg(target_arch = $target_arch)] + stamp!($arch, $T); + }; +} + +stamp!(Scalar, f32); +stamp!(Scalar, half::f16); +stamp!("x86_64", V3, f32); +stamp!("x86_64", V3, half::f16); +stamp!("x86_64", V4, f32); +stamp!("x86_64", V4, half::f16); +stamp!("aarch64", Neon, f32); +stamp!("aarch64", Neon, half::f16); + +// Auto and Reference get hand-written impls (different construction paths). + +impl RunBenchmark for Kernel +where + T: Copy + 'static + BuildArchQc, + StandardUniform: Distribution, +{ + fn run_benchmark(&self, input: &MultiVectorOp) -> anyhow::Result> { + let mut results = Vec::with_capacity(input.runs.len()); + for run in input.runs.iter() { + let data = Data::::new(run); + let qc = >::build_auto(data.queries.as_view()); + let dist = OptimizedDistance(qc); + results.push(run_with_distance(run, data.docs.as_view(), &dist)); + } + Ok(results) + } +} + +impl RunBenchmark for Kernel +where + T: Copy + 'static, + StandardUniform: Distribution, + InnerProduct: for<'a, 'b> PureDistanceFunction<&'a [T], &'b [T], f32>, + for<'a> ReferenceDistance<'a, T>: super::driver::Distance, +{ + fn run_benchmark(&self, input: &MultiVectorOp) -> anyhow::Result> { + let mut results = Vec::with_capacity(input.runs.len()); + for run in input.runs.iter() { + let data = Data::::new(run); + let dist = ReferenceDistance(data.queries.as_view().into()); + results.push(run_with_distance(run, data.docs.as_view(), &dist)); + } + Ok(results) + } +} + +////////////////////// +// Registration // +////////////////////// + +pub(super) fn register(benchmarks: &mut diskann_benchmark_runner::registry::Benchmarks) { + benchmarks.register_regression("multi-vector-op-f32-auto", Kernel::::new()); + 
benchmarks.register_regression("multi-vector-op-f16-auto", Kernel::::new()); + + benchmarks.register_regression("multi-vector-op-f32-scalar", Kernel::::new()); + benchmarks.register_regression( + "multi-vector-op-f16-scalar", + Kernel::::new(), + ); + + benchmarks.register_regression( + "multi-vector-op-f32-reference", + Kernel::::new(), + ); + benchmarks.register_regression( + "multi-vector-op-f16-reference", + Kernel::::new(), + ); + + #[cfg(target_arch = "x86_64")] + { + benchmarks.register_regression("multi-vector-op-f32-x86_64_V3", Kernel::::new()); + benchmarks.register_regression( + "multi-vector-op-f16-x86_64_V3", + Kernel::::new(), + ); + benchmarks.register_regression("multi-vector-op-f32-x86_64_V4", Kernel::::new()); + benchmarks.register_regression( + "multi-vector-op-f16-x86_64_V4", + Kernel::::new(), + ); + } + + #[cfg(target_arch = "aarch64")] + { + benchmarks.register_regression( + "multi-vector-op-f32-aarch64_neon", + Kernel::::new(), + ); + benchmarks.register_regression( + "multi-vector-op-f16-aarch64_neon", + Kernel::::new(), + ); + } +} diff --git a/diskann-benchmark/src/backend/multi_vector/mod.rs b/diskann-benchmark/src/backend/multi_vector/mod.rs new file mode 100644 index 000000000..90426e571 --- /dev/null +++ b/diskann-benchmark/src/backend/multi_vector/mod.rs @@ -0,0 +1,233 @@ +/* + * Copyright (c) Microsoft Corporation. + * Licensed under the MIT license. + */ + +//! Multi-vector MaxSim distance benchmarks with regression detection. +//! +//! This module is a **kernel-research substrate**, not just a benchmark. It +//! supports two distinct use cases: +//! +//! 1. **Head-to-head ISA (instruction set architecture) comparison.** Library +//! kernels are registered per arch (`scalar`, `x86-64-v3`, `x86-64-v4`, +//! `aarch64-neon`) plus `auto` (CPU-detected) and `reference` (fallback). +//! Pinning to a specific ISA lets you compare e.g. AVX2 vs AVX512 on the +//! same AVX512 host. +//! +//! 2. 
**Experimental kernel authoring.** External crates and the +//! `experimental/` submodule can author new SIMD micro-kernels by +//! implementing the public `Kernel` trait in +//! `diskann-quantization::multi_vector::distance::kernels`, plug them +//! into the existing cache-aware tile orchestrator (`tiled_reduce`), +//! and slot them into the benchmark via +//! `QueryComputer::from_dyn(Box::new(...))`. +//! +//! # Adding a new experimental kernel +//! +//! See `experimental/template.rs` for the full step-by-step workflow with +//! a worked example. Summary: +//! +//! 1. Add a variant to [`crate::inputs::multi_vector::Arch`]. +//! 2. Implement `Kernel` for your micro-kernel. +//! 3. Implement `DynQueryComputer` for your adapter, calling +//! `tiled_reduce` with your kernel. +//! 4. Add a marker type + `DispatchRule` impl so the new variant +//! routes to your kernel. +//! 5. Add a `RunBenchmark` impl + `register_regression(...)` call +//! in `experimental::register`. +//! +//! **Validate experimental kernels under Miri:** +//! - Construct arch tokens via `Scalar::new()` (Miri-safe) or +//! `V4::new_checked_miri()` (Miri-safe AVX-512 emulation). `V3::new_checked()` +//! and `Neon::new_checked()` don't have `_miri` variants today; if you need +//! them under Miri, follow `V4::new_checked_miri()`'s pattern. +//! - Gate Miri-unsupported intrinsics with `#[cfg(not(miri))]`. +//! - Reduce test-sweep size under `cfg(miri)` to keep runtimes reasonable. + +use diskann_benchmark_runner::registry::Benchmarks; + +cfg_if::cfg_if! 
{ + if #[cfg(feature = "multi-vector")] { + mod driver; + mod experimental; + mod library_kernels; + + pub(super) fn register_benchmarks(benchmarks: &mut Benchmarks) { + library_kernels::register(benchmarks); + experimental::register(benchmarks); + } + } else { + crate::utils::stub_impl!("multi-vector", inputs::multi_vector::MultiVectorOp); + + pub(super) fn register_benchmarks(benchmarks: &mut Benchmarks) { + imp::register("multi-vector-op", benchmarks); + } + } +} + +#[cfg(all(test, feature = "multi-vector"))] +mod tests { + use std::num::NonZeroUsize; + + use diskann_benchmark_runner::{ + benchmark::{PassFail, Regression}, + utils::{ + datatype::DataType, num::NonNegativeFinite, percentiles::compute_percentiles, + MicroSeconds, + }, + }; + + use super::driver::{CheckResult, Comparison, RunResult}; + use super::library_kernels::{Auto, Kernel}; + use crate::inputs::multi_vector::{Arch, MultiVectorOp, MultiVectorTolerance, Run}; + + fn tiny_run() -> Run { + Run { + num_query_vectors: NonZeroUsize::new(2).unwrap(), + num_doc_vectors: NonZeroUsize::new(2).unwrap(), + dim: NonZeroUsize::new(4).unwrap(), + loops_per_measurement: NonZeroUsize::new(1).unwrap(), + num_measurements: NonZeroUsize::new(1).unwrap(), + } + } + + fn tiny_op() -> MultiVectorOp { + MultiVectorOp { + element_type: DataType::Float32, + arch: Arch::Auto, + runs: vec![tiny_run()], + } + } + + fn tiny_result(minimum: u64) -> RunResult { + let mut latencies = vec![MicroSeconds::new(minimum)]; + let percentiles = compute_percentiles(&mut latencies).unwrap(); + RunResult { + run: tiny_run(), + latencies, + percentiles, + } + } + + fn tolerance(limit: f64) -> MultiVectorTolerance { + MultiVectorTolerance { + min_time_regression: NonNegativeFinite::new(limit).unwrap(), + } + } + + #[test] + fn check_rejects_mismatched_runs() { + let kernel = Kernel::::new(); + + // Build a result whose `run` diverges from `tiny_run()` so the + // regression check's `b.run == a.run` invariant fires. 
+ let mut latencies = vec![MicroSeconds::new(100)]; + let percentiles = compute_percentiles(&mut latencies).unwrap(); + let mismatched_result = RunResult { + run: Run { + num_query_vectors: NonZeroUsize::new(4).unwrap(), + ..tiny_run() + }, + latencies, + percentiles, + }; + + let err = kernel + .check( + &tolerance(0.0), + &tiny_op(), + &vec![tiny_result(100)], + &vec![mismatched_result], + ) + .unwrap_err(); + + assert_eq!(err.to_string(), "run 0 mismatched"); + } + + #[test] + fn check_allows_negative_relative_change() { + let kernel = Kernel::::new(); + + let result = kernel + .check( + &tolerance(0.0), + &tiny_op(), + &vec![tiny_result(100)], + &vec![tiny_result(95)], + ) + .unwrap(); + + assert!(matches!(result, PassFail::Pass(_))); + } + + #[test] + fn check_passes_on_tolerance_boundary() { + let kernel = Kernel::::new(); + + let result = kernel + .check( + &tolerance(0.05), + &tiny_op(), + &vec![tiny_result(100)], + &vec![tiny_result(105)], + ) + .unwrap(); + + assert!(matches!(result, PassFail::Pass(_))); + } + + #[test] + fn check_fails_above_tolerance_boundary() { + let kernel = Kernel::::new(); + + let result = kernel + .check( + &tolerance(0.05), + &tiny_op(), + &vec![tiny_result(100)], + &vec![tiny_result(106)], + ) + .unwrap(); + + assert!(matches!(result, PassFail::Fail(_))); + } + + #[test] + fn check_result_display_includes_failure_details() { + let check = CheckResult { + checks: vec![Comparison { + run: tiny_run(), + tolerance: tolerance(0.05), + before_min: 100.0, + after_min: 106.0, + }], + }; + + let rendered = check.to_string(); + assert!(rendered.contains("Q"), "rendered = {rendered}"); + assert!(rendered.contains("Dim"), "rendered = {rendered}"); + assert!(rendered.contains("100.000"), "rendered = {rendered}"); + assert!(rendered.contains("106.000"), "rendered = {rendered}"); + assert!(rendered.contains("6.000 %"), "rendered = {rendered}"); + assert!(rendered.contains("FAIL"), "rendered = {rendered}"); + } + + /// A "before" value of 0 
means the measurement was too fast to obtain a + /// reliable signal, so we *could* be letting a regression through. We + /// require at least a non-zero value. + #[test] + fn zero_values_rejected() { + let kernel = Kernel::::new(); + + let result = kernel + .check( + &tolerance(0.05), + &tiny_op(), + &vec![tiny_result(0)], + &vec![tiny_result(0)], + ) + .unwrap(); + + assert!(matches!(result, PassFail::Fail(_))); + } +} diff --git a/diskann-benchmark/src/inputs/multi_vector.rs b/diskann-benchmark/src/inputs/multi_vector.rs index 8010162d6..150d72a8e 100644 --- a/diskann-benchmark/src/inputs/multi_vector.rs +++ b/diskann-benchmark/src/inputs/multi_vector.rs @@ -32,46 +32,46 @@ pub(super) fn register_inputs( // Enum types // //////////////// -/// The two distance operations exposed by `QueryComputer`. -#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)] -#[serde(rename_all = "snake_case")] -pub(crate) enum Operation { - Chamfer, - MaxSim, -} - -impl std::fmt::Display for Operation { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let st = match self { - Self::Chamfer => "chamfer", - Self::MaxSim => "max_sim", - }; - write!(f, "{}", st) - } -} - -/// Which implementation tier to benchmark. +/// Which kernel to benchmark. +/// +/// Mirrors `diskann-benchmark-simd`'s `Arch` enum: kebab-case serialization, +/// one variant per supported ISA plus `Reference` (fallback) and `Auto` +/// (host-portable). Marked `#[non_exhaustive]` so experimental kernels can +/// add variants without breaking JSON configs. 
#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)] #[serde(rename_all = "kebab-case")] -pub(crate) enum Implementation { - Optimized, +#[non_exhaustive] +pub(crate) enum Arch { + #[serde(rename = "x86-64-v4")] + #[allow(non_camel_case_types)] + X86_64_V4, + #[serde(rename = "x86-64-v3")] + #[allow(non_camel_case_types)] + X86_64_V3, + Neon, + Scalar, Reference, + /// Auto-dispatch to the host's best supported arch (calls `QueryComputer::new`). + Auto, } -impl std::fmt::Display for Implementation { +impl std::fmt::Display for Arch { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { let st = match self { - Self::Optimized => "optimized", + Self::X86_64_V4 => "x86-64-v4", + Self::X86_64_V3 => "x86-64-v3", + Self::Neon => "neon", + Self::Scalar => "scalar", Self::Reference => "reference", + Self::Auto => "auto", }; write!(f, "{}", st) } } -/// One benchmark configuration: a single (operation, shape) measurement. +/// One benchmark configuration: a single shape measurement. 
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub(crate) struct Run { - pub(crate) operation: Operation, pub(crate) num_query_vectors: NonZeroUsize, pub(crate) num_doc_vectors: NonZeroUsize, pub(crate) dim: NonZeroUsize, @@ -87,7 +87,7 @@ pub(crate) struct Run { #[derive(Debug, Serialize, Deserialize)] pub(crate) struct MultiVectorOp { pub(crate) element_type: DataType, - pub(crate) implementation: Implementation, + pub(crate) arch: Arch, pub(crate) runs: Vec, } @@ -105,7 +105,6 @@ impl CheckDeserialization for MultiVectorOp { impl Example for MultiVectorOp { fn example() -> Self { - const NUM_QUERY_VECTORS: NonZeroUsize = NonZeroUsize::new(32).unwrap(); const NUM_DOC_VECTORS: NonZeroUsize = NonZeroUsize::new(64).unwrap(); const DIM: NonZeroUsize = NonZeroUsize::new(128).unwrap(); const LOOPS_PER_MEASUREMENT: NonZeroUsize = NonZeroUsize::new(200).unwrap(); @@ -113,16 +112,14 @@ impl Example for MultiVectorOp { let runs = vec![ Run { - operation: Operation::Chamfer, - num_query_vectors: NUM_QUERY_VECTORS, + num_query_vectors: NonZeroUsize::new(32).unwrap(), num_doc_vectors: NUM_DOC_VECTORS, dim: DIM, loops_per_measurement: LOOPS_PER_MEASUREMENT, num_measurements: NUM_MEASUREMENTS, }, Run { - operation: Operation::MaxSim, - num_query_vectors: NUM_QUERY_VECTORS, + num_query_vectors: NonZeroUsize::new(64).unwrap(), num_doc_vectors: NUM_DOC_VECTORS, dim: DIM, loops_per_measurement: LOOPS_PER_MEASUREMENT, @@ -132,7 +129,7 @@ impl Example for MultiVectorOp { Self { element_type: DataType::Float32, - implementation: Implementation::Optimized, + arch: Arch::Auto, runs, } } @@ -149,7 +146,7 @@ impl std::fmt::Display for MultiVectorOp { writeln!(f, "Multi-Vector Operation\n")?; write_field!(f, "tag", Self::tag())?; write_field!(f, "element type", self.element_type)?; - write_field!(f, "implementation", self.implementation)?; + write_field!(f, "arch", self.arch)?; write_field!(f, "number of runs", self.runs.len())?; Ok(()) } diff --git 
a/diskann-benchmark/src/main.rs b/diskann-benchmark/src/main.rs index 9dba42609..5f641dd9f 100644 --- a/diskann-benchmark/src/main.rs +++ b/diskann-benchmark/src/main.rs @@ -782,7 +782,7 @@ mod tests { #[test] fn multi_vector_integration() { - let path = example_directory().join("multi-vector-test.json"); + let path = example_directory().join("multi-vector.json"); let tempdir = tempfile::tempdir().unwrap(); let output_path = tempdir.path().join("output.json"); assert!(!output_path.exists()); @@ -843,7 +843,7 @@ mod tests { #[test] #[cfg(feature = "multi-vector")] fn multi_vector_check_verify() { - let input_path = example_directory().join("multi-vector-test.json"); + let input_path = example_directory().join("multi-vector.json"); let tolerance_path = project_directory() .join("perf_test_inputs") .join("multi-vector-tolerance.json"); diff --git a/diskann-quantization/src/multi_vector/distance/kernels/f16.rs b/diskann-quantization/src/multi_vector/distance/kernels/f16.rs index a535c68dc..e6dc8a772 100644 --- a/diskann-quantization/src/multi_vector/distance/kernels/f16.rs +++ b/diskann-quantization/src/multi_vector/distance/kernels/f16.rs @@ -34,7 +34,7 @@ impl where A: Architecture, F32Kernel: Kernel, - layouts::BlockTransposed: layouts::ConvertTo as Kernel>::Left> + layouts::BlockTransposedLayout: layouts::ConvertTo as Kernel>::Left> + layouts::Layout, layouts::RowMajor: layouts::ConvertTo as Kernel>::Right> + layouts::Layout, diff --git a/diskann-quantization/src/multi_vector/distance/kernels/f32/mod.rs b/diskann-quantization/src/multi_vector/distance/kernels/f32/mod.rs index a900ea356..602da6324 100644 --- a/diskann-quantization/src/multi_vector/distance/kernels/f32/mod.rs +++ b/diskann-quantization/src/multi_vector/distance/kernels/f32/mod.rs @@ -30,7 +30,7 @@ mod scalar; mod v3; /// Zero-sized kernel type for f32 micro-kernels with block size `GROUP`. 
-pub(crate) struct F32Kernel; +pub struct F32Kernel; #[inline(never)] #[cold] @@ -66,7 +66,7 @@ pub(super) fn max_ip_kernel( budget: TileBudget, ) where F32Kernel: Kernel, - layouts::BlockTransposed: + layouts::BlockTransposedLayout: layouts::ConvertTo as Kernel>::Left> + layouts::Layout, layouts::RowMajor: layouts::ConvertTo as Kernel>::Right> + layouts::Layout, @@ -117,7 +117,7 @@ impl where A: Architecture, Self: Kernel, - layouts::BlockTransposed: + layouts::BlockTransposedLayout: layouts::ConvertTo>::Left> + layouts::Layout, layouts::RowMajor: layouts::ConvertTo>::Right> + layouts::Layout, diff --git a/diskann-quantization/src/multi_vector/distance/kernels/f32/scalar.rs b/diskann-quantization/src/multi_vector/distance/kernels/f32/scalar.rs index bd8fb1c4a..2a230ca62 100644 --- a/diskann-quantization/src/multi_vector/distance/kernels/f32/scalar.rs +++ b/diskann-quantization/src/multi_vector/distance/kernels/f32/scalar.rs @@ -27,7 +27,7 @@ diskann_wide::alias!(f32s = ::f32x8); // A_PANEL(8) * k A elements, UNROLL * k B elements, and A_PANEL(8) // scratch elements — all within the bounds guaranteed by `tiled_reduce`. unsafe impl Kernel for F32Kernel<8> { - type Left = layouts::BlockTransposed; + type Left = layouts::BlockTransposedLayout; type Right = layouts::RowMajor; const A_PANEL: usize = 8; const B_PANEL: usize = 2; diff --git a/diskann-quantization/src/multi_vector/distance/kernels/f32/v3.rs b/diskann-quantization/src/multi_vector/distance/kernels/f32/v3.rs index b05195b1e..319cf1cda 100644 --- a/diskann-quantization/src/multi_vector/distance/kernels/f32/v3.rs +++ b/diskann-quantization/src/multi_vector/distance/kernels/f32/v3.rs @@ -17,7 +17,7 @@ diskann_wide::alias!(f32s = ::f32x8); // A_PANEL(16) * k A elements, UNROLL * k B elements, and A_PANEL(16) // scratch elements — all within the bounds guaranteed by `tiled_reduce`. 
unsafe impl Kernel for F32Kernel<16> { - type Left = layouts::BlockTransposed; + type Left = layouts::BlockTransposedLayout; type Right = layouts::RowMajor; const A_PANEL: usize = 16; const B_PANEL: usize = 4; diff --git a/diskann-quantization/src/multi_vector/distance/kernels/layouts.rs b/diskann-quantization/src/multi_vector/distance/kernels/layouts.rs index e1ec8dd36..54962acaa 100644 --- a/diskann-quantization/src/multi_vector/distance/kernels/layouts.rs +++ b/diskann-quantization/src/multi_vector/distance/kernels/layouts.rs @@ -4,7 +4,7 @@ //! Layout markers and tile-level conversion traits. //! //! - [`Layout`] — marker trait: memory layout + element type. -//! - [`BlockTransposed`] / [`RowMajor`] — zero-sized layout markers. +//! - [`BlockTransposedLayout`] / [`RowMajor`] — zero-sized layout markers. //! - [`DescribeLayout`] — bridges matrix types to layout markers. //! - [`ConvertTo`] — tile-level conversion (blanket identity + f16→f32). @@ -17,7 +17,7 @@ use diskann_wide::arch::Target2; // ── Layout trait ───────────────────────────────────── /// Memory layout and element type marker for tile data. -pub(super) trait Layout { +pub trait Layout { type Element: Copy; } @@ -25,28 +25,36 @@ pub(super) trait Layout { /// Block-transposed tile layout: `GROUP` rows per block, `PACK` columns /// interleaved. Matches [`BlockTransposedRef`](crate::multi_vector::BlockTransposedRef). -pub(super) struct BlockTransposed(PhantomData); - -impl BlockTransposed { +/// +/// This is the zero-sized **layout marker** used in [`Kernel::Left`] / +/// [`Kernel::Right`](super::Kernel) associated types. It is distinct +/// from the owning storage type [`BlockTransposed`](crate::multi_vector::BlockTransposed) +/// — the marker carries layout information at the type level; the owning +/// type holds actual data. 
+pub struct BlockTransposedLayout(PhantomData); + +impl BlockTransposedLayout { pub(super) fn new() -> Self { Self(PhantomData) } } -impl Copy for BlockTransposed {} +impl Copy for BlockTransposedLayout {} -impl Clone for BlockTransposed { +impl Clone for BlockTransposedLayout { fn clone(&self) -> Self { *self } } -impl Layout for BlockTransposed { +impl Layout + for BlockTransposedLayout +{ type Element = T; } /// Dense row-major tile layout. Matches [`MatRef>`](crate::multi_vector::MatRef). -pub(super) struct RowMajor(PhantomData); +pub struct RowMajor(PhantomData); impl RowMajor { pub(super) fn new() -> Self { @@ -70,7 +78,7 @@ impl Layout for RowMajor { /// Bridges a concrete matrix type to its [`Layout`] marker, enabling /// type inference of [`ConvertTo`] parameters at call sites. -pub(super) trait DescribeLayout { +pub trait DescribeLayout { type Layout: Layout; fn layout(&self) -> Self::Layout; @@ -79,10 +87,10 @@ pub(super) trait DescribeLayout { impl DescribeLayout for crate::multi_vector::BlockTransposedRef<'_, T, GROUP, PACK> { - type Layout = BlockTransposed; + type Layout = BlockTransposedLayout; fn layout(&self) -> Self::Layout { - BlockTransposed::new() + BlockTransposedLayout::new() } } @@ -108,7 +116,7 @@ impl DescribeLayout for crate::multi_vector::MatRef<'_, crate::multi_ve /// - `convert` reads at most `rows * k` source elements. /// - `convert` writes only within `buf`. /// - The returned pointer is valid until the next `&mut` access to `buf`. -pub(super) unsafe trait ConvertTo: Layout { +pub unsafe trait ConvertTo: Layout { /// Staging buffer for converted tile data (`()` for identity conversions). type Buffer; @@ -162,7 +170,8 @@ unsafe impl ConvertTo for L { // into `rows * k` f32 values in `buf`. The returned pointer is // `buf.as_ptr()`, valid until the next `&mut` access to `buf`. 
unsafe impl - ConvertTo> for BlockTransposed + ConvertTo> + for BlockTransposedLayout where A: Architecture, SliceCast: for<'a> Target2, diff --git a/diskann-quantization/src/multi_vector/distance/kernels/mod.rs b/diskann-quantization/src/multi_vector/distance/kernels/mod.rs index bd9121a24..e7fbd16ed 100644 --- a/diskann-quantization/src/multi_vector/distance/kernels/mod.rs +++ b/diskann-quantization/src/multi_vector/distance/kernels/mod.rs @@ -12,27 +12,45 @@ //! - **Query**: Block-transposed (`GROUP` vectors per block, dimensions contiguous //! within each block). The block size is determined by the kernel's `A_PANEL`. //! - **Document**: Row-major (standard [`MatRef`](crate::multi_vector::MatRef) format). +//! +//! The [`Kernel`] trait + [`tiled_reduce`] + [`layouts`] are public so +//! external crates can plug new micro-kernels into the existing orchestrator. pub(super) mod f16; pub(super) mod f32; -mod layouts; +pub mod layouts; mod reduce; mod tiled_reduce; +pub use tiled_reduce::tiled_reduce; + // ── Tile budget ────────────────────────────────────────────────── /// Cache budgets fed to the tile planner. /// /// `Default` returns the production budgets derived from hardcoded L1/L2 -/// cache-size estimates and fixed fractions. +/// cache-size estimates and fixed fractions. Researchers benchmarking with +/// non-default cache assumptions can construct a custom [`TileBudget`] via +/// [`TileBudget::new`] and pass it to [`tiled_reduce`]. #[derive(Debug, Clone, Copy)] -struct TileBudget { +pub struct TileBudget { /// L2 budget in bytes reserved for A tiles. l2_a: usize, /// L1 budget in bytes reserved for B tiles (before A-panel subtraction). l1_b: usize, } +impl TileBudget { + /// Construct a [`TileBudget`] with explicit L2 and L1 byte budgets. + /// + /// `l2_a` is the budget the tile planner uses to size A tiles; `l1_b` + /// is the budget for B tiles (one A micro-panel is subtracted at + /// runtime since both must coexist in L1 during the inner loop). 
+ pub fn new(l2_a: usize, l1_b: usize) -> Self { + Self { l2_a, l1_b } + } +} + impl Default for TileBudget { // TODO: Replace hardcoded fallbacks with detected cache sizes // (e.g. via `diskann_platform`, env-var override, or runtime query). @@ -51,7 +69,7 @@ impl Default for TileBudget { // ── Kernel trait ───────────────────────────────────────────────── -/// SIMD micro-kernel for the [`tiled_reduce`](tiled_reduce::tiled_reduce) loop. +/// SIMD micro-kernel for the [`tiled_reduce`] loop. /// /// The kernel only sees already-converted data: storage-layout to /// kernel-layout conversion is handled at tile boundaries by @@ -59,17 +77,32 @@ impl Default for TileBudget { /// pointers reference `::Element` / /// `::Element` directly. /// +/// # Invariant +/// +/// When pairing this kernel with the owning storage type +/// [`BlockTransposed`](crate::multi_vector::BlockTransposed) via +/// [`tiled_reduce`], the storage's `GROUP` const must equal this kernel's +/// [`A_PANEL`](Self::A_PANEL). The library's own f32 / f16 paths enforce +/// this with a `const { assert!(...) }` in `max_ip_kernel`; external +/// implementors must uphold it manually. +/// /// # Safety /// /// Implementors must respect the per-method `# Safety` contracts on /// [`full_panel`](Self::full_panel) and [`partial_panel`](Self::partial_panel). -unsafe trait Kernel { +/// Implementations should be validated under Miri: construct arch tokens +/// via `*::new_checked_miri()` in tests and gate Miri-unsupported +/// intrinsics with `#[cfg(not(miri))]`. +pub unsafe trait Kernel { /// Layout consumed by the A (left / query) side of the micro-kernel. type Left: layouts::Layout; /// Layout consumed by the B (right / document) side of the micro-kernel. type Right: layouts::Layout; /// Number of A rows processed per micro-kernel invocation. + /// + /// Callers of [`tiled_reduce`] must guarantee + /// `a_padded_nrows % A_PANEL == 0`. 
const A_PANEL: usize; /// Number of B rows processed per micro-kernel invocation. const B_PANEL: usize; @@ -79,10 +112,13 @@ unsafe trait Kernel { /// # Safety /// /// * `a` must point to `A_PANEL * k` contiguous elements of - /// `::Element`. + /// `::Element`, properly aligned for that layout. /// * `b` must point to `B_PANEL * k` contiguous elements of - /// `::Element`. + /// `::Element`, properly aligned for that layout. /// * `r` must point to at least `A_PANEL` writable `f32` values. + /// * `k > 0`. + /// * The caller must invoke this from within an `arch.run3` (or + /// equivalent) so that target_feature is active for the entire body. unsafe fn full_panel( arch: A, a: *const ::Element, @@ -95,11 +131,8 @@ unsafe trait Kernel { /// /// # Safety /// - /// * `a` must point to `A_PANEL * k` contiguous elements of - /// `::Element`. - /// * `b` must point to `remainder * k` contiguous elements of - /// `::Element`. - /// * `r` must point to at least `A_PANEL` writable `f32` values. + /// Same as [`full_panel`](Self::full_panel) except `b` points to + /// `remainder * k` contiguous elements and `1 <= remainder < B_PANEL`. unsafe fn partial_panel( arch: A, remainder: usize, diff --git a/diskann-quantization/src/multi_vector/distance/kernels/tiled_reduce.rs b/diskann-quantization/src/multi_vector/distance/kernels/tiled_reduce.rs index ff873c01f..285d823b6 100644 --- a/diskann-quantization/src/multi_vector/distance/kernels/tiled_reduce.rs +++ b/diskann-quantization/src/multi_vector/distance/kernels/tiled_reduce.rs @@ -89,7 +89,7 @@ impl FullReduce { /// * `b_ptr` must be valid for `b_nrows * k` elements of `BElem`. /// * `scratch` must have length ≥ `a_padded_nrows` and be initialized by caller. 
#[allow(clippy::too_many_arguments)] -pub(super) unsafe fn tiled_reduce( +pub unsafe fn tiled_reduce( arch: A, ca: &LA, cb: &LB, @@ -343,7 +343,7 @@ mod tests { let b = vec![0.0f32; 2 * k]; let mut scratch = vec![f32::MIN; 16]; - let ca = layouts::BlockTransposed::::new(); + let ca = layouts::BlockTransposedLayout::::new(); let cb = layouts::RowMajor::::new(); // SAFETY: pointers and scratch are correctly sized; we expect a panic. @@ -373,7 +373,7 @@ mod tests { let b = Vec::::new(); let mut scratch = vec![f32::MIN; a_rows]; - let ca = layouts::BlockTransposed::::new(); + let ca = layouts::BlockTransposedLayout::::new(); let cb = layouts::RowMajor::::new(); // SAFETY: k == 0 so no elements are read; pointers are never dereferenced. @@ -402,7 +402,7 @@ mod tests { let a_rows = 8; let mut scratch = vec![f32::MIN; a_rows]; - let ca = layouts::BlockTransposed::::new(); + let ca = layouts::BlockTransposedLayout::::new(); let cb = layouts::RowMajor::::new(); // SAFETY: k == 0, b_nrows == 0; no elements read. @@ -516,7 +516,7 @@ mod tests { A: Architecture, T: Copy + Default, F32Kernel: Kernel, - layouts::BlockTransposed: + layouts::BlockTransposedLayout: ConvertTo as Kernel>::Left> + Layout, layouts::RowMajor: ConvertTo as Kernel>::Right> + Layout, @@ -698,7 +698,7 @@ mod tests { A: Architecture, T: Copy + Default, F32Kernel: Kernel, - layouts::BlockTransposed: + layouts::BlockTransposedLayout: ConvertTo as Kernel>::Left> + Layout, layouts::RowMajor: ConvertTo as Kernel>::Right> + Layout, diff --git a/diskann-quantization/src/multi_vector/distance/mod.rs b/diskann-quantization/src/multi_vector/distance/mod.rs index 853f60753..354a47afe 100644 --- a/diskann-quantization/src/multi_vector/distance/mod.rs +++ b/diskann-quantization/src/multi_vector/distance/mod.rs @@ -50,10 +50,10 @@ //! 
``` mod fallback; -mod kernels; +pub mod kernels; mod max_sim; mod query_computer; pub use fallback::QueryMatRef; pub use max_sim::{Chamfer, MaxSim, MaxSimError}; -pub use query_computer::QueryComputer; +pub use query_computer::{DynQueryComputer, QueryComputer}; diff --git a/diskann-quantization/src/multi_vector/distance/query_computer/f16.rs b/diskann-quantization/src/multi_vector/distance/query_computer/f16.rs index 9bb348a6a..cb575c09f 100644 --- a/diskann-quantization/src/multi_vector/distance/query_computer/f16.rs +++ b/diskann-quantization/src/multi_vector/distance/query_computer/f16.rs @@ -19,6 +19,18 @@ impl QueryComputer { pub fn new(query: MatRef<'_, Standard>) -> Self { diskann_wide::arch::dispatch1_no_features(BuildComputer, query) } + + /// Build an f16 query computer pinned to a specific architecture token. + /// + /// See [`QueryComputer::::from_arch`] for the rationale and usage. + pub fn from_arch(query: MatRef<'_, Standard>, arch: A) -> Self + where + A: Architecture, + BuildComputer: + for<'a> diskann_wide::arch::Target1>>, + { + arch.run1(BuildComputer, query) + } } impl DynQueryComputer @@ -51,8 +63,9 @@ where } } +/// Architecture-dispatch target for `QueryComputer::` construction. #[derive(Debug, Clone, Copy)] -pub(super) struct BuildComputer; +pub struct BuildComputer; impl diskann_wide::arch::Target1, MatRef<'_, Standard>> for BuildComputer diff --git a/diskann-quantization/src/multi_vector/distance/query_computer/f32.rs b/diskann-quantization/src/multi_vector/distance/query_computer/f32.rs index 9ff16b8b4..f8193ad7e 100644 --- a/diskann-quantization/src/multi_vector/distance/query_computer/f32.rs +++ b/diskann-quantization/src/multi_vector/distance/query_computer/f32.rs @@ -19,6 +19,20 @@ impl QueryComputer { pub fn new(query: MatRef<'_, Standard>) -> Self { diskann_wide::arch::dispatch1_no_features(BuildComputer, query) } + + /// Build an f32 query computer pinned to a specific architecture token. 
+ /// + /// The caller obtains the token via `Scalar::new()` (always available) + /// or `V3::new_checked()` / `V4::new_checked()` / `Neon::new_checked()` + /// (which check CPU support). Use this constructor to A/B compare kernels + /// across ISAs on the same machine. + pub fn from_arch(query: MatRef<'_, Standard>, arch: A) -> Self + where + A: Architecture, + BuildComputer: for<'a> diskann_wide::arch::Target1>>, + { + arch.run1(BuildComputer, query) + } } impl DynQueryComputer for Prepared> @@ -50,8 +64,9 @@ where } } +/// Architecture-dispatch target for `QueryComputer::` construction. #[derive(Debug, Clone, Copy)] -pub(super) struct BuildComputer; +pub struct BuildComputer; impl diskann_wide::arch::Target1, MatRef<'_, Standard>> for BuildComputer diff --git a/diskann-quantization/src/multi_vector/distance/query_computer/mod.rs b/diskann-quantization/src/multi_vector/distance/query_computer/mod.rs index fbe84fcd3..efd552076 100644 --- a/diskann-quantization/src/multi_vector/distance/query_computer/mod.rs +++ b/diskann-quantization/src/multi_vector/distance/query_computer/mod.rs @@ -43,6 +43,16 @@ pub struct QueryComputer { } impl QueryComputer { + /// Wrap any [`DynQueryComputer`] implementation as a `QueryComputer`. + /// + /// This is the public seam for experimental kernels: implement + /// [`DynQueryComputer`] on a custom struct, then wrap it here so the + /// existing [`QueryComputer::max_sim`] / [`QueryComputer::chamfer`] veneer + /// works against it. + pub fn from_dyn(inner: Box>) -> Self { + Self { inner } + } + /// Number of logical (non-padded) query vectors. #[inline] pub fn nrows(&self) -> usize { @@ -88,8 +98,27 @@ impl QueryComputer { } } -trait DynQueryComputer: std::fmt::Debug + Send + Sync { +/// Object-safe interface for "anything that can compute MaxSim for a [`QueryComputer`]". +/// +/// The library's own architecture-dispatched path implements this on the +/// internal `Prepared` carriers. 
External crates implement it on their +/// own structs and wrap via [`QueryComputer::from_dyn`]. +/// +/// # Contract +/// +/// - [`compute_max_sim`](Self::compute_max_sim) is only invoked by +/// [`QueryComputer::max_sim`], which has already asserted +/// `scores.len() == self.nrows()` and short-circuited the zero-doc case. +/// Implementations may rely on `scores.len() == self.nrows()` and +/// `doc.num_vectors() > 0`. +/// - Implementations must populate all `nrows()` entries of `scores`. +/// [`QueryComputer::chamfer`] sums every entry, so leaving any trailing +/// slot unwritten would silently corrupt the result. +pub trait DynQueryComputer: std::fmt::Debug + Send + Sync { + /// Compute MaxSim into `scores`. See trait-level docs for the contract. fn compute_max_sim(&self, doc: MatRef<'_, Standard>, scores: &mut [f32]); + + /// Number of query rows. fn nrows(&self) -> usize; } @@ -287,4 +316,38 @@ mod tests { test_matches_fallback!(f32, f32, 1e-10, "f32 "); test_matches_fallback!(f16, half::f16, 1e-10, "f16 "); + + // ============================================================ + // from_dyn: wrap a trivial custom DynQueryComputer. 
+ // ============================================================ + #[derive(Debug)] + struct ConstantComputer { + nrows: usize, + value: f32, + } + + impl DynQueryComputer for ConstantComputer { + fn compute_max_sim(&self, _doc: MatRef<'_, Standard>, scores: &mut [f32]) { + for s in scores.iter_mut() { + *s = self.value; + } + } + fn nrows(&self) -> usize { + self.nrows + } + } + + #[test] + fn from_dyn_wraps_custom_impl() { + let computer = QueryComputer::::from_dyn(Box::new(ConstantComputer { + nrows: 3, + value: -1.5, + })); + assert_eq!(computer.nrows(), 3); + + let doc = make_mat(&[1.0f32, 0.0, 0.0, 1.0], 2, 2); + let mut scores = vec![0.0f32; 3]; + computer.max_sim(doc, &mut scores); + assert_eq!(scores, vec![-1.5, -1.5, -1.5]); + } } diff --git a/diskann-quantization/src/multi_vector/matrix.rs b/diskann-quantization/src/multi_vector/matrix.rs index bcbafaaa3..734cb7247 100644 --- a/diskann-quantization/src/multi_vector/matrix.rs +++ b/diskann-quantization/src/multi_vector/matrix.rs @@ -244,18 +244,6 @@ pub unsafe trait NewOwned: ReprOwned { #[derive(Debug, Clone, Copy)] pub struct Defaulted; -/// An initializer argument to [`NewOwned`] that invokes the wrapped closure for each -/// element. -/// -/// # Example -/// ``` -/// use diskann_quantization::multi_vector::{Init, Mat, Standard}; -/// let mut n = 0; -/// let mat = Mat::new(Standard::::new(1, 4).unwrap(), Init(|| { n += 1; n })).unwrap(); -/// assert_eq!(mat.as_slice(), &[1, 2, 3, 4]); -/// ``` -pub struct Init(pub F); - /// Create a new [`Mat`] cloned from a view. pub trait NewCloned: ReprOwned { /// Clone the contents behind `v`, returning a new owning [`Mat`]. @@ -526,22 +514,6 @@ where } } -// SAFETY: The implementation uses guarantees from `Box` to ensure that the pointer -// initialized by it is non-null and properly aligned to the underlying type. 
-unsafe impl NewOwned> for Standard -where - T: Copy, - F: FnMut() -> T, -{ - type Error = crate::error::Infallible; - fn new_owned(self, mut init: Init) -> Result, Self::Error> { - let b: Box<[T]> = (0..self.num_elements()).map(|_| (init.0)()).collect(); - - // SAFETY: By construction, `b` has length `self.num_elements()`. - Ok(unsafe { self.box_to_mat(b) }) - } -} - // SAFETY: This checks that the slice has the correct length, which is all that is // required for [`Repr`]. unsafe impl NewRef for Standard @@ -740,6 +712,22 @@ impl Clone for Mat { } impl Mat> { + /// Create a new matrix by invoking `f` once per element in row-major order. + /// + /// # Example + /// ``` + /// use diskann_quantization::multi_vector::{Mat, Standard}; + /// let mut n = 0; + /// let mat = Mat::from_fn(Standard::::new(1, 4).unwrap(), || { n += 1; n }); + /// assert_eq!(mat.as_slice(), &[1, 2, 3, 4]); + /// ``` + pub fn from_fn T>(repr: Standard, mut f: F) -> Self { + let b: Box<[T]> = (0..repr.num_elements()).map(|_| f()).collect(); + + // SAFETY: By construction, `b` has length `repr.num_elements()`. + unsafe { repr.box_to_mat(b) } + } + /// Returns the raw dimension (columns) of the vectors in the matrix. 
#[inline] pub fn vector_dim(&self) -> usize { @@ -1796,17 +1784,13 @@ mod tests { } #[test] - fn test_standard_new_owned_with_init() { + fn test_standard_from_fn() { let mut counter: i32 = 0; - let m = Mat::new( - Standard::::new(2, 3).unwrap(), - Init(|| { - let v = counter; - counter += 1; - v - }), - ) - .unwrap(); + let m = Mat::from_fn(Standard::::new(2, 3).unwrap(), || { + let v = counter; + counter += 1; + v + }); assert_eq!(m.as_slice(), &[0, 1, 2, 3, 4, 5]); } diff --git a/diskann-quantization/src/multi_vector/mod.rs b/diskann-quantization/src/multi_vector/mod.rs index 1d765bacc..3670b1aaf 100644 --- a/diskann-quantization/src/multi_vector/mod.rs +++ b/diskann-quantization/src/multi_vector/mod.rs @@ -74,6 +74,6 @@ pub(crate) mod matrix; pub use block_transposed::{BlockTransposed, BlockTransposedMut, BlockTransposedRef}; pub use distance::{Chamfer, MaxSim, MaxSimError, QueryComputer, QueryMatRef}; pub use matrix::{ - Defaulted, Init, LayoutError, Mat, MatMut, MatRef, NewCloned, NewMut, NewOwned, NewRef, - Overflow, Repr, ReprMut, ReprOwned, SliceError, Standard, + Defaulted, LayoutError, Mat, MatMut, MatRef, NewCloned, NewMut, NewOwned, NewRef, Overflow, + Repr, ReprMut, ReprOwned, SliceError, Standard, }; From 94fd8de24a937750e0b1c0fa3b52fc2392e26587 Mon Sep 17 00:00:00 2001 From: Suryansh Gupta Date: Thu, 14 May 2026 21:23:30 +0530 Subject: [PATCH 10/13] Minor doc fix --- .../src/backend/multi_vector/experimental/template.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/diskann-benchmark/src/backend/multi_vector/experimental/template.rs b/diskann-benchmark/src/backend/multi_vector/experimental/template.rs index f09f0c74e..64eeb3e00 100644 --- a/diskann-benchmark/src/backend/multi_vector/experimental/template.rs +++ b/diskann-benchmark/src/backend/multi_vector/experimental/template.rs @@ -9,7 +9,7 @@ //! `Kernel` impl to your target ISA, and add an `Arch` variant + a //! `register_regression` call to wire it up. //! -//! 
# The 6-step workflow +//! # The 5-step workflow //! //! 1. **Add an [`Arch`](crate::inputs::multi_vector::Arch) variant** for your //! experimental kernel (e.g. `X86_64_V4_Wide`). The `#[non_exhaustive]` @@ -28,7 +28,9 @@ //! `library_kernels.rs` (e.g. `match_arch_x86_64!`) for your new variant. //! 5. **Add a `RunBenchmark` impl + `register_regression` call.** Use //! `Kernel::::new()` as the registered benchmark entry. -//! 6. **Validate under Miri.** See the section below. +//! +//! Then validate under Miri before treating the kernel as correct — see the +//! section below. //! //! # Validating under Miri (REQUIRED) //! From f51bc2a2b0ceb635be46c0773fdfdc90ab018169 Mon Sep 17 00:00:00 2001 From: Suryansh Gupta Date: Tue, 19 May 2026 03:07:22 +0530 Subject: [PATCH 11/13] MaxSim benchmark with BYOTE factory --- diskann-benchmark/Cargo.toml | 3 + diskann-benchmark/example/multi-vector.json | 47 +++ .../multi-vector-tolerance.json | 16 + .../perf_test_inputs/multi-vector.json | 149 ++++++++ diskann-benchmark/src/backend/mod.rs | 2 + .../src/backend/multi_vector/README.md | 43 +++ .../src/backend/multi_vector/driver.rs | 318 ++++++++++++++++ .../src/backend/multi_vector/kernels.rs | 220 +++++++++++ .../src/backend/multi_vector/mod.rs | 202 ++++++++++ diskann-benchmark/src/inputs/mod.rs | 1 + diskann-benchmark/src/inputs/multi_vector.rs | 156 ++++++++ diskann-benchmark/src/main.rs | 86 +++++ .../src/multi_vector/distance/factory.rs | 360 ++++++++++++++++++ .../src/multi_vector/distance/isa.rs | 62 +++ .../src/multi_vector/distance/kernel.rs | 53 +++ .../src/multi_vector/distance/kernels/mod.rs | 5 +- .../src/multi_vector/distance/mod.rs | 22 +- .../distance/query_computer/f16.rs | 100 ----- .../distance/query_computer/f32.rs | 101 ----- .../distance/query_computer/mod.rs | 290 -------------- .../src/multi_vector/matrix.rs | 7 + diskann-quantization/src/multi_vector/mod.rs | 9 +- 22 files changed, 1747 insertions(+), 505 deletions(-) create mode 100644 
diskann-benchmark/example/multi-vector.json create mode 100644 diskann-benchmark/perf_test_inputs/multi-vector-tolerance.json create mode 100644 diskann-benchmark/perf_test_inputs/multi-vector.json create mode 100644 diskann-benchmark/src/backend/multi_vector/README.md create mode 100644 diskann-benchmark/src/backend/multi_vector/driver.rs create mode 100644 diskann-benchmark/src/backend/multi_vector/kernels.rs create mode 100644 diskann-benchmark/src/backend/multi_vector/mod.rs create mode 100644 diskann-benchmark/src/inputs/multi_vector.rs create mode 100644 diskann-quantization/src/multi_vector/distance/factory.rs create mode 100644 diskann-quantization/src/multi_vector/distance/isa.rs create mode 100644 diskann-quantization/src/multi_vector/distance/kernel.rs delete mode 100644 diskann-quantization/src/multi_vector/distance/query_computer/f16.rs delete mode 100644 diskann-quantization/src/multi_vector/distance/query_computer/f32.rs delete mode 100644 diskann-quantization/src/multi_vector/distance/query_computer/mod.rs diff --git a/diskann-benchmark/Cargo.toml b/diskann-benchmark/Cargo.toml index bebaf4b8e..ecc3a53dd 100644 --- a/diskann-benchmark/Cargo.toml +++ b/diskann-benchmark/Cargo.toml @@ -63,6 +63,9 @@ scalar-quantization = [] # Enable minmax-quantization based algorithms minmax-quantization = [] +# Enable multi-vector MaxSim distance benchmarks +multi-vector = [] + # Enable Disk Index benchmarks disk-index = [ "diskann-disk/perf_test", diff --git a/diskann-benchmark/example/multi-vector.json b/diskann-benchmark/example/multi-vector.json new file mode 100644 index 000000000..af66a886d --- /dev/null +++ b/diskann-benchmark/example/multi-vector.json @@ -0,0 +1,47 @@ +{ + "search_directories": [], + "jobs": [ + { + "type": "multi-vector-op", + "content": { + "element_type": "float32", + "isa": "auto", + "runs": [ + { "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 2, "num_measurements": 1 }, + { "num_query_vectors": 32, 
"num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 2, "num_measurements": 1 } + ] + } + }, + { + "type": "multi-vector-op", + "content": { + "element_type": "float32", + "isa": "scalar", + "runs": [ + { "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 2, "num_measurements": 1 } + ] + } + }, + { + "type": "multi-vector-op", + "content": { + "element_type": "float32", + "isa": "reference", + "runs": [ + { "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 2, "num_measurements": 1 }, + { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 2, "num_measurements": 1 } + ] + } + }, + { + "type": "multi-vector-op", + "content": { + "element_type": "float16", + "isa": "auto", + "runs": [ + { "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 2, "num_measurements": 1 } + ] + } + } + ] +} diff --git a/diskann-benchmark/perf_test_inputs/multi-vector-tolerance.json b/diskann-benchmark/perf_test_inputs/multi-vector-tolerance.json new file mode 100644 index 000000000..8d5997199 --- /dev/null +++ b/diskann-benchmark/perf_test_inputs/multi-vector-tolerance.json @@ -0,0 +1,16 @@ +{ + "checks": [ + { + "input": { + "type": "multi-vector-op", + "content": {} + }, + "tolerance": { + "type": "multi-vector-tolerance", + "content": { + "min_time_regression": 0.05 + } + } + } + ] +} diff --git a/diskann-benchmark/perf_test_inputs/multi-vector.json b/diskann-benchmark/perf_test_inputs/multi-vector.json new file mode 100644 index 000000000..c4ce9bb8b --- /dev/null +++ b/diskann-benchmark/perf_test_inputs/multi-vector.json @@ -0,0 +1,149 @@ +{ + "search_directories": [], + "jobs": [ + { + "type": "multi-vector-op", + "content": { + "element_type": "float32", + "isa": "auto", + "runs": [ + { "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 }, + { "num_query_vectors": 16, "num_doc_vectors": 
64, "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 20, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 50, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 32, "dim": 512, "loops_per_measurement": 50, "num_measurements": 50 } + ] + } + }, + { + "type": "multi-vector-op", + "content": { + "element_type": "float32", + "isa": "scalar", + "runs": [ + { "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 }, + { "num_query_vectors": 16, "num_doc_vectors": 64, "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 20, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 50, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 }, + { 
"num_query_vectors": 32, "num_doc_vectors": 32, "dim": 512, "loops_per_measurement": 50, "num_measurements": 50 } + ] + } + }, + { + "type": "multi-vector-op", + "content": { + "element_type": "float32", + "isa": "x86-64-v3", + "runs": [ + { "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 }, + { "num_query_vectors": 16, "num_doc_vectors": 64, "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 20, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 50, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 32, "dim": 512, "loops_per_measurement": 50, "num_measurements": 50 } + ] + } + }, + { + "type": "multi-vector-op", + "content": { + "element_type": "float32", + "isa": "x86-64-v4", + "runs": [ + { "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 }, + { "num_query_vectors": 16, "num_doc_vectors": 64, "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 20, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, 
"loops_per_measurement": 50, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 32, "dim": 512, "loops_per_measurement": 50, "num_measurements": 50 } + ] + } + }, + { + "type": "multi-vector-op", + "content": { + "element_type": "float32", + "isa": "reference", + "runs": [ + { "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 }, + { "num_query_vectors": 16, "num_doc_vectors": 64, "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 20, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 50, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 32, "dim": 512, "loops_per_measurement": 50, "num_measurements": 50 } + ] + } + }, + { + "type": "multi-vector-op", + "content": { + "element_type": "float16", + "isa": "x86-64-v3", + "runs": [ + { "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 }, + { 
"num_query_vectors": 16, "num_doc_vectors": 64, "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 20, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 50, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 32, "dim": 512, "loops_per_measurement": 50, "num_measurements": 50 } + ] + } + }, + { + "type": "multi-vector-op", + "content": { + "element_type": "float16", + "isa": "x86-64-v4", + "runs": [ + { "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 }, + { "num_query_vectors": 16, "num_doc_vectors": 64, "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 20, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 50, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 128, 
"loops_per_measurement": 200, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 32, "dim": 512, "loops_per_measurement": 50, "num_measurements": 50 } + ] + } + }, + { + "type": "multi-vector-op", + "content": { + "element_type": "float16", + "isa": "reference", + "runs": [ + { "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 }, + { "num_query_vectors": 16, "num_doc_vectors": 64, "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 20, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 50, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 32, "dim": 512, "loops_per_measurement": 50, "num_measurements": 50 } + ] + } + } + ] +} diff --git a/diskann-benchmark/src/backend/mod.rs b/diskann-benchmark/src/backend/mod.rs index 8396577e8..d04bae158 100644 --- a/diskann-benchmark/src/backend/mod.rs +++ b/diskann-benchmark/src/backend/mod.rs @@ -9,11 +9,13 @@ mod disk_index; mod exhaustive; mod filters; mod index; +mod multi_vector; pub(crate) fn register_benchmarks(registry: &mut Registry) -> anyhow::Result<()> { exhaustive::register_benchmarks(registry)?; disk_index::register_benchmarks(registry)?; index::register_benchmarks(registry)?; filters::register_benchmarks(registry)?; + multi_vector::register_benchmarks(registry)?; 
Ok(()) } diff --git a/diskann-benchmark/src/backend/multi_vector/README.md b/diskann-benchmark/src/backend/multi_vector/README.md new file mode 100644 index 000000000..f24d38ff7 --- /dev/null +++ b/diskann-benchmark/src/backend/multi_vector/README.md @@ -0,0 +1,43 @@ +# Multi-vector benchmark — kernel-author workflow + +The multi-vector benchmark dispatches through `diskann-quantization`'s +`build_max_sim_f32` / `build_max_sim_f16` factory. Selection is driven by a +non-exhaustive `MaxSimIsa` enum. To add a new in-tree experimental kernel, +extend the enum + factory + the benchmark's shadow enum. + +## Steps + +1. **Library: variant + factory arm.** In + `diskann-quantization::multi_vector::distance`: + - Add a new variant to `MaxSimIsa` (in `isa.rs`). + - Implement `MaxSimKernel` for your kernel struct (in `factory.rs`, + next to `Prepared` and `ReferenceKernel`). + - Add a matching arm to `build_max_sim_f32` and/or `build_max_sim_f16` + that constructs your kernel and hands it to `erase.erase(...)`. + +2. **Benchmark: matching shadow variant.** In + `diskann-benchmark::inputs::multi_vector`: + - Add the same variant to `BenchIsa`. + - Add the matching arm to `From<BenchIsa> for MaxSimIsa`. + +3. **Run.** Set `"isa": "your-variant"` in the JSON job; the existing + `KernelF32` / `KernelF16` benchmark entries handle the rest. No new + `Benchmark` registration required. + +## Why two enums? + +`MaxSimIsa` (library) and `BenchIsa` (benchmark) are kept separate so the +library doesn't pin its public API on a serde version or a particular JSON +shape. The benchmark owns its kebab-case JSON layout; the library is +serde-agnostic. Mirroring variant-for-variant is intentional — small price +for keeping the library boundary clean. + +## Background + +The factory follows the BYOTE ("Bring your own type erasure") pattern +described in [RFC #1068]. If you want your kernel packaged as something +other than `Box<dyn MaxSimKernel<T>>` (e.g. 
composed with chamfer summing, +or wrapped in a custom thin trait), implement your own `Erase` and pass +it to the factory in place of `BoxErase`. + +[RFC #1068]: https://github.com/microsoft/DiskANN/pull/1068 diff --git a/diskann-benchmark/src/backend/multi_vector/driver.rs b/diskann-benchmark/src/backend/multi_vector/driver.rs new file mode 100644 index 000000000..c9ac8b488 --- /dev/null +++ b/diskann-benchmark/src/backend/multi_vector/driver.rs @@ -0,0 +1,318 @@ +/* + * Copyright (c) Microsoft Corporation. + * Licensed under the MIT license. + */ + +//! Shared benchmark infrastructure for multi-vector kernels. +//! +//! Houses the timing harness ([`run_loops`]), data fixtures ([`Data`]), result +//! types ([`RunResult`], [`Comparison`], [`CheckResult`]), and the trait-object +//! [`Distance`] boundary the driver dispatches through. None of the +//! contents are kernel-aware. + +use diskann_benchmark_runner::{ + utils::{ + fmt::Table, + num::{relative_change, NonNegativeFinite}, + percentiles, MicroSeconds, + }, + Any, CheckDeserialization, Checker, Input, +}; +use diskann_quantization::multi_vector::{Mat, MatRef, MaxSimKernel, Standard}; +use rand::{ + distr::{Distribution, StandardUniform}, + rngs::StdRng, + SeedableRng, +}; +use serde::{Deserialize, Serialize}; + +use crate::inputs::multi_vector::Run; + +////////////////////// +// Tolerance // +////////////////////// + +/// Tolerance thresholds for multi-vector benchmark regression detection. +/// +/// Each field specifies the maximum allowed relative increase in the corresponding metric. +/// For example, a value of `0.05` means a 5% increase is tolerated. 
+#[derive(Debug, Clone, Copy, Serialize, Deserialize)] +pub(super) struct MultiVectorTolerance { + pub(super) min_time_regression: NonNegativeFinite, +} + +impl CheckDeserialization for MultiVectorTolerance { + fn check_deserialization(&mut self, _checker: &mut Checker) -> Result<(), anyhow::Error> { + Ok(()) + } +} + +impl Input for MultiVectorTolerance { + fn tag() -> &'static str { + "multi-vector-tolerance" + } + + fn try_deserialize( + serialized: &serde_json::Value, + checker: &mut Checker, + ) -> anyhow::Result { + checker.any(Self::deserialize(serialized)?) + } + + fn example() -> anyhow::Result { + const EXAMPLE: NonNegativeFinite = match NonNegativeFinite::new(0.05) { + Ok(v) => v, + Err(_) => panic!("use a non-negative finite please"), + }; + + Ok(serde_json::to_value(MultiVectorTolerance { + min_time_regression: EXAMPLE, + })?) + } +} + +/////////////////// +// Data fixtures // +/////////////////// + +/// Random query / doc fixture for a single benchmark run. +pub(super) struct Data { + pub(super) queries: Mat>, + pub(super) docs: Mat>, +} + +impl Data +where + StandardUniform: Distribution, +{ + pub(super) fn new(run: &Run) -> Self { + let mut rng = StdRng::seed_from_u64(0x12345); + let queries = Mat::from_fn( + Standard::new(run.num_query_vectors.get(), run.dim.get()).unwrap(), + || StandardUniform.sample(&mut rng), + ); + let docs = Mat::from_fn( + Standard::new(run.num_doc_vectors.get(), run.dim.get()).unwrap(), + || StandardUniform.sample(&mut rng), + ); + Self { queries, docs } + } +} + +////////////////////// +// Distance trait // +////////////////////// + +/// Object-safe distance executor. The library factory's `Erase` visitor +/// already produces a `Box>`, but the driver wants its +/// own narrow trait so the kernel + its assertions are tucked inside one +/// vtable boundary. Simpler than threading `Box>` +/// generically through the timing harness. 
+pub(super) trait Distance { + fn max_sim(&self, doc: MatRef<'_, Standard>, scores: &mut [f32]); +} + +/// Distance executor wrapping a boxed `MaxSimKernel` from the library +/// factory. One vtable hop in the hot loop. +pub(super) struct BoxedKernel(pub(super) Box>); + +impl Distance for BoxedKernel { + fn max_sim(&self, doc: MatRef<'_, Standard>, scores: &mut [f32]) { + let nq = self.0.nrows(); + assert_eq!( + scores.len(), + nq, + "scores buffer not right size: {} != {}", + scores.len(), + nq + ); + if doc.num_vectors() == 0 { + return; + } + self.0.compute_max_sim(doc, scores); + } +} + +////////////////////// +// Timing harness // +////////////////////// + +fn run_loops(run: &Run, mut body: F) -> RunResult +where + F: FnMut(), +{ + let mut latencies = Vec::with_capacity(run.num_measurements.get()); + + for _ in 0..run.num_measurements.get() { + let start = std::time::Instant::now(); + for _ in 0..run.loops_per_measurement.get() { + body(); + } + latencies.push(start.elapsed().into()); + } + + let percentiles = percentiles::compute_percentiles(&mut latencies).unwrap(); + RunResult { + run: run.clone(), + latencies, + percentiles, + } +} + +/// Shared loop nest. The trait-object dispatch happens once per outer iteration +/// of `run_loops`; the work inside each `max_sim` call is O(Q·D·dim), so the +/// vtable hop is in the noise. 
+pub(super) fn run_with_distance( + run: &Run, + doc: MatRef<'_, Standard>, + dist: &dyn Distance, +) -> RunResult { + let mut scores = vec![0.0f32; run.num_query_vectors.get()]; + run_loops(run, || { + dist.max_sim(doc, &mut scores); + std::hint::black_box(&mut scores); + }) +} + +////////////////////// +// Result types // +////////////////////// + +#[derive(Debug, Clone, Copy)] +pub(super) struct DisplayWrapper<'a, T: ?Sized>(pub(super) &'a T); + +impl std::ops::Deref for DisplayWrapper<'_, T> { + type Target = T; + fn deref(&self) -> &T { + self.0 + } +} + +#[derive(Debug, Serialize, Deserialize)] +pub(super) struct RunResult { + /// The configuration for this run. + pub(super) run: Run, + /// Per-measurement latencies (over `loops_per_measurement` calls). + pub(super) latencies: Vec, + /// Latency percentiles. + pub(super) percentiles: percentiles::Percentiles, +} + +impl RunResult { + pub(super) fn computations_per_latency(&self) -> usize { + self.run.num_query_vectors.get() + * self.run.num_doc_vectors.get() + * self.run.loops_per_measurement.get() + } +} + +impl std::fmt::Display for DisplayWrapper<'_, [RunResult]> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if self.is_empty() { + return Ok(()); + } + + writeln!( + f, + "ns/IP = time per (query, doc) inner-product call (~ linear in Dim)" + )?; + + let header = [ + "Q", + "D", + "Dim", + "Min Time (ns/IP @ Dim)", + "Mean Time (ns/IP @ Dim)", + "Loops", + "Measurements", + ]; + + let mut table = Table::new(header, self.len()); + + self.iter().enumerate().for_each(|(row, r)| { + let mut row = table.row(row); + + let min_latency = r + .latencies + .iter() + .min() + .copied() + .unwrap_or(MicroSeconds::new(u64::MAX)); + let mean_latency = r.percentiles.mean; + + let computations_per_latency = r.computations_per_latency() as f64; + let min_time = min_latency.as_f64() / computations_per_latency * 1000.0; + let mean_time = mean_latency / computations_per_latency * 1000.0; + + 
row.insert(r.run.num_query_vectors, 0); + row.insert(r.run.num_doc_vectors, 1); + row.insert(r.run.dim, 2); + row.insert(format!("{:.3}", min_time), 3); + row.insert(format!("{:.3}", mean_time), 4); + row.insert(r.run.loops_per_measurement, 5); + row.insert(r.run.num_measurements, 6); + }); + + table.fmt(f) + } +} + +////////////////////// +// Regression Check // +////////////////////// + +/// Per-run comparison result showing before/after percentile differences. +#[derive(Debug, Serialize)] +pub(super) struct Comparison { + pub(super) run: Run, + pub(super) tolerance: MultiVectorTolerance, + pub(super) before_min: f64, + pub(super) after_min: f64, +} + +/// Aggregated result of the regression check across all runs. +#[derive(Debug, Serialize)] +pub(super) struct CheckResult { + pub(super) checks: Vec, +} + +impl std::fmt::Display for CheckResult { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let header = [ + "Q", + "D", + "Dim", + "Min Before (ns/IP @ Dim)", + "Min After (ns/IP @ Dim)", + "Change (%)", + "Remark", + ]; + + let mut table = Table::new(header, self.checks.len()); + + for (i, c) in self.checks.iter().enumerate() { + let mut row = table.row(i); + let change = relative_change(c.before_min, c.after_min); + + row.insert(c.run.num_query_vectors, 0); + row.insert(c.run.num_doc_vectors, 1); + row.insert(c.run.dim, 2); + row.insert(format!("{:.3}", c.before_min), 3); + row.insert(format!("{:.3}", c.after_min), 4); + match change { + Ok(change) => { + row.insert(format!("{:.3} %", change * 100.0), 5); + if change > c.tolerance.min_time_regression.get() { + row.insert("FAIL", 6); + } + } + Err(err) => { + row.insert("invalid", 5); + row.insert(err, 6); + } + } + } + + table.fmt(f) + } +} diff --git a/diskann-benchmark/src/backend/multi_vector/kernels.rs b/diskann-benchmark/src/backend/multi_vector/kernels.rs new file mode 100644 index 000000000..c9359b705 --- /dev/null +++ b/diskann-benchmark/src/backend/multi_vector/kernels.rs @@ 
-0,0 +1,220 @@ +/* + * Copyright (c) Microsoft Corporation. + * Licensed under the MIT license. + */ + +//! `Benchmark` impls for the multi-vector MaxSim factory. +//! +//! One entry per element type. Each `try_match` checks `element_type` only; +//! the `isa` field is passed to the library factory at run time. ISA +//! unavailability surfaces as `NotSupported`, which becomes a job-level +//! error. + +use std::io::Write; + +use diskann_benchmark_runner::{ + benchmark::{FailureScore, MatchScore, PassFail, Regression}, + utils::{datatype::AsDataType, num::relative_change}, + Benchmark, Checkpoint, Output, Registry, +}; +use diskann_quantization::multi_vector::{ + build_max_sim_f16, build_max_sim_f32, BoxErase, MaxSimKernel, +}; +use rand::distr::{Distribution, StandardUniform}; + +use super::driver::{ + run_with_distance, BoxedKernel, CheckResult, Comparison, Data, DisplayWrapper, + MultiVectorTolerance, RunResult, +}; +use crate::inputs::multi_vector::MultiVectorOp; + +// ───────────────────────────────────────────────────────────────────────── +// Per-element-type `Benchmark` carriers. +// ───────────────────────────────────────────────────────────────────────── + +#[derive(Debug)] +pub(super) struct KernelF32; + +#[derive(Debug)] +pub(super) struct KernelF16; + +/// Per-element-type bridge: factory entry name + factory call. +/// +/// Data-type matching (`DATA_TYPE`, `is_match`, `describe`) comes from the +/// framework's [`AsDataType`] trait, which is already implemented for `f32`, +/// `half::f16`, etc. 
+trait ElementType: AsDataType + Copy { + const ENTRY_NAME: &'static str; + fn build( + isa: diskann_quantization::multi_vector::MaxSimIsa, + query: diskann_quantization::multi_vector::MatRef< + '_, + diskann_quantization::multi_vector::Standard, + >, + ) -> Result>, diskann_quantization::multi_vector::NotSupported>; +} + +impl ElementType for f32 { + const ENTRY_NAME: &'static str = "multi-vector-op-f32"; + fn build( + isa: diskann_quantization::multi_vector::MaxSimIsa, + query: diskann_quantization::multi_vector::MatRef< + '_, + diskann_quantization::multi_vector::Standard, + >, + ) -> Result>, diskann_quantization::multi_vector::NotSupported> { + build_max_sim_f32(isa, query, BoxErase) + } +} + +impl ElementType for half::f16 { + const ENTRY_NAME: &'static str = "multi-vector-op-f16"; + fn build( + isa: diskann_quantization::multi_vector::MaxSimIsa, + query: diskann_quantization::multi_vector::MatRef< + '_, + diskann_quantization::multi_vector::Standard, + >, + ) -> Result>, diskann_quantization::multi_vector::NotSupported> + { + build_max_sim_f16(isa, query, BoxErase) + } +} + +fn run_benchmark(input: &MultiVectorOp) -> anyhow::Result> +where + StandardUniform: Distribution, +{ + let mut results = Vec::with_capacity(input.runs.len()); + for run in input.runs.iter() { + let data = Data::::new(run); + let kernel = T::build(input.isa.into(), data.queries.as_view())?; + let dist = BoxedKernel(kernel); + results.push(run_with_distance(run, data.docs.as_view(), &dist)); + } + Ok(results) +} + +// ───────────────────────────────────────────────────────────────────────── +// Benchmark + Regression impls. +// ───────────────────────────────────────────────────────────────────────── + +macro_rules! 
impl_benchmark { + ($ty:ident, $T:ty) => { + impl Benchmark for $ty + where + StandardUniform: Distribution<$T>, + { + type Input = MultiVectorOp; + type Output = Vec; + + fn try_match(&self, from: &MultiVectorOp) -> Result { + crate::utils::match_data_type::<$T>(from.element_type) + } + + fn run( + &self, + input: &MultiVectorOp, + _: Checkpoint<'_>, + mut output: &mut dyn Output, + ) -> anyhow::Result { + writeln!(output, "{}", input)?; + let results = run_benchmark::<$T>(input)?; + writeln!(output, "\n\n{}", DisplayWrapper(&*results))?; + Ok(results) + } + + fn description( + &self, + f: &mut std::fmt::Formatter<'_>, + input: Option<&MultiVectorOp>, + ) -> std::fmt::Result { + match input { + None => writeln!(f, "- Element Type: {}", <$T as AsDataType>::DATA_TYPE)?, + Some(input) => { + let desc = <$T as AsDataType>::describe(input.element_type); + if !desc.is_match() { + writeln!(f, "\n - Mismatched element type: {}", desc)?; + } + } + } + Ok(()) + } + } + + impl Regression for $ty + where + StandardUniform: Distribution<$T>, + { + type Tolerances = MultiVectorTolerance; + type Pass = CheckResult; + type Fail = CheckResult; + + fn check( + &self, + tolerance: &MultiVectorTolerance, + _input: &MultiVectorOp, + before: &Vec, + after: &Vec, + ) -> anyhow::Result> { + anyhow::ensure!( + before.len() == after.len(), + "before has {} runs but after has {}", + before.len(), + after.len(), + ); + + let mut passed = true; + let checks: Vec = std::iter::zip(before.iter(), after.iter()) + .enumerate() + .map(|(i, (b, a))| { + anyhow::ensure!(b.run == a.run, "run {i} mismatched"); + + let computations_per_latency = b.computations_per_latency() as f64; + let before_min = + b.percentiles.minimum.as_f64() * 1000.0 / computations_per_latency; + let after_min = + a.percentiles.minimum.as_f64() * 1000.0 / computations_per_latency; + + let comparison = Comparison { + run: b.run.clone(), + tolerance: *tolerance, + before_min, + after_min, + }; + + match relative_change(before_min, 
after_min) { + Ok(change) => { + if change > tolerance.min_time_regression.get() { + passed = false; + } + } + Err(_) => passed = false, + }; + + Ok(comparison) + }) + .collect::>>()?; + + Ok(if passed { + PassFail::Pass(CheckResult { checks }) + } else { + PassFail::Fail(CheckResult { checks }) + }) + } + } + }; +} + +impl_benchmark!(KernelF32, f32); +impl_benchmark!(KernelF16, half::f16); + +// ───────────────────────────────────────────────────────────────────────── +// Registration. +// ───────────────────────────────────────────────────────────────────────── + +pub(super) fn register(registry: &mut Registry) -> anyhow::Result<()> { + registry.register_regression(::ENTRY_NAME, KernelF32)?; + registry.register_regression(::ENTRY_NAME, KernelF16)?; + Ok(()) +} diff --git a/diskann-benchmark/src/backend/multi_vector/mod.rs b/diskann-benchmark/src/backend/multi_vector/mod.rs new file mode 100644 index 000000000..c3ffffeaf --- /dev/null +++ b/diskann-benchmark/src/backend/multi_vector/mod.rs @@ -0,0 +1,202 @@ +/* + * Copyright (c) Microsoft Corporation. + * Licensed under the MIT license. + */ + +//! Multi-vector MaxSim distance benchmarks with regression detection. +//! +//! Registers one `Benchmark` entry per supported element type; the JSON `isa` +//! field selects the kernel at run time via the library's +//! [`build_max_sim_f32`] / [`build_max_sim_f16`] factories. +//! +//! See [`README.md`](./README.md) for the in-tree workflow when authoring a new +//! experimental kernel. +//! +//! [`build_max_sim_f32`]: diskann_quantization::multi_vector::build_max_sim_f32 +//! [`build_max_sim_f16`]: diskann_quantization::multi_vector::build_max_sim_f16 + +use diskann_benchmark_runner::Registry; + +cfg_if::cfg_if! 
{ + if #[cfg(feature = "multi-vector")] { + mod driver; + mod kernels; + + pub(super) fn register_benchmarks(registry: &mut Registry) -> anyhow::Result<()> { + kernels::register(registry) + } + } else { + crate::utils::stub_impl!("multi-vector", inputs::multi_vector::MultiVectorOp); + + pub(super) fn register_benchmarks(registry: &mut Registry) -> anyhow::Result<()> { + imp::register("multi-vector-op", registry) + } + } +} + +#[cfg(all(test, feature = "multi-vector"))] +mod tests { + use std::num::NonZeroUsize; + + use diskann_benchmark_runner::{ + benchmark::{PassFail, Regression}, + utils::{ + datatype::DataType, num::NonNegativeFinite, percentiles::compute_percentiles, + MicroSeconds, + }, + }; + + use super::driver::{CheckResult, Comparison, MultiVectorTolerance, RunResult}; + use super::kernels::KernelF32; + use crate::inputs::multi_vector::{BenchIsa, MultiVectorOp, Run}; + + fn tiny_run() -> Run { + Run { + num_query_vectors: NonZeroUsize::new(2).unwrap(), + num_doc_vectors: NonZeroUsize::new(2).unwrap(), + dim: NonZeroUsize::new(4).unwrap(), + loops_per_measurement: NonZeroUsize::new(1).unwrap(), + num_measurements: NonZeroUsize::new(1).unwrap(), + } + } + + fn tiny_op() -> MultiVectorOp { + MultiVectorOp { + element_type: DataType::Float32, + isa: BenchIsa::Auto, + runs: vec![tiny_run()], + } + } + + fn tiny_result(minimum: u64) -> RunResult { + let mut latencies = vec![MicroSeconds::new(minimum)]; + let percentiles = compute_percentiles(&mut latencies).unwrap(); + RunResult { + run: tiny_run(), + latencies, + percentiles, + } + } + + fn tolerance(limit: f64) -> MultiVectorTolerance { + MultiVectorTolerance { + min_time_regression: NonNegativeFinite::new(limit).unwrap(), + } + } + + #[test] + fn check_rejects_mismatched_runs() { + let kernel = KernelF32; + + // Build a result whose `run` diverges from `tiny_run()` so the + // regression check's `b.run == a.run` invariant fires. 
+ let mut latencies = vec![MicroSeconds::new(100)]; + let percentiles = compute_percentiles(&mut latencies).unwrap(); + let mismatched_result = RunResult { + run: Run { + num_query_vectors: NonZeroUsize::new(4).unwrap(), + ..tiny_run() + }, + latencies, + percentiles, + }; + + let err = kernel + .check( + &tolerance(0.0), + &tiny_op(), + &vec![tiny_result(100)], + &vec![mismatched_result], + ) + .unwrap_err(); + + assert_eq!(err.to_string(), "run 0 mismatched"); + } + + #[test] + fn check_allows_negative_relative_change() { + let kernel = KernelF32; + + let result = kernel + .check( + &tolerance(0.0), + &tiny_op(), + &vec![tiny_result(100)], + &vec![tiny_result(95)], + ) + .unwrap(); + + assert!(matches!(result, PassFail::Pass(_))); + } + + #[test] + fn check_passes_on_tolerance_boundary() { + let kernel = KernelF32; + + let result = kernel + .check( + &tolerance(0.05), + &tiny_op(), + &vec![tiny_result(100)], + &vec![tiny_result(105)], + ) + .unwrap(); + + assert!(matches!(result, PassFail::Pass(_))); + } + + #[test] + fn check_fails_above_tolerance_boundary() { + let kernel = KernelF32; + + let result = kernel + .check( + &tolerance(0.05), + &tiny_op(), + &vec![tiny_result(100)], + &vec![tiny_result(106)], + ) + .unwrap(); + + assert!(matches!(result, PassFail::Fail(_))); + } + + #[test] + fn check_result_display_includes_failure_details() { + let check = CheckResult { + checks: vec![Comparison { + run: tiny_run(), + tolerance: tolerance(0.05), + before_min: 100.0, + after_min: 106.0, + }], + }; + + let rendered = check.to_string(); + assert!(rendered.contains("Q"), "rendered = {rendered}"); + assert!(rendered.contains("Dim"), "rendered = {rendered}"); + assert!(rendered.contains("100.000"), "rendered = {rendered}"); + assert!(rendered.contains("106.000"), "rendered = {rendered}"); + assert!(rendered.contains("6.000 %"), "rendered = {rendered}"); + assert!(rendered.contains("FAIL"), "rendered = {rendered}"); + } + + /// A "before" value of 0 means the measurement 
was too fast to obtain a + /// reliable signal, so we *could* be letting a regression through. We + /// require at least a non-zero value. + #[test] + fn zero_values_rejected() { + let kernel = KernelF32; + + let result = kernel + .check( + &tolerance(0.05), + &tiny_op(), + &vec![tiny_result(0)], + &vec![tiny_result(0)], + ) + .unwrap(); + + assert!(matches!(result, PassFail::Fail(_))); + } +} diff --git a/diskann-benchmark/src/inputs/mod.rs b/diskann-benchmark/src/inputs/mod.rs index 7875beb1d..58c07aa00 100644 --- a/diskann-benchmark/src/inputs/mod.rs +++ b/diskann-benchmark/src/inputs/mod.rs @@ -7,6 +7,7 @@ pub(crate) mod disk; pub(crate) mod exhaustive; pub(crate) mod filters; pub(crate) mod graph_index; +pub(crate) mod multi_vector; pub(crate) mod save_and_load; /// Construct an example input of type `Self`. diff --git a/diskann-benchmark/src/inputs/multi_vector.rs b/diskann-benchmark/src/inputs/multi_vector.rs new file mode 100644 index 000000000..9d863c13a --- /dev/null +++ b/diskann-benchmark/src/inputs/multi_vector.rs @@ -0,0 +1,156 @@ +/* + * Copyright (c) Microsoft Corporation. + * Licensed under the MIT license. + */ + +use std::num::NonZeroUsize; + +use diskann_benchmark_runner::{utils::datatype::DataType, CheckDeserialization, Checker}; +use diskann_quantization::multi_vector::MaxSimIsa; +use serde::{Deserialize, Serialize}; + +use crate::inputs::{as_input, Example}; + +////////////// +// Registry // +////////////// + +as_input!(MultiVectorOp); + +//////////////// +// Enum types // +//////////////// + +/// JSON-facing shadow of [`MaxSimIsa`] from `diskann-quantization`. The +/// library's enum is deliberately not `Serialize`/`Deserialize` so it isn't +/// pinned to a particular JSON shape; this enum owns the kebab-case +/// serialization and converts to the library type at dispatch time. +/// +/// **Stays in sync with `MaxSimIsa` manually.** When the library adds a +/// variant, mirror it here + add a matching arm to `From for +/// MaxSimIsa`. 
+#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "kebab-case")] +#[non_exhaustive] +pub(crate) enum BenchIsa { + #[serde(rename = "x86-64-v4")] + #[allow(non_camel_case_types)] + X86_64_V4, + #[serde(rename = "x86-64-v3")] + #[allow(non_camel_case_types)] + X86_64_V3, + Neon, + Scalar, + Reference, + Auto, +} + +impl std::fmt::Display for BenchIsa { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let st = match self { + Self::X86_64_V4 => "x86-64-v4", + Self::X86_64_V3 => "x86-64-v3", + Self::Neon => "neon", + Self::Scalar => "scalar", + Self::Reference => "reference", + Self::Auto => "auto", + }; + write!(f, "{}", st) + } +} + +impl From for MaxSimIsa { + fn from(b: BenchIsa) -> Self { + match b { + BenchIsa::X86_64_V4 => MaxSimIsa::X86_64_V4, + BenchIsa::X86_64_V3 => MaxSimIsa::X86_64_V3, + BenchIsa::Neon => MaxSimIsa::Neon, + BenchIsa::Scalar => MaxSimIsa::Scalar, + BenchIsa::Reference => MaxSimIsa::Reference, + BenchIsa::Auto => MaxSimIsa::Auto, + } + } +} + +/// One benchmark configuration: a single shape measurement. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub(crate) struct Run { + pub(crate) num_query_vectors: NonZeroUsize, + pub(crate) num_doc_vectors: NonZeroUsize, + pub(crate) dim: NonZeroUsize, + pub(crate) loops_per_measurement: NonZeroUsize, + pub(crate) num_measurements: NonZeroUsize, +} + +/////////////////////// +// Multi-Vector Op // +/////////////////////// + +/// A complete multi-vector benchmark job. 
+#[derive(Debug, Serialize, Deserialize)] +pub(crate) struct MultiVectorOp { + pub(crate) element_type: DataType, + pub(crate) isa: BenchIsa, + pub(crate) runs: Vec, +} + +impl MultiVectorOp { + pub(crate) const fn tag() -> &'static str { + "multi-vector-op" + } +} + +impl CheckDeserialization for MultiVectorOp { + fn check_deserialization(&mut self, _checker: &mut Checker) -> Result<(), anyhow::Error> { + Ok(()) + } +} + +impl Example for MultiVectorOp { + fn example() -> Self { + const NUM_DOC_VECTORS: NonZeroUsize = NonZeroUsize::new(64).unwrap(); + const DIM: NonZeroUsize = NonZeroUsize::new(128).unwrap(); + const LOOPS_PER_MEASUREMENT: NonZeroUsize = NonZeroUsize::new(200).unwrap(); + const NUM_MEASUREMENTS: NonZeroUsize = NonZeroUsize::new(100).unwrap(); + + let runs = vec![ + Run { + num_query_vectors: NonZeroUsize::new(32).unwrap(), + num_doc_vectors: NUM_DOC_VECTORS, + dim: DIM, + loops_per_measurement: LOOPS_PER_MEASUREMENT, + num_measurements: NUM_MEASUREMENTS, + }, + Run { + num_query_vectors: NonZeroUsize::new(64).unwrap(), + num_doc_vectors: NUM_DOC_VECTORS, + dim: DIM, + loops_per_measurement: LOOPS_PER_MEASUREMENT, + num_measurements: NUM_MEASUREMENTS, + }, + ]; + + Self { + element_type: DataType::Float32, + isa: BenchIsa::Auto, + runs, + } + } +} + +macro_rules! 
write_field { + ($f:ident, $field:tt, $($expr:tt)*) => { + writeln!($f, "{:>18}: {}", $field, $($expr)*) + } +} + +impl std::fmt::Display for MultiVectorOp { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + writeln!(f, "Multi-Vector Operation\n")?; + write_field!(f, "tag", Self::tag())?; + write_field!(f, "element type", self.element_type)?; + write_field!(f, "isa", self.isa)?; + write_field!(f, "number of runs", self.runs.len())?; + Ok(()) + } +} diff --git a/diskann-benchmark/src/main.rs b/diskann-benchmark/src/main.rs index cc70120cd..c87a08e17 100644 --- a/diskann-benchmark/src/main.rs +++ b/diskann-benchmark/src/main.rs @@ -772,6 +772,92 @@ mod tests { assert!(!output_path.exists()); } + /////////////////// + // Multi-Vector // + /////////////////// + + #[test] + fn multi_vector_integration() { + let path = example_directory().join("multi-vector.json"); + let tempdir = tempfile::tempdir().unwrap(); + let output_path = tempdir.path().join("output.json"); + assert!(!output_path.exists()); + + let modified_input_path = tempdir.path().join("input.json"); + + let mut raw = value_from_file(&path); + prefix_search_directories(&mut raw, &root_directory()); + save_to_file(&modified_input_path, &raw); + + run_multi_vector_integration(&modified_input_path, &output_path) + } + + #[cfg(feature = "multi-vector")] + fn run_multi_vector_integration(input_path: &std::path::Path, output_path: &std::path::Path) { + let command = Commands::Run { + input_file: input_path.to_owned(), + output_file: output_path.to_owned(), + dry_run: false, + allow_debug: true, + }; + + let cli = Cli::from_commands(command, true); + let mut output = Memory::new(); + + cli.run(&mut output).unwrap(); + println!( + "output = {}", + String::from_utf8(output.into_inner()).unwrap() + ); + + // Check that the results file is generated. 
+ assert!(output_path.exists()); + } + + #[cfg(not(feature = "multi-vector"))] + fn run_multi_vector_integration(input_path: &std::path::Path, output_path: &std::path::Path) { + let command = Commands::Run { + input_file: input_path.to_owned(), + output_file: output_path.to_owned(), + dry_run: false, + allow_debug: true, + }; + let cli = Cli::from_commands(command, true); + let mut output = Memory::new(); + + let err = cli.run(&mut output).unwrap_err(); + println!("err = {:?}", err); + + let output = String::from_utf8(output.into_inner()).unwrap(); + assert!(output.contains("\"multi-vector\" feature")); + println!("output = {}", output); + + // The output file should not have been created because we failed the test. + assert!(!output_path.exists()); + } + + #[test] + #[cfg(feature = "multi-vector")] + fn multi_vector_check_verify() { + let input_path = example_directory().join("multi-vector.json"); + let tolerance_path = project_directory() + .join("perf_test_inputs") + .join("multi-vector-tolerance.json"); + + let command = Commands::Check(diskann_benchmark_runner::app::Check::Verify { + tolerances: tolerance_path, + input_file: input_path, + }); + + let cli = Cli::from_commands(command, true); + let mut output = Memory::new(); + cli.run(&mut output).unwrap(); + println!( + "output = {}", + String::from_utf8(output.into_inner()).unwrap() + ); + } + #[test] fn quiet_suppresses_check_target_warning() { let cli = Cli::from_commands(Commands::Skeleton, true); diff --git a/diskann-quantization/src/multi_vector/distance/factory.rs b/diskann-quantization/src/multi_vector/distance/factory.rs new file mode 100644 index 000000000..78d15273e --- /dev/null +++ b/diskann-quantization/src/multi_vector/distance/factory.rs @@ -0,0 +1,360 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +//! Factory + concrete `MaxSimKernel` implementations for the multi-vector +//! distance API. 
See [`build_max_sim_f32`] / [`build_max_sim_f16`] for the +//! BYOTE entry points. + +use diskann_utils::Reborrow; +use diskann_vector::distance::InnerProduct; +use diskann_vector::{DistanceFunctionMut, PureDistanceFunction}; +use diskann_wide::Architecture; +use diskann_wide::arch::Scalar; +#[cfg(target_arch = "aarch64")] +use diskann_wide::arch::aarch64::Neon; +#[cfg(target_arch = "x86_64")] +use diskann_wide::arch::x86_64::{V3, V4}; + +use super::isa::{MaxSimIsa, NotSupported}; +use super::kernel::{Erase, MaxSimKernel}; +use super::kernels::f16::F16Entry; +use super::kernels::f32::F32Kernel; +use super::max_sim::MaxSim; +use crate::multi_vector::distance::QueryMatRef; +use crate::multi_vector::{BlockTransposed, BlockTransposedRef, Mat, MatRef, Standard}; + +// ───────────────────────────────────────────────────────────────────────── +// Prepared — concrete kernel for the arch-dispatched paths. +// ───────────────────────────────────────────────────────────────────────── + +/// Concrete kernel: owns an arch token and a block-transposed prepared query. +/// One generic `MaxSimKernel` impl covers every arch (Scalar/V3/V4/Neon) +/// for every supported element type (f32, f16) via the `Kernel` / `Target3` +/// dispatch in the `kernels` module. 
+#[derive(Debug)] +struct Prepared { + arch: A, + prepared: Q, +} + +impl MaxSimKernel for Prepared> +where + A: Architecture, + F32Kernel: for<'a> diskann_wide::arch::Target3< + A, + (), + BlockTransposedRef<'a, f32, GROUP>, + MatRef<'a, Standard>, + &'a mut [f32], + >, +{ + fn nrows(&self) -> usize { + self.prepared.nrows() + } + + fn compute_max_sim(&self, doc: MatRef<'_, Standard>, scores: &mut [f32]) { + let mut scratch = vec![f32::MIN; self.prepared.padded_nrows()]; + self.arch.run3( + F32Kernel::, + self.prepared.reborrow(), + doc, + &mut scratch, + ); + for (dst, &src) in scores.iter_mut().zip(&scratch[..self.prepared.nrows()]) { + *dst = -src; + } + } +} + +impl MaxSimKernel + for Prepared> +where + A: Architecture, + F16Entry: for<'a> diskann_wide::arch::Target3< + A, + (), + BlockTransposedRef<'a, half::f16, GROUP>, + MatRef<'a, Standard>, + &'a mut [f32], + >, +{ + fn nrows(&self) -> usize { + self.prepared.nrows() + } + + fn compute_max_sim(&self, doc: MatRef<'_, Standard>, scores: &mut [f32]) { + let mut scratch = vec![f32::MIN; self.prepared.padded_nrows()]; + self.arch.run3( + F16Entry::, + self.prepared.reborrow(), + doc, + &mut scratch, + ); + for (dst, &src) in scores.iter_mut().zip(&scratch[..self.prepared.nrows()]) { + *dst = -src; + } + } +} + +// ───────────────────────────────────────────────────────────────────────── +// ReferenceKernel — non-SIMD fallback that wraps MaxSim::evaluate. +// ───────────────────────────────────────────────────────────────────────── + +/// `MaxSimIsa::Reference` path. Owns the query as a `Mat>` and +/// delegates to the existing `MaxSim` fallback per `compute_max_sim` call. 
+struct ReferenceKernel { + query: Mat>, +} + +impl std::fmt::Debug for ReferenceKernel { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ReferenceKernel") + .field("nrows", &self.query.num_vectors()) + .finish() + } +} + +impl ReferenceKernel { + fn new(query: MatRef<'_, Standard>) -> Self { + let repr = *query.repr(); + let src = query.as_slice(); + let mut idx = 0usize; + let owned = Mat::>::from_fn(repr, || { + let v = src[idx]; + idx += 1; + v + }); + Self { query: owned } + } +} + +impl MaxSimKernel for ReferenceKernel +where + T: Copy + Send + Sync + std::fmt::Debug + 'static, + InnerProduct: for<'a, 'b> PureDistanceFunction<&'a [T], &'b [T], f32>, +{ + fn nrows(&self) -> usize { + self.query.num_vectors() + } + + fn compute_max_sim(&self, doc: MatRef<'_, Standard>, scores: &mut [f32]) { + if scores.is_empty() { + return; + } + let query: QueryMatRef<'_, Standard> = self.query.as_view().into(); + let Ok(mut max_sim) = MaxSim::new(scores) else { + return; + }; + let _ = max_sim.evaluate(query, doc); + } +} + +// ───────────────────────────────────────────────────────────────────────── +// BuildAndErase — Target1 impls used by `dispatch1_no_features` (Auto). +// ───────────────────────────────────────────────────────────────────────── + +/// Internal Target1 carrier used only by the `MaxSimIsa::Auto` arm of +/// `build_max_sim_*`. `dispatch1_no_features` picks the highest available +/// arch on the host CPU and calls the matching `Target1::run` below. 
+struct BuildAndErase(E); + +// ───── f32 Target1 impls ───── + +impl> diskann_wide::arch::Target1>> + for BuildAndErase +{ + fn run(self, arch: Scalar, query: MatRef<'_, Standard>) -> E::Output { + let prepared = BlockTransposed::::from_matrix_view(query.as_matrix_view()); + self.0.erase(Prepared { arch, prepared }) + } +} + +#[cfg(target_arch = "x86_64")] +impl> diskann_wide::arch::Target1>> + for BuildAndErase +{ + fn run(self, arch: V3, query: MatRef<'_, Standard>) -> E::Output { + let prepared = BlockTransposed::::from_matrix_view(query.as_matrix_view()); + self.0.erase(Prepared { arch, prepared }) + } +} + +#[cfg(target_arch = "x86_64")] +impl> diskann_wide::arch::Target1>> + for BuildAndErase +{ + fn run(self, arch: V4, query: MatRef<'_, Standard>) -> E::Output { + // V4 has no dedicated kernel yet; retarget to V3. + let arch = arch.retarget(); + let prepared = BlockTransposed::::from_matrix_view(query.as_matrix_view()); + self.0.erase(Prepared { arch, prepared }) + } +} + +#[cfg(target_arch = "aarch64")] +impl> diskann_wide::arch::Target1>> + for BuildAndErase +{ + fn run(self, arch: Neon, query: MatRef<'_, Standard>) -> E::Output { + // Neon has no dedicated kernel yet; retarget to Scalar. 
+ let arch = arch.retarget(); + let prepared = BlockTransposed::::from_matrix_view(query.as_matrix_view()); + self.0.erase(Prepared { arch, prepared }) + } +} + +// ───── f16 Target1 impls ───── + +impl> + diskann_wide::arch::Target1>> + for BuildAndErase +{ + fn run(self, arch: Scalar, query: MatRef<'_, Standard>) -> E::Output { + let prepared = BlockTransposed::::from_matrix_view(query.as_matrix_view()); + self.0.erase(Prepared { arch, prepared }) + } +} + +#[cfg(target_arch = "x86_64")] +impl> + diskann_wide::arch::Target1>> + for BuildAndErase +{ + fn run(self, arch: V3, query: MatRef<'_, Standard>) -> E::Output { + let prepared = BlockTransposed::::from_matrix_view(query.as_matrix_view()); + self.0.erase(Prepared { arch, prepared }) + } +} + +#[cfg(target_arch = "x86_64")] +impl> + diskann_wide::arch::Target1>> + for BuildAndErase +{ + fn run(self, arch: V4, query: MatRef<'_, Standard>) -> E::Output { + let arch = arch.retarget(); + let prepared = BlockTransposed::::from_matrix_view(query.as_matrix_view()); + self.0.erase(Prepared { arch, prepared }) + } +} + +#[cfg(target_arch = "aarch64")] +impl> + diskann_wide::arch::Target1>> + for BuildAndErase +{ + fn run(self, arch: Neon, query: MatRef<'_, Standard>) -> E::Output { + let arch = arch.retarget(); + let prepared = BlockTransposed::::from_matrix_view(query.as_matrix_view()); + self.0.erase(Prepared { arch, prepared }) + } +} + +// ───────────────────────────────────────────────────────────────────────── +// Factory functions. +// ───────────────────────────────────────────────────────────────────────── + +/// Build a multi-vector MaxSim kernel for `f32` queries. +/// +/// Dispatches on `isa`, constructs the corresponding concrete kernel, and +/// hands it to `erase.erase(...)`. Returns [`NotSupported`] when the requested +/// ISA cannot run on this build (e.g. AVX-512 unavailable; aarch64 on x86_64). 
+pub fn build_max_sim_f32>( + isa: MaxSimIsa, + query: MatRef<'_, Standard>, + erase: E, +) -> Result { + match isa { + MaxSimIsa::Auto => Ok(diskann_wide::arch::dispatch1_no_features( + BuildAndErase(erase), + query, + )), + MaxSimIsa::Scalar => Ok(Scalar::new().run1(BuildAndErase(erase), query)), + #[cfg(target_arch = "x86_64")] + MaxSimIsa::X86_64_V3 => { + let arch = V3::new_checked().ok_or(NotSupported { + isa, + reason: "AVX2/FMA unavailable on this CPU", + })?; + Ok(arch.run1(BuildAndErase(erase), query)) + } + #[cfg(target_arch = "x86_64")] + MaxSimIsa::X86_64_V4 => { + let arch = V4::new_checked().ok_or(NotSupported { + isa, + reason: "AVX-512 unavailable on this CPU", + })?; + Ok(arch.run1(BuildAndErase(erase), query)) + } + #[cfg(not(target_arch = "x86_64"))] + MaxSimIsa::X86_64_V3 | MaxSimIsa::X86_64_V4 => Err(NotSupported { + isa, + reason: "x86_64 target only", + }), + #[cfg(target_arch = "aarch64")] + MaxSimIsa::Neon => { + let arch = Neon::new_checked().ok_or(NotSupported { + isa, + reason: "Neon unavailable on this CPU", + })?; + Ok(arch.run1(BuildAndErase(erase), query)) + } + #[cfg(not(target_arch = "aarch64"))] + MaxSimIsa::Neon => Err(NotSupported { + isa, + reason: "aarch64 target only", + }), + MaxSimIsa::Reference => Ok(erase.erase(ReferenceKernel::::new(query))), + } +} + +/// Build a multi-vector MaxSim kernel for `half::f16` queries. Same contract +/// as [`build_max_sim_f32`]. 
+pub fn build_max_sim_f16>( + isa: MaxSimIsa, + query: MatRef<'_, Standard>, + erase: E, +) -> Result { + match isa { + MaxSimIsa::Auto => Ok(diskann_wide::arch::dispatch1_no_features( + BuildAndErase(erase), + query, + )), + MaxSimIsa::Scalar => Ok(Scalar::new().run1(BuildAndErase(erase), query)), + #[cfg(target_arch = "x86_64")] + MaxSimIsa::X86_64_V3 => { + let arch = V3::new_checked().ok_or(NotSupported { + isa, + reason: "AVX2/FMA unavailable on this CPU", + })?; + Ok(arch.run1(BuildAndErase(erase), query)) + } + #[cfg(target_arch = "x86_64")] + MaxSimIsa::X86_64_V4 => { + let arch = V4::new_checked().ok_or(NotSupported { + isa, + reason: "AVX-512 unavailable on this CPU", + })?; + Ok(arch.run1(BuildAndErase(erase), query)) + } + #[cfg(not(target_arch = "x86_64"))] + MaxSimIsa::X86_64_V3 | MaxSimIsa::X86_64_V4 => Err(NotSupported { + isa, + reason: "x86_64 target only", + }), + #[cfg(target_arch = "aarch64")] + MaxSimIsa::Neon => { + let arch = Neon::new_checked().ok_or(NotSupported { + isa, + reason: "Neon unavailable on this CPU", + })?; + Ok(arch.run1(BuildAndErase(erase), query)) + } + #[cfg(not(target_arch = "aarch64"))] + MaxSimIsa::Neon => Err(NotSupported { + isa, + reason: "aarch64 target only", + }), + MaxSimIsa::Reference => Ok(erase.erase(ReferenceKernel::::new(query))), + } +} diff --git a/diskann-quantization/src/multi_vector/distance/isa.rs b/diskann-quantization/src/multi_vector/distance/isa.rs new file mode 100644 index 000000000..49768bc48 --- /dev/null +++ b/diskann-quantization/src/multi_vector/distance/isa.rs @@ -0,0 +1,62 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +//! Instruction Set Architecture (ISA) selector for the multi-vector MaxSim +//! factory. + +/// Instruction Set Architecture (ISA) selector for which multi-vector MaxSim +/// kernel to build. +/// +/// `#[non_exhaustive]` so adding a variant (e.g. for a new in-tree kernel) is +/// not a breaking change. 
Deliberately **not** `Serialize`/`Deserialize` — +/// callers wanting JSON support maintain their own shadow enum and convert +/// via `From` / `TryFrom`, so the library is not pinned to a particular +/// serialization format. +#[non_exhaustive] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[allow(non_camel_case_types)] +pub enum MaxSimIsa { + /// Pick the highest ISA the host CPU supports. + Auto, + /// Pure-scalar (emulated SIMD) kernel — always available. + Scalar, + /// x86_64 AVX2 + FMA. + X86_64_V3, + /// x86_64 AVX-512. + X86_64_V4, + /// AArch64 Neon. + Neon, + /// Non-SIMD reference fallback. Slow; serves as a correctness baseline. + Reference, +} + +impl std::fmt::Display for MaxSimIsa { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let s = match self { + Self::Auto => "auto", + Self::Scalar => "scalar", + Self::X86_64_V3 => "x86-64-v3", + Self::X86_64_V4 => "x86-64-v4", + Self::Neon => "neon", + Self::Reference => "reference", + }; + f.write_str(s) + } +} + +/// Returned by `build_max_sim_*` when the requested ISA cannot be produced on +/// the current host (e.g. x86_64 V4 requested on a non-AVX512 CPU, or Neon +/// requested on x86_64). +#[derive(Debug, Clone, Copy)] +pub struct NotSupported { + pub isa: MaxSimIsa, + pub reason: &'static str, +} + +impl std::fmt::Display for NotSupported { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{} not supported: {}", self.isa, self.reason) + } +} + +impl std::error::Error for NotSupported {} diff --git a/diskann-quantization/src/multi_vector/distance/kernel.rs b/diskann-quantization/src/multi_vector/distance/kernel.rs new file mode 100644 index 000000000..a2fd530d9 --- /dev/null +++ b/diskann-quantization/src/multi_vector/distance/kernel.rs @@ -0,0 +1,53 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +//! Object-safe kernel boundary trait plus BYOTE visitor trait. 
+ +use crate::multi_vector::{MatRef, Standard}; + +/// Object-safe interface for computing per-query MaxSim scores. +/// +/// # Contract +/// +/// - `scores.len() == self.nrows()` (caller's precondition). +/// - The implementation must populate **all** `nrows()` entries of `scores`. +/// Callers that derive quantities from the full score vector (e.g. sums) +/// would silently corrupt their result if any trailing entry were left +/// unwritten. +pub trait MaxSimKernel: Send + Sync + std::fmt::Debug { + /// Number of query rows whose scores this kernel produces. + fn nrows(&self) -> usize; + + /// Compute per-query MaxSim scores against `doc` into `scores`. + fn compute_max_sim(&self, doc: MatRef<'_, Standard>, scores: &mut [f32]); +} + +/// "Bring your own type erasure" visitor. The factory hands an implementation +/// to `erase`, which decides how to package / type-erase it. Lets different +/// callers produce different output shapes (e.g. `Box>`, +/// a chamfer-only closure, a batched evaluator, ...) from the same factory. +/// +/// See [`BoxErase`] for the default impl used by most callers. +pub trait Erase { + /// What the visitor produces. + type Output; + /// Visit the concrete kernel. `K` is generic so the body sees its concrete + /// type and the compiler can inline it into the wrapper. + fn erase + 'static>(self, kernel: K) -> Self::Output; +} + +/// Default [`Erase`] impl: produces `Box>`. +/// +/// Use this when the caller just wants a heap-allocated kernel object behind +/// a vtable. For custom packaging (chamfer-only, batched, composed), write +/// your own `Erase` impl and pass it to the factory in place of `BoxErase`. 
+#[derive(Debug, Clone, Copy)] +pub struct BoxErase; + +impl Erase for BoxErase { + type Output = Box>; + + fn erase + 'static>(self, kernel: K) -> Self::Output { + Box::new(kernel) + } +} diff --git a/diskann-quantization/src/multi_vector/distance/kernels/mod.rs b/diskann-quantization/src/multi_vector/distance/kernels/mod.rs index bd9121a24..55108698d 100644 --- a/diskann-quantization/src/multi_vector/distance/kernels/mod.rs +++ b/diskann-quantization/src/multi_vector/distance/kernels/mod.rs @@ -3,9 +3,8 @@ //! Block-transposed SIMD kernels for multi-vector distance computation. //! -//! This module provides a SIMD-accelerated implementation that uses block-transposed -//! memory layout for **query** vectors (instead of documents), with documents remaining -//! in row-major format. +//! SIMD-accelerated implementation that uses block-transposed memory layout +//! for **query** vectors, with documents remaining in row-major format. //! //! # Memory Layout //! diff --git a/diskann-quantization/src/multi_vector/distance/mod.rs b/diskann-quantization/src/multi_vector/distance/mod.rs index 853f60753..9afb070c5 100644 --- a/diskann-quantization/src/multi_vector/distance/mod.rs +++ b/diskann-quantization/src/multi_vector/distance/mod.rs @@ -5,15 +5,15 @@ //! //! Provides asymmetric distance primitives for multi-vector search: //! -//! - [`MaxSim`]: Per-query-vector maximum similarities. -//! - [`Chamfer`]: Sum of MaxSim scores (asymmetric Chamfer distance). -//! - [`QueryComputer`]: Architecture-dispatched query computer backed by -//! SIMD-accelerated block-transposed kernels. +//! - [`MaxSim`]: per-query-vector maximum similarities. +//! - [`Chamfer`]: sum of MaxSim scores (asymmetric Chamfer distance). +//! - [`MaxSimKernel`]: object-safe interface implemented by every concrete +//! kernel constructed through [`build_max_sim_f32`] / [`build_max_sim_f16`]. +//! - [`Erase`]: BYOTE visitor — caller decides how to type-erase the kernel. //! //! 
The fallback path uses a double-loop kernel over -//! [`InnerProduct`](diskann_vector::distance::InnerProduct). The optimised -//! path (via [`QueryComputer`]) uses block-transposed layout with -//! cache-tiled SIMD micro-kernels. +//! [`InnerProduct`](diskann_vector::distance::InnerProduct). The factory +//! functions return cache-tiled SIMD kernels selected by [`MaxSimIsa`]. //! //! # Example //! @@ -49,11 +49,15 @@ //! // scores[1] = 0.0 (query[1] has no good match: max IP was 0) //! ``` +mod factory; mod fallback; +mod isa; +mod kernel; mod kernels; mod max_sim; -mod query_computer; +pub use factory::{build_max_sim_f16, build_max_sim_f32}; pub use fallback::QueryMatRef; +pub use isa::{MaxSimIsa, NotSupported}; +pub use kernel::{BoxErase, Erase, MaxSimKernel}; pub use max_sim::{Chamfer, MaxSim, MaxSimError}; -pub use query_computer::QueryComputer; diff --git a/diskann-quantization/src/multi_vector/distance/query_computer/f16.rs b/diskann-quantization/src/multi_vector/distance/query_computer/f16.rs deleted file mode 100644 index 9bb348a6a..000000000 --- a/diskann-quantization/src/multi_vector/distance/query_computer/f16.rs +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -use diskann_wide::Architecture; -use diskann_wide::arch::Scalar; -#[cfg(target_arch = "aarch64")] -use diskann_wide::arch::aarch64::Neon; -#[cfg(target_arch = "x86_64")] -use diskann_wide::arch::x86_64::{V3, V4}; - -use super::{DynQueryComputer, Prepared, QueryComputer, build_prepared}; -use crate::multi_vector::distance::kernels::f16::F16Entry; -use crate::multi_vector::{BlockTransposed, BlockTransposedRef, MatRef, Standard}; -use diskann_utils::Reborrow; - -impl QueryComputer { - /// Build an f16 query computer, selecting the optimal architecture and - /// GROUP for the current CPU at runtime. 
- pub fn new(query: MatRef<'_, Standard>) -> Self { - diskann_wide::arch::dispatch1_no_features(BuildComputer, query) - } -} - -impl DynQueryComputer - for Prepared> -where - A: Architecture, - F16Entry: for<'a> diskann_wide::arch::Target3< - A, - (), - BlockTransposedRef<'a, half::f16, GROUP>, - MatRef<'a, Standard>, - &'a mut [f32], - >, -{ - fn compute_max_sim(&self, doc: MatRef<'_, Standard>, scores: &mut [f32]) { - let mut scratch = vec![f32::MIN; self.prepared.padded_nrows()]; - self.arch.run3( - F16Entry::, - self.prepared.reborrow(), - doc, - &mut scratch, - ); - for (dst, &src) in scores.iter_mut().zip(&scratch[..self.prepared.nrows()]) { - *dst = -src; - } - } - - fn nrows(&self) -> usize { - self.prepared.nrows() - } -} - -#[derive(Debug, Clone, Copy)] -pub(super) struct BuildComputer; - -impl diskann_wide::arch::Target1, MatRef<'_, Standard>> - for BuildComputer -{ - fn run(self, arch: Scalar, query: MatRef<'_, Standard>) -> QueryComputer { - QueryComputer { - inner: Box::new(build_prepared::(arch, query)), - } - } -} - -#[cfg(target_arch = "x86_64")] -impl diskann_wide::arch::Target1, MatRef<'_, Standard>> - for BuildComputer -{ - fn run(self, arch: V3, query: MatRef<'_, Standard>) -> QueryComputer { - QueryComputer { - inner: Box::new(build_prepared::(arch, query)), - } - } -} - -#[cfg(target_arch = "x86_64")] -impl diskann_wide::arch::Target1, MatRef<'_, Standard>> - for BuildComputer -{ - fn run(self, arch: V4, query: MatRef<'_, Standard>) -> QueryComputer { - let arch = arch.retarget(); - QueryComputer { - inner: Box::new(build_prepared::(arch, query)), - } - } -} - -#[cfg(target_arch = "aarch64")] -impl diskann_wide::arch::Target1, MatRef<'_, Standard>> - for BuildComputer -{ - fn run(self, arch: Neon, query: MatRef<'_, Standard>) -> QueryComputer { - let arch = arch.retarget(); - QueryComputer { - inner: Box::new(build_prepared::(arch, query)), - } - } -} diff --git a/diskann-quantization/src/multi_vector/distance/query_computer/f32.rs 
b/diskann-quantization/src/multi_vector/distance/query_computer/f32.rs deleted file mode 100644 index 9ff16b8b4..000000000 --- a/diskann-quantization/src/multi_vector/distance/query_computer/f32.rs +++ /dev/null @@ -1,101 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -use diskann_wide::Architecture; -use diskann_wide::arch::Scalar; -#[cfg(target_arch = "aarch64")] -use diskann_wide::arch::aarch64::Neon; -#[cfg(target_arch = "x86_64")] -use diskann_wide::arch::x86_64::{V3, V4}; - -use super::{DynQueryComputer, Prepared, QueryComputer, build_prepared}; -use crate::multi_vector::distance::kernels::f32::F32Kernel; -use crate::multi_vector::{BlockTransposed, BlockTransposedRef, MatRef, Standard}; -use diskann_utils::Reborrow; - -impl QueryComputer { - /// Build an f32 query computer, selecting the optimal architecture and - /// GROUP for the current CPU at runtime. - pub fn new(query: MatRef<'_, Standard>) -> Self { - diskann_wide::arch::dispatch1_no_features(BuildComputer, query) - } -} - -impl DynQueryComputer for Prepared> -where - A: Architecture, - F32Kernel: for<'a> diskann_wide::arch::Target3< - A, - (), - BlockTransposedRef<'a, f32, GROUP>, - MatRef<'a, Standard>, - &'a mut [f32], - >, -{ - fn compute_max_sim(&self, doc: MatRef<'_, Standard>, scores: &mut [f32]) { - let mut scratch = vec![f32::MIN; self.prepared.padded_nrows()]; - self.arch.run3( - F32Kernel::, - self.prepared.reborrow(), - doc, - &mut scratch, - ); - for (dst, &src) in scores.iter_mut().zip(&scratch[..self.prepared.nrows()]) { - *dst = -src; - } - } - - fn nrows(&self) -> usize { - self.prepared.nrows() - } -} - -#[derive(Debug, Clone, Copy)] -pub(super) struct BuildComputer; - -impl diskann_wide::arch::Target1, MatRef<'_, Standard>> - for BuildComputer -{ - fn run(self, arch: Scalar, query: MatRef<'_, Standard>) -> QueryComputer { - QueryComputer { - inner: Box::new(build_prepared::(arch, query)), - } - } -} - -#[cfg(target_arch = 
"x86_64")] -impl diskann_wide::arch::Target1, MatRef<'_, Standard>> - for BuildComputer -{ - fn run(self, arch: V3, query: MatRef<'_, Standard>) -> QueryComputer { - QueryComputer { - inner: Box::new(build_prepared::(arch, query)), - } - } -} - -#[cfg(target_arch = "x86_64")] -impl diskann_wide::arch::Target1, MatRef<'_, Standard>> - for BuildComputer -{ - fn run(self, arch: V4, query: MatRef<'_, Standard>) -> QueryComputer { - // V4 delegates to V3 — the V3 micro-kernel is valid on V4 hardware. - let arch = arch.retarget(); - QueryComputer { - inner: Box::new(build_prepared::(arch, query)), - } - } -} - -#[cfg(target_arch = "aarch64")] -impl diskann_wide::arch::Target1, MatRef<'_, Standard>> - for BuildComputer -{ - fn run(self, arch: Neon, query: MatRef<'_, Standard>) -> QueryComputer { - // Neon delegates to Scalar. - let arch = arch.retarget(); - QueryComputer { - inner: Box::new(build_prepared::(arch, query)), - } - } -} diff --git a/diskann-quantization/src/multi_vector/distance/query_computer/mod.rs b/diskann-quantization/src/multi_vector/distance/query_computer/mod.rs deleted file mode 100644 index fbe84fcd3..000000000 --- a/diskann-quantization/src/multi_vector/distance/query_computer/mod.rs +++ /dev/null @@ -1,290 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -//! Architecture-opaque query computer with runtime dispatch. -//! -//! [`QueryComputer`] wraps a block-transposed query and a captured -//! architecture token behind a trait-object vtable. CPU detection happens -//! once at construction; every subsequent distance call goes through -//! [`Architecture::run3`](diskann_wide::Architecture::run3) with full -//! `#[target_feature]` propagation — no re-dispatch and no enum matching -//! on the hot path. -//! -//! # Usage -//! -//! ``` -//! use diskann_quantization::multi_vector::{ -//! QueryComputer, MatRef, Standard, -//! }; -//! -//! let query_data = [1.0f32, 0.0, 0.0, 1.0]; -//! 
let doc_data = [1.0f32, 0.0, 0.0, 1.0]; -//! -//! let query = MatRef::new(Standard::new(2, 2).unwrap(), &query_data).unwrap(); -//! let doc = MatRef::new(Standard::new(2, 2).unwrap(), &doc_data).unwrap(); -//! -//! // Build — runtime detects arch, picks optimal GROUP, captures both -//! let computer = QueryComputer::::new(query); -//! -//! // Distance — vtable → arch.run3 with target_feature propagation -//! let dist = computer.chamfer(doc); -//! assert_eq!(dist, -2.0); -//! ``` - -mod f16; -mod f32; - -use crate::multi_vector::{BlockTransposed, MatRef, Standard}; - -/// Architecture-dispatched query computer for multi-vector distance. -#[derive(Debug)] -pub struct QueryComputer { - inner: Box>, -} - -impl QueryComputer { - /// Number of logical (non-padded) query vectors. - #[inline] - pub fn nrows(&self) -> usize { - self.inner.nrows() - } - - /// Compute Chamfer distance (sum of per-query max similarities, negated). - /// - /// Returns `0.0` if the document has zero vectors. - pub fn chamfer(&self, doc: MatRef<'_, Standard>) -> f32 { - let nq = self.nrows(); - if doc.num_vectors() == 0 { - return 0.0; - } - let mut scores = vec![0.0f32; nq]; - self.max_sim(doc, &mut scores); - scores.iter().sum() - } - - /// Compute per-query-vector max similarities into `scores`. - /// - /// `scores` must have length equal to [`nrows()`](Self::nrows). - /// Each entry is the negated max inner product for that query vector. - /// - /// # Panics - /// - /// Panics if `scores.len() != self.nrows()`. 
- pub fn max_sim(&self, doc: MatRef<'_, Standard>, scores: &mut [f32]) { - let nq = self.nrows(); - assert_eq!( - scores.len(), - nq, - "scores buffer not right size: {} != {}", - scores.len(), - nq - ); - - if doc.num_vectors() == 0 { - return; - } - - self.inner.compute_max_sim(doc, scores); - } -} - -trait DynQueryComputer: std::fmt::Debug + Send + Sync { - fn compute_max_sim(&self, doc: MatRef<'_, Standard>, scores: &mut [f32]); - fn nrows(&self) -> usize; -} - -#[derive(Debug)] -struct Prepared { - arch: A, - prepared: Q, -} - -fn build_prepared( - arch: A, - query: MatRef<'_, Standard>, -) -> Prepared> { - let prepared = BlockTransposed::::from_matrix_view(query.as_matrix_view()); - Prepared { arch, prepared } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::multi_vector::{Chamfer, MaxSim, QueryMatRef}; - use diskann_vector::distance::InnerProduct; - use diskann_vector::{DistanceFunctionMut, PureDistanceFunction}; - - trait FromF32 { - fn from_f32(v: f32) -> Self; - } - - impl FromF32 for f32 { - fn from_f32(v: f32) -> Self { - v - } - } - - impl FromF32 for half::f16 { - fn from_f32(v: f32) -> Self { - diskann_wide::cast_f32_to_f16(v) - } - } - - fn make_mat(data: &[T], nrows: usize, ncols: usize) -> MatRef<'_, Standard> { - MatRef::new(Standard::new(nrows, ncols).unwrap(), data).unwrap() - } - - fn make_test_data(len: usize, ceil: usize, shift: usize) -> Vec { - (0..len) - .map(|v| T::from_f32(((v + shift) % ceil) as f32)) - .collect() - } - - /// Shapes for the `chamfer_matches_fallback` / `max_sim_matches_fallback` - /// agreement checks: (num_queries, num_docs, dim). - /// - /// This matrix targets the API-layer wiring that lives above the - /// kernel — `QueryComputer::new` query setup, `chamfer` row - /// summation, `max_sim` per-row writeback, and the f16 query - /// conversion path — not kernel correctness. 
A small - /// representative set is sufficient because exhaustive shape - /// coverage (panel boundaries, B-remainder classes, prime `k`, - /// degenerate dims) is pinned one layer below in - /// `kernels::tiled_reduce::tests::NAIVE_CASES`, and structural - /// loop-path coverage in `tiled_reduce_all_loop_paths_match_naive`. - const TEST_CASES: &[(usize, usize, usize)] = &[ - (1, 1, 4), // Degenerate - (5, 3, 5), // Prime k; nq > 1 and nd > 1 exercise chamfer summation - // and per-row max_sim writeback on a non-trivial shape - (17, 4, 64), // A-panel remainder crossing both Scalar and V3 panel widths - (16, 6, 32), // B-remainder ≠ 1 (V3 b_remainder = 2) - ]; - - fn check_chamfer_matches( - build: fn(MatRef<'_, Standard>) -> QueryComputer, - tol: f32, - label: &str, - ) where - InnerProduct: for<'a, 'b> PureDistanceFunction<&'a [T], &'b [T], f32>, - { - for &(nq, nd, dim) in TEST_CASES { - let query_data = make_test_data::(nq * dim, dim, dim / 2); - let doc_data = make_test_data::(nd * dim, dim, dim); - - let query = make_mat(&query_data, nq, dim); - let doc = make_mat(&doc_data, nd, dim); - - let expected = Chamfer::evaluate(QueryMatRef::from(query), doc); - let actual = build(query).chamfer(doc); - - assert!( - (actual - expected).abs() < tol, - "{label}Chamfer mismatch for ({nq},{nd},{dim}): actual={actual}, expected={expected}", - ); - } - } - - fn check_max_sim_matches( - build: fn(MatRef<'_, Standard>) -> QueryComputer, - tol: f32, - label: &str, - ) where - InnerProduct: for<'a, 'b> PureDistanceFunction<&'a [T], &'b [T], f32>, - { - for &(nq, nd, dim) in TEST_CASES { - let query_data = make_test_data::(nq * dim, dim, dim / 2); - let doc_data = make_test_data::(nd * dim, dim, dim); - - let query = make_mat(&query_data, nq, dim); - let doc = make_mat(&doc_data, nd, dim); - - let mut expected_scores = vec![0.0f32; nq]; - let _ = MaxSim::new(&mut expected_scores) - .unwrap() - .evaluate(QueryMatRef::from(query), doc); - - let computer = build(query); - let mut 
actual_scores = vec![0.0f32; nq]; - computer.max_sim(doc, &mut actual_scores); - - for i in 0..nq { - assert!( - (actual_scores[i] - expected_scores[i]).abs() < tol, - "{label}MaxSim[{i}] mismatch for ({nq},{nd},{dim}): actual={}, expected={}", - actual_scores[i], - expected_scores[i], - ); - } - } - } - - #[test] - fn query_computer_dimensions() { - let data = vec![1.0f32; 5 * 8]; - let query = make_mat(&data, 5, 8); - let computer = QueryComputer::::new(query); - - assert_eq!(computer.nrows(), 5); - } - - #[test] - fn query_computer_f16_dimensions() { - let data = vec![diskann_wide::cast_f32_to_f16(1.0); 5 * 8]; - let query = make_mat(data.as_slice(), 5, 8); - let computer = QueryComputer::::new(query); - - assert_eq!(computer.nrows(), 5); - } - - #[test] - fn chamfer_with_zero_docs() { - let query = make_mat(&[1.0f32, 0.0, 0.0, 1.0], 2, 2); - let computer = QueryComputer::::new(query); - let doc = make_mat(&[], 0, 2); - assert_eq!(computer.chamfer(doc), 0.0); - } - - #[test] - fn max_sim_with_zero_docs() { - let query = make_mat(&[1.0f32, 0.0, 0.0, 1.0], 2, 2); - let computer = QueryComputer::::new(query); - let doc = make_mat::(&[], 0, 2); - let mut scores = vec![0.0f32; 2]; - computer.max_sim(doc, &mut scores); - // With zero docs the scores buffer is left untouched. - for &s in &scores { - assert_eq!(s, 0.0, "zero-doc MaxSim should leave scores untouched"); - } - } - - #[test] - #[should_panic(expected = "scores buffer not right size")] - fn max_sim_panics_on_size_mismatch() { - let query = make_mat(&[1.0f32, 2.0, 3.0, 4.0], 2, 2); - let computer = QueryComputer::::new(query); - let doc = make_mat(&[1.0, 1.0], 1, 2); - let mut scores = vec![0.0f32; 3]; // Wrong size - computer.max_sim(doc, &mut scores); - } - - macro_rules! 
test_matches_fallback { - ($mod_name:ident, $ty:ty, $tol:expr, $label:literal) => { - mod $mod_name { - use super::*; - - #[test] - fn chamfer_matches_fallback() { - check_chamfer_matches(QueryComputer::<$ty>::new, $tol, $label); - } - - #[test] - fn max_sim_matches_fallback() { - check_max_sim_matches(QueryComputer::<$ty>::new, $tol, $label); - } - } - }; - } - - test_matches_fallback!(f32, f32, 1e-10, "f32 "); - test_matches_fallback!(f16, half::f16, 1e-10, "f16 "); -} diff --git a/diskann-quantization/src/multi_vector/matrix.rs b/diskann-quantization/src/multi_vector/matrix.rs index 70629d44c..31c430995 100644 --- a/diskann-quantization/src/multi_vector/matrix.rs +++ b/diskann-quantization/src/multi_vector/matrix.rs @@ -712,6 +712,13 @@ impl Clone for Mat { } impl Mat> { + /// Construct a [`Mat`] by calling `f` once per element in row-major order. + pub fn from_fn T>(repr: Standard, mut f: F) -> Self { + let b: Box<[T]> = (0..repr.num_elements()).map(|_| f()).collect(); + // SAFETY: `b` has length `repr.num_elements()` by construction. + unsafe { repr.box_to_mat(b) } + } + /// Returns the raw dimension (columns) of the vectors in the matrix. #[inline] pub fn vector_dim(&self) -> usize { diff --git a/diskann-quantization/src/multi_vector/mod.rs b/diskann-quantization/src/multi_vector/mod.rs index 3670b1aaf..d2ad0e7bc 100644 --- a/diskann-quantization/src/multi_vector/mod.rs +++ b/diskann-quantization/src/multi_vector/mod.rs @@ -20,9 +20,11 @@ //! | [`BlockTransposedRef`] | Immutable view of a block-transposed matrix | //! | [`BlockTransposedMut`] | Mutable view of a block-transposed matrix | //! | [`QueryMatRef`] | Query wrapper for asymmetric distances | -//! | [`QueryComputer`] | Architecture-dispatched SIMD query computer | //! | [`MaxSim`] | Per-query-vector max similarity computation | //! | [`Chamfer`] | Asymmetric Chamfer distance (sum of MaxSim) | +//! | [`MaxSimKernel`] | Object-safe kernel returned by [`build_max_sim_f32`] / [`build_max_sim_f16`] | +//! 
| [`MaxSimIsa`] | ISA selector for the factory functions | +//! | [`Erase`] | BYOTE visitor used by the factory | //! //! # Example //! @@ -72,7 +74,10 @@ pub mod distance; pub(crate) mod matrix; pub use block_transposed::{BlockTransposed, BlockTransposedMut, BlockTransposedRef}; -pub use distance::{Chamfer, MaxSim, MaxSimError, QueryComputer, QueryMatRef}; +pub use distance::{ + BoxErase, Chamfer, Erase, MaxSim, MaxSimError, MaxSimIsa, MaxSimKernel, NotSupported, + QueryMatRef, build_max_sim_f16, build_max_sim_f32, +}; pub use matrix::{ Defaulted, LayoutError, Mat, MatMut, MatRef, NewCloned, NewMut, NewOwned, NewRef, Overflow, Repr, ReprMut, ReprOwned, SliceError, Standard, From 597330a1daf273f59fff6b55f3e0d79e110b47bd Mon Sep 17 00:00:00 2001 From: Suryansh Gupta Date: Tue, 19 May 2026 23:01:58 +0530 Subject: [PATCH 12/13] Address review comments --- .../src/backend/multi_vector/README.md | 43 -- .../src/backend/multi_vector/driver.rs | 18 +- .../src/backend/multi_vector/kernels.rs | 286 ++++++------- .../src/backend/multi_vector/mod.rs | 108 ++++- .../src/multi_vector/distance/factory.rs | 377 +++++++++++++----- .../src/multi_vector/distance/isa.rs | 6 +- .../src/multi_vector/distance/mod.rs | 8 +- diskann-quantization/src/multi_vector/mod.rs | 7 +- 8 files changed, 505 insertions(+), 348 deletions(-) delete mode 100644 diskann-benchmark/src/backend/multi_vector/README.md diff --git a/diskann-benchmark/src/backend/multi_vector/README.md b/diskann-benchmark/src/backend/multi_vector/README.md deleted file mode 100644 index f24d38ff7..000000000 --- a/diskann-benchmark/src/backend/multi_vector/README.md +++ /dev/null @@ -1,43 +0,0 @@ -# Multi-vector benchmark — kernel-author workflow - -The multi-vector benchmark dispatches through `diskann-quantization`'s -`build_max_sim_f32` / `build_max_sim_f16` factory. Selection is driven by a -non-exhaustive `MaxSimIsa` enum. To add a new in-tree experimental kernel, -extend the enum + factory + the benchmark's shadow enum. 
- -## Steps - -1. **Library: variant + factory arm.** In - `diskann-quantization::multi_vector::distance`: - - Add a new variant to `MaxSimIsa` (in `isa.rs`). - - Implement `MaxSimKernel` for your kernel struct (in `factory.rs`, - next to `Prepared` and `ReferenceKernel`). - - Add a matching arm to `build_max_sim_f32` and/or `build_max_sim_f16` - that constructs your kernel and hands it to `erase.erase(...)`. - -2. **Benchmark: matching shadow variant.** In - `diskann-benchmark::inputs::multi_vector`: - - Add the same variant to `BenchIsa`. - - Add the matching arm to `From for MaxSimIsa`. - -3. **Run.** Set `"isa": "your-variant"` in the JSON job; the existing - `KernelF32` / `KernelF16` benchmark entries handle the rest. No new - `Benchmark` registration required. - -## Why two enums? - -`MaxSimIsa` (library) and `BenchIsa` (benchmark) are kept separate so the -library doesn't pin its public API on a serde version or a particular JSON -shape. The benchmark owns its kebab-case JSON layout; the library is -serde-agnostic. Mirroring variant-for-variant is intentional — small price -for keeping the library boundary clean. - -## Background - -The factory follows the BYOTE ("Bring your own type erasure") pattern -described in [RFC #1068]. If you want your kernel packaged as something -other than `Box>` (e.g. composed with chamfer summing, -or wrapped in a custom thin trait), implement your own `Erase` and pass -it to the factory in place of `BoxErase`. 
- -[RFC #1068]: https://github.com/microsoft/DiskANN/pull/1068 diff --git a/diskann-benchmark/src/backend/multi_vector/driver.rs b/diskann-benchmark/src/backend/multi_vector/driver.rs index c9ac8b488..e59f24ac2 100644 --- a/diskann-benchmark/src/backend/multi_vector/driver.rs +++ b/diskann-benchmark/src/backend/multi_vector/driver.rs @@ -27,6 +27,7 @@ use rand::{ use serde::{Deserialize, Serialize}; use crate::inputs::multi_vector::Run; +use crate::utils::DisplayWrapper; ////////////////////// // Tolerance // @@ -137,10 +138,7 @@ impl Distance for BoxedKernel { // Timing harness // ////////////////////// -fn run_loops(run: &Run, mut body: F) -> RunResult -where - F: FnMut(), -{ +fn run_loops(run: &Run, body: &mut dyn FnMut()) -> RunResult { let mut latencies = Vec::with_capacity(run.num_measurements.get()); for _ in 0..run.num_measurements.get() { @@ -168,7 +166,7 @@ pub(super) fn run_with_distance( dist: &dyn Distance, ) -> RunResult { let mut scores = vec![0.0f32; run.num_query_vectors.get()]; - run_loops(run, || { + run_loops(run, &mut || { dist.max_sim(doc, &mut scores); std::hint::black_box(&mut scores); }) @@ -178,16 +176,6 @@ pub(super) fn run_with_distance( // Result types // ////////////////////// -#[derive(Debug, Clone, Copy)] -pub(super) struct DisplayWrapper<'a, T: ?Sized>(pub(super) &'a T); - -impl std::ops::Deref for DisplayWrapper<'_, T> { - type Target = T; - fn deref(&self) -> &T { - self.0 - } -} - #[derive(Debug, Serialize, Deserialize)] pub(super) struct RunResult { /// The configuration for this run. diff --git a/diskann-benchmark/src/backend/multi_vector/kernels.rs b/diskann-benchmark/src/backend/multi_vector/kernels.rs index c9359b705..cc4e63b89 100644 --- a/diskann-benchmark/src/backend/multi_vector/kernels.rs +++ b/diskann-benchmark/src/backend/multi_vector/kernels.rs @@ -5,216 +5,158 @@ //! `Benchmark` impls for the multi-vector MaxSim factory. //! -//! One entry per element type. Each `try_match` checks `element_type` only; -//! 
the `isa` field is passed to the library factory at run time. ISA -//! unavailability surfaces as `NotSupported`, which becomes a job-level -//! error. +//! A single generic [`Kernel`] carrier supplies the `Benchmark` and +//! `Regression` impls for every element type accepted by the library's +//! [`MaxSimElement`] sealed trait. Each `try_match` checks `element_type` +//! only; the JSON `isa` field is passed to the library factory at run time, +//! and arch unavailability surfaces as a job-level error via +//! [`NotSupported`](diskann_quantization::multi_vector::NotSupported). use std::io::Write; +use std::marker::PhantomData; use diskann_benchmark_runner::{ benchmark::{FailureScore, MatchScore, PassFail, Regression}, utils::{datatype::AsDataType, num::relative_change}, Benchmark, Checkpoint, Output, Registry, }; -use diskann_quantization::multi_vector::{ - build_max_sim_f16, build_max_sim_f32, BoxErase, MaxSimKernel, -}; +use diskann_quantization::multi_vector::{build_max_sim, BoxErase, MaxSimElement}; use rand::distr::{Distribution, StandardUniform}; use super::driver::{ - run_with_distance, BoxedKernel, CheckResult, Comparison, Data, DisplayWrapper, - MultiVectorTolerance, RunResult, + run_with_distance, BoxedKernel, CheckResult, Comparison, Data, MultiVectorTolerance, RunResult, }; use crate::inputs::multi_vector::MultiVectorOp; +use crate::utils::DisplayWrapper; // ───────────────────────────────────────────────────────────────────────── -// Per-element-type `Benchmark` carriers. +// Kernel — generic carrier registered once per element type. // ───────────────────────────────────────────────────────────────────────── #[derive(Debug)] -pub(super) struct KernelF32; - -#[derive(Debug)] -pub(super) struct KernelF16; - -/// Per-element-type bridge: factory entry name + factory call. -/// -/// Data-type matching (`DATA_TYPE`, `is_match`, `describe`) comes from the -/// framework's [`AsDataType`] trait, which is already implemented for `f32`, -/// `half::f16`, etc. 
-trait ElementType: AsDataType + Copy { - const ENTRY_NAME: &'static str; - fn build( - isa: diskann_quantization::multi_vector::MaxSimIsa, - query: diskann_quantization::multi_vector::MatRef< - '_, - diskann_quantization::multi_vector::Standard, - >, - ) -> Result>, diskann_quantization::multi_vector::NotSupported>; -} +pub(super) struct Kernel(PhantomData); -impl ElementType for f32 { - const ENTRY_NAME: &'static str = "multi-vector-op-f32"; - fn build( - isa: diskann_quantization::multi_vector::MaxSimIsa, - query: diskann_quantization::multi_vector::MatRef< - '_, - diskann_quantization::multi_vector::Standard, - >, - ) -> Result>, diskann_quantization::multi_vector::NotSupported> { - build_max_sim_f32(isa, query, BoxErase) +impl Kernel { + pub(super) const fn new() -> Self { + Self(PhantomData) } } -impl ElementType for half::f16 { - const ENTRY_NAME: &'static str = "multi-vector-op-f16"; - fn build( - isa: diskann_quantization::multi_vector::MaxSimIsa, - query: diskann_quantization::multi_vector::MatRef< - '_, - diskann_quantization::multi_vector::Standard, - >, - ) -> Result>, diskann_quantization::multi_vector::NotSupported> - { - build_max_sim_f16(isa, query, BoxErase) - } -} - -fn run_benchmark(input: &MultiVectorOp) -> anyhow::Result> +impl Benchmark for Kernel where + T: MaxSimElement + AsDataType, StandardUniform: Distribution, { - let mut results = Vec::with_capacity(input.runs.len()); - for run in input.runs.iter() { - let data = Data::::new(run); - let kernel = T::build(input.isa.into(), data.queries.as_view())?; - let dist = BoxedKernel(kernel); - results.push(run_with_distance(run, data.docs.as_view(), &dist)); - } - Ok(results) -} - -// ───────────────────────────────────────────────────────────────────────── -// Benchmark + Regression impls. -// ───────────────────────────────────────────────────────────────────────── + type Input = MultiVectorOp; + type Output = Vec; -macro_rules! 
impl_benchmark { - ($ty:ident, $T:ty) => { - impl Benchmark for $ty - where - StandardUniform: Distribution<$T>, - { - type Input = MultiVectorOp; - type Output = Vec; - - fn try_match(&self, from: &MultiVectorOp) -> Result { - crate::utils::match_data_type::<$T>(from.element_type) - } - - fn run( - &self, - input: &MultiVectorOp, - _: Checkpoint<'_>, - mut output: &mut dyn Output, - ) -> anyhow::Result { - writeln!(output, "{}", input)?; - let results = run_benchmark::<$T>(input)?; - writeln!(output, "\n\n{}", DisplayWrapper(&*results))?; - Ok(results) - } + fn try_match(&self, from: &MultiVectorOp) -> Result { + crate::utils::match_data_type::(from.element_type) + } - fn description( - &self, - f: &mut std::fmt::Formatter<'_>, - input: Option<&MultiVectorOp>, - ) -> std::fmt::Result { - match input { - None => writeln!(f, "- Element Type: {}", <$T as AsDataType>::DATA_TYPE)?, - Some(input) => { - let desc = <$T as AsDataType>::describe(input.element_type); - if !desc.is_match() { - writeln!(f, "\n - Mismatched element type: {}", desc)?; - } - } - } - Ok(()) - } + fn run( + &self, + input: &MultiVectorOp, + _: Checkpoint<'_>, + mut output: &mut dyn Output, + ) -> anyhow::Result { + writeln!(output, "{}", input)?; + let mut results = Vec::with_capacity(input.runs.len()); + for run in input.runs.iter() { + let data = Data::::new(run); + let kernel = build_max_sim::(input.isa.into(), data.queries.as_view(), BoxErase)?; + let dist = BoxedKernel(kernel); + results.push(run_with_distance(run, data.docs.as_view(), &dist)); } + writeln!(output, "\n\n{}", DisplayWrapper(&*results))?; + Ok(results) + } - impl Regression for $ty - where - StandardUniform: Distribution<$T>, - { - type Tolerances = MultiVectorTolerance; - type Pass = CheckResult; - type Fail = CheckResult; - - fn check( - &self, - tolerance: &MultiVectorTolerance, - _input: &MultiVectorOp, - before: &Vec, - after: &Vec, - ) -> anyhow::Result> { - anyhow::ensure!( - before.len() == after.len(), - "before has {} 
runs but after has {}", - before.len(), - after.len(), - ); - - let mut passed = true; - let checks: Vec = std::iter::zip(before.iter(), after.iter()) - .enumerate() - .map(|(i, (b, a))| { - anyhow::ensure!(b.run == a.run, "run {i} mismatched"); - - let computations_per_latency = b.computations_per_latency() as f64; - let before_min = - b.percentiles.minimum.as_f64() * 1000.0 / computations_per_latency; - let after_min = - a.percentiles.minimum.as_f64() * 1000.0 / computations_per_latency; - - let comparison = Comparison { - run: b.run.clone(), - tolerance: *tolerance, - before_min, - after_min, - }; - - match relative_change(before_min, after_min) { - Ok(change) => { - if change > tolerance.min_time_regression.get() { - passed = false; - } - } - Err(_) => passed = false, - }; - - Ok(comparison) - }) - .collect::>>()?; - - Ok(if passed { - PassFail::Pass(CheckResult { checks }) - } else { - PassFail::Fail(CheckResult { checks }) - }) + fn description( + &self, + f: &mut std::fmt::Formatter<'_>, + input: Option<&MultiVectorOp>, + ) -> std::fmt::Result { + match input { + None => writeln!(f, "- Element Type: {}", ::DATA_TYPE)?, + Some(input) => { + let desc = ::describe(input.element_type); + if !desc.is_match() { + writeln!(f, "\n - Mismatched element type: {}", desc)?; + } } } - }; + Ok(()) + } } -impl_benchmark!(KernelF32, f32); -impl_benchmark!(KernelF16, half::f16); +impl Regression for Kernel +where + T: MaxSimElement + AsDataType, + StandardUniform: Distribution, +{ + type Tolerances = MultiVectorTolerance; + type Pass = CheckResult; + type Fail = CheckResult; + + fn check( + &self, + tolerance: &MultiVectorTolerance, + _input: &MultiVectorOp, + before: &Vec, + after: &Vec, + ) -> anyhow::Result> { + anyhow::ensure!( + before.len() == after.len(), + "before has {} runs but after has {}", + before.len(), + after.len(), + ); + + let mut passed = true; + let checks: Vec = std::iter::zip(before.iter(), after.iter()) + .enumerate() + .map(|(i, (b, a))| { + 
anyhow::ensure!(b.run == a.run, "run {i} mismatched"); + + let computations_per_latency = b.computations_per_latency() as f64; + let before_min = b.percentiles.minimum.as_f64() * 1000.0 / computations_per_latency; + let after_min = a.percentiles.minimum.as_f64() * 1000.0 / computations_per_latency; + + let comparison = Comparison { + run: b.run.clone(), + tolerance: *tolerance, + before_min, + after_min, + }; + + match relative_change(before_min, after_min) { + Ok(change) => { + if change > tolerance.min_time_regression.get() { + passed = false; + } + } + Err(_) => passed = false, + }; + + Ok(comparison) + }) + .collect::>>()?; + + Ok(if passed { + PassFail::Pass(CheckResult { checks }) + } else { + PassFail::Fail(CheckResult { checks }) + }) + } +} // ───────────────────────────────────────────────────────────────────────── // Registration. // ───────────────────────────────────────────────────────────────────────── pub(super) fn register(registry: &mut Registry) -> anyhow::Result<()> { - registry.register_regression(::ENTRY_NAME, KernelF32)?; - registry.register_regression(::ENTRY_NAME, KernelF16)?; + registry.register_regression("multi-vector-op-f32", Kernel::::new())?; + registry.register_regression("multi-vector-op-f16", Kernel::::new())?; Ok(()) } diff --git a/diskann-benchmark/src/backend/multi_vector/mod.rs b/diskann-benchmark/src/backend/multi_vector/mod.rs index c3ffffeaf..2cbb2d9a6 100644 --- a/diskann-benchmark/src/backend/multi_vector/mod.rs +++ b/diskann-benchmark/src/backend/multi_vector/mod.rs @@ -5,15 +5,56 @@ //! Multi-vector MaxSim distance benchmarks with regression detection. //! -//! Registers one `Benchmark` entry per supported element type; the JSON `isa` -//! field selects the kernel at run time via the library's -//! [`build_max_sim_f32`] / [`build_max_sim_f16`] factories. +//! Registers one `Benchmark` entry per supported element type; the JSON +//! `isa` field selects the kernel at run time via the library's +//! 
[`build_max_sim`] factory. The set of accepted element types is gated by +//! the sealed [`MaxSimElement`] trait. //! -//! See [`README.md`](./README.md) for the in-tree workflow when authoring a new -//! experimental kernel. +//! # Adding a new in-tree experimental kernel //! -//! [`build_max_sim_f32`]: diskann_quantization::multi_vector::build_max_sim_f32 -//! [`build_max_sim_f16`]: diskann_quantization::multi_vector::build_max_sim_f16 +//! 1. **Library: variant + dispatch arm.** In +//! `diskann-quantization::multi_vector::distance`: +//! - Add a new variant to [`MaxSimIsa`] (in `isa.rs`). +//! - Implement [`MaxSimKernel`] for your kernel struct (in +//! `factory.rs`, next to `Prepared` and `ReferenceKernel`). +//! - Add a matching arm to the [`MaxSimElement::build`] impl for each +//! element type your kernel supports — the arm constructs your kernel +//! and hands it to `erase.erase(...)`. +//! +//! 2. **Benchmark: matching shadow variant.** In +//! [`crate::inputs::multi_vector`]: +//! - Add the same variant to [`BenchIsa`]. +//! - Add the matching arm to `From for MaxSimIsa`. +//! +//! 3. **Run.** Set `"isa": "your-variant"` in the JSON job; the existing +//! `Kernel` benchmark entries (registered once per element type) +//! handle the rest. No new `Benchmark` registration required. +//! +//! # Why two enums? +//! +//! [`MaxSimIsa`] (library) and [`BenchIsa`] are kept separate so the library +//! doesn't pin its public API on a serde version or a particular JSON +//! shape. The benchmark owns its kebab-case JSON layout; the library is +//! serde-agnostic. Mirroring variant-for-variant is intentional — small +//! price for keeping the library boundary clean. +//! +//! # Background +//! +//! The factory follows the BYOTE ("Bring your own type erasure") pattern +//! described in [RFC #1068]. If you want your kernel packaged as something +//! other than `Box>` (e.g. composed with chamfer +//! summing, or wrapped in a custom thin trait), implement your own +//! 
[`Erase`] and pass it to the factory in place of [`BoxErase`]. +//! +//! [`build_max_sim`]: diskann_quantization::multi_vector::build_max_sim +//! [`MaxSimIsa`]: diskann_quantization::multi_vector::MaxSimIsa +//! [`MaxSimElement`]: diskann_quantization::multi_vector::MaxSimElement +//! [`MaxSimElement::build`]: diskann_quantization::multi_vector::MaxSimElement::build +//! [`MaxSimKernel`]: diskann_quantization::multi_vector::MaxSimKernel +//! [`Erase`]: diskann_quantization::multi_vector::Erase +//! [`BoxErase`]: diskann_quantization::multi_vector::BoxErase +//! [`BenchIsa`]: crate::inputs::multi_vector::BenchIsa +//! [RFC #1068]: https://github.com/microsoft/DiskANN/pull/1068 use diskann_benchmark_runner::Registry; @@ -47,7 +88,7 @@ mod tests { }; use super::driver::{CheckResult, Comparison, MultiVectorTolerance, RunResult}; - use super::kernels::KernelF32; + use super::kernels::Kernel; use crate::inputs::multi_vector::{BenchIsa, MultiVectorOp, Run}; fn tiny_run() -> Run { @@ -86,7 +127,7 @@ mod tests { #[test] fn check_rejects_mismatched_runs() { - let kernel = KernelF32; + let kernel = Kernel::::new(); // Build a result whose `run` diverges from `tiny_run()` so the // regression check's `b.run == a.run` invariant fires. @@ -115,7 +156,7 @@ mod tests { #[test] fn check_allows_negative_relative_change() { - let kernel = KernelF32; + let kernel = Kernel::::new(); let result = kernel .check( @@ -131,7 +172,7 @@ mod tests { #[test] fn check_passes_on_tolerance_boundary() { - let kernel = KernelF32; + let kernel = Kernel::::new(); let result = kernel .check( @@ -147,7 +188,7 @@ mod tests { #[test] fn check_fails_above_tolerance_boundary() { - let kernel = KernelF32; + let kernel = Kernel::::new(); let result = kernel .check( @@ -186,7 +227,7 @@ mod tests { /// require at least a non-zero value. 
#[test] fn zero_values_rejected() { - let kernel = KernelF32; + let kernel = Kernel::::new(); let result = kernel .check( @@ -199,4 +240,45 @@ mod tests { assert!(matches!(result, PassFail::Fail(_))); } + + ////////////////////// + // BoxedKernel // + ////////////////////// + // + // The library's `MaxSimKernel` trait makes no zero-doc / size-assert + // guarantees — those contracts live on the `BoxedKernel` wrapper in + // `driver.rs`. The tests below pin that wrapper's behaviour. + + use super::driver::{BoxedKernel, Distance}; + use diskann_quantization::multi_vector::{ + build_max_sim, BoxErase, MatRef as LibMatRef, MaxSimIsa, Standard as LibStandard, + }; + + fn boxed_kernel_f32_two_rows() -> BoxedKernel { + let data = [1.0f32, 0.0, 0.0, 1.0]; + let query = LibMatRef::new(LibStandard::new(2, 2).unwrap(), data.as_slice()).unwrap(); + BoxedKernel(build_max_sim::(MaxSimIsa::Auto, query, BoxErase).unwrap()) + } + + #[test] + fn boxed_kernel_max_sim_with_zero_docs_leaves_scores_untouched() { + let kernel = boxed_kernel_f32_two_rows(); + let empty: [f32; 0] = []; + let doc = LibMatRef::new(LibStandard::new(0, 2).unwrap(), empty.as_slice()).unwrap(); + let mut scores = vec![0.0f32; 2]; + kernel.max_sim(doc, &mut scores); + for &s in &scores { + assert_eq!(s, 0.0, "zero-doc max_sim should leave scores untouched"); + } + } + + #[test] + #[should_panic(expected = "scores buffer not right size")] + fn boxed_kernel_max_sim_panics_on_size_mismatch() { + let kernel = boxed_kernel_f32_two_rows(); + let doc_data = [1.0f32, 1.0]; + let doc = LibMatRef::new(LibStandard::new(1, 2).unwrap(), doc_data.as_slice()).unwrap(); + let mut scores = vec![0.0f32; 3]; // Wrong size: 3 vs kernel's nrows() = 2. 
+ kernel.max_sim(doc, &mut scores); + } } diff --git a/diskann-quantization/src/multi_vector/distance/factory.rs b/diskann-quantization/src/multi_vector/distance/factory.rs index 78d15273e..0bfe82fc1 100644 --- a/diskann-quantization/src/multi_vector/distance/factory.rs +++ b/diskann-quantization/src/multi_vector/distance/factory.rs @@ -2,8 +2,8 @@ // Licensed under the MIT license. //! Factory + concrete `MaxSimKernel` implementations for the multi-vector -//! distance API. See [`build_max_sim_f32`] / [`build_max_sim_f16`] for the -//! BYOTE entry points. +//! distance API. See [`build_max_sim`] for the BYOTE entry point and +//! [`MaxSimElement`] for the sealed trait that gates accepted element types. use diskann_utils::Reborrow; use diskann_vector::distance::InnerProduct; @@ -101,7 +101,7 @@ where // ───────────────────────────────────────────────────────────────────────── /// `MaxSimIsa::Reference` path. Owns the query as a `Mat>` and -/// delegates to the existing `MaxSim` fallback per `compute_max_sim` call. +/// delegates to [`MaxSim`] per `compute_max_sim` call. struct ReferenceKernel { query: Mat>, } @@ -153,9 +153,10 @@ where // BuildAndErase — Target1 impls used by `dispatch1_no_features` (Auto). // ───────────────────────────────────────────────────────────────────────── -/// Internal Target1 carrier used only by the `MaxSimIsa::Auto` arm of -/// `build_max_sim_*`. `dispatch1_no_features` picks the highest available -/// arch on the host CPU and calls the matching `Target1::run` below. +/// Internal `Target1` carrier used by the `MaxSimIsa::Auto` arm of +/// [`MaxSimElement::build`]. `dispatch1_no_features` picks the highest +/// available arch on the host CPU and calls the matching `Target1::run` +/// below. struct BuildAndErase(E); // ───── f32 Target1 impls ───── @@ -184,7 +185,7 @@ impl> diskann_wide::arch::Target1 { fn run(self, arch: V4, query: MatRef<'_, Standard>) -> E::Output { - // V4 has no dedicated kernel yet; retarget to V3. 
+ // V4 dispatches to V3 (no V4-specific kernel). let arch = arch.retarget(); let prepared = BlockTransposed::::from_matrix_view(query.as_matrix_view()); self.0.erase(Prepared { arch, prepared }) @@ -196,7 +197,7 @@ impl> diskann_wide::arch::Target1 { fn run(self, arch: Neon, query: MatRef<'_, Standard>) -> E::Output { - // Neon has no dedicated kernel yet; retarget to Scalar. + // Neon dispatches to Scalar (no Neon-specific kernel). let arch = arch.retarget(); let prepared = BlockTransposed::::from_matrix_view(query.as_matrix_view()); self.0.erase(Prepared { arch, prepared }) @@ -232,6 +233,7 @@ impl> for BuildAndErase { fn run(self, arch: V4, query: MatRef<'_, Standard>) -> E::Output { + // V4 dispatches to V3 (no V4-specific kernel). let arch = arch.retarget(); let prepared = BlockTransposed::::from_matrix_view(query.as_matrix_view()); self.0.erase(Prepared { arch, prepared }) @@ -244,6 +246,7 @@ impl> for BuildAndErase { fn run(self, arch: Neon, query: MatRef<'_, Standard>) -> E::Output { + // Neon dispatches to Scalar (no Neon-specific kernel). let arch = arch.retarget(); let prepared = BlockTransposed::::from_matrix_view(query.as_matrix_view()); self.0.erase(Prepared { arch, prepared }) @@ -251,110 +254,292 @@ impl> } // ───────────────────────────────────────────────────────────────────────── -// Factory functions. +// MaxSimElement — sealed trait gating accepted element types. // ───────────────────────────────────────────────────────────────────────── -/// Build a multi-vector MaxSim kernel for `f32` queries. +mod sealed { + pub trait Sealed {} +} + +/// Scalar element types accepted by the multi-vector MaxSim factory. /// -/// Dispatches on `isa`, constructs the corresponding concrete kernel, and -/// hands it to `erase.erase(...)`. Returns [`NotSupported`] when the requested -/// ISA cannot run on this build (e.g. AVX-512 unavailable; aarch64 on x86_64). 
-pub fn build_max_sim_f32>( - isa: MaxSimIsa, - query: MatRef<'_, Standard>, - erase: E, -) -> Result { - match isa { - MaxSimIsa::Auto => Ok(diskann_wide::arch::dispatch1_no_features( - BuildAndErase(erase), - query, - )), - MaxSimIsa::Scalar => Ok(Scalar::new().run1(BuildAndErase(erase), query)), - #[cfg(target_arch = "x86_64")] - MaxSimIsa::X86_64_V3 => { - let arch = V3::new_checked().ok_or(NotSupported { +/// Sealed: external crates cannot add impls. The library ships impls for +/// `f32` and `half::f16`. Quantized representations (PQ, SQ, packed sub-byte) +/// do not fit this trait — they carry per-vector codebook/scale state and +/// will get dedicated factory functions when they are added. +pub trait MaxSimElement: sealed::Sealed + Sized + Copy + Send + Sync + 'static { + /// Build the concrete kernel for this element type and hand it to + /// `erase.erase(...)`. Returns [`NotSupported`] when the requested ISA + /// cannot run on this build (e.g. AVX-512 unavailable; aarch64 on x86_64). 
+ fn build>( + isa: MaxSimIsa, + query: MatRef<'_, Standard>, + erase: E, + ) -> Result; +} + +impl sealed::Sealed for f32 {} +impl sealed::Sealed for half::f16 {} + +impl MaxSimElement for f32 { + fn build>( + isa: MaxSimIsa, + query: MatRef<'_, Standard>, + erase: E, + ) -> Result { + match isa { + MaxSimIsa::Auto => Ok(diskann_wide::arch::dispatch1_no_features( + BuildAndErase(erase), + query, + )), + MaxSimIsa::Scalar => Ok(Scalar::new().run1(BuildAndErase(erase), query)), + #[cfg(target_arch = "x86_64")] + MaxSimIsa::X86_64_V3 => { + let arch = V3::new_checked().ok_or(NotSupported { + isa, + reason: "AVX2/FMA unavailable on this CPU", + })?; + Ok(arch.run1(BuildAndErase(erase), query)) + } + #[cfg(target_arch = "x86_64")] + MaxSimIsa::X86_64_V4 => { + let arch = V4::new_checked().ok_or(NotSupported { + isa, + reason: "AVX-512 unavailable on this CPU", + })?; + Ok(arch.run1(BuildAndErase(erase), query)) + } + #[cfg(not(target_arch = "x86_64"))] + MaxSimIsa::X86_64_V3 | MaxSimIsa::X86_64_V4 => Err(NotSupported { isa, - reason: "AVX2/FMA unavailable on this CPU", - })?; - Ok(arch.run1(BuildAndErase(erase), query)) - } - #[cfg(target_arch = "x86_64")] - MaxSimIsa::X86_64_V4 => { - let arch = V4::new_checked().ok_or(NotSupported { + reason: "x86_64 target only", + }), + #[cfg(target_arch = "aarch64")] + MaxSimIsa::Neon => { + let arch = Neon::new_checked().ok_or(NotSupported { + isa, + reason: "Neon unavailable on this CPU", + })?; + Ok(arch.run1(BuildAndErase(erase), query)) + } + #[cfg(not(target_arch = "aarch64"))] + MaxSimIsa::Neon => Err(NotSupported { isa, - reason: "AVX-512 unavailable on this CPU", - })?; - Ok(arch.run1(BuildAndErase(erase), query)) + reason: "aarch64 target only", + }), + MaxSimIsa::Reference => Ok(erase.erase(ReferenceKernel::::new(query))), } - #[cfg(not(target_arch = "x86_64"))] - MaxSimIsa::X86_64_V3 | MaxSimIsa::X86_64_V4 => Err(NotSupported { - isa, - reason: "x86_64 target only", - }), - #[cfg(target_arch = "aarch64")] - 
MaxSimIsa::Neon => { - let arch = Neon::new_checked().ok_or(NotSupported { + } +} + +impl MaxSimElement for half::f16 { + fn build>( + isa: MaxSimIsa, + query: MatRef<'_, Standard>, + erase: E, + ) -> Result { + match isa { + MaxSimIsa::Auto => Ok(diskann_wide::arch::dispatch1_no_features( + BuildAndErase(erase), + query, + )), + MaxSimIsa::Scalar => Ok(Scalar::new().run1(BuildAndErase(erase), query)), + #[cfg(target_arch = "x86_64")] + MaxSimIsa::X86_64_V3 => { + let arch = V3::new_checked().ok_or(NotSupported { + isa, + reason: "AVX2/FMA unavailable on this CPU", + })?; + Ok(arch.run1(BuildAndErase(erase), query)) + } + #[cfg(target_arch = "x86_64")] + MaxSimIsa::X86_64_V4 => { + let arch = V4::new_checked().ok_or(NotSupported { + isa, + reason: "AVX-512 unavailable on this CPU", + })?; + Ok(arch.run1(BuildAndErase(erase), query)) + } + #[cfg(not(target_arch = "x86_64"))] + MaxSimIsa::X86_64_V3 | MaxSimIsa::X86_64_V4 => Err(NotSupported { + isa, + reason: "x86_64 target only", + }), + #[cfg(target_arch = "aarch64")] + MaxSimIsa::Neon => { + let arch = Neon::new_checked().ok_or(NotSupported { + isa, + reason: "Neon unavailable on this CPU", + })?; + Ok(arch.run1(BuildAndErase(erase), query)) + } + #[cfg(not(target_arch = "aarch64"))] + MaxSimIsa::Neon => Err(NotSupported { isa, - reason: "Neon unavailable on this CPU", - })?; - Ok(arch.run1(BuildAndErase(erase), query)) + reason: "aarch64 target only", + }), + MaxSimIsa::Reference => Ok(erase.erase(ReferenceKernel::::new(query))), } - #[cfg(not(target_arch = "aarch64"))] - MaxSimIsa::Neon => Err(NotSupported { - isa, - reason: "aarch64 target only", - }), - MaxSimIsa::Reference => Ok(erase.erase(ReferenceKernel::::new(query))), } } -/// Build a multi-vector MaxSim kernel for `half::f16` queries. Same contract -/// as [`build_max_sim_f32`]. -pub fn build_max_sim_f16>( +// ───────────────────────────────────────────────────────────────────────── +// Factory entry point. 
+// ───────────────────────────────────────────────────────────────────────── + +/// Build a multi-vector MaxSim kernel for any [`MaxSimElement`] type. +/// +/// Thin wrapper over [`MaxSimElement::build`] — exists so generic callers can +/// write `build_max_sim::(isa, query, erase)` without naming the trait +/// at the call site. Returns [`NotSupported`] when the requested ISA cannot +/// run on this build (e.g. AVX-512 unavailable; aarch64 on x86_64). +pub fn build_max_sim>( isa: MaxSimIsa, - query: MatRef<'_, Standard>, + query: MatRef<'_, Standard>, erase: E, ) -> Result { - match isa { - MaxSimIsa::Auto => Ok(diskann_wide::arch::dispatch1_no_features( - BuildAndErase(erase), - query, - )), - MaxSimIsa::Scalar => Ok(Scalar::new().run1(BuildAndErase(erase), query)), - #[cfg(target_arch = "x86_64")] - MaxSimIsa::X86_64_V3 => { - let arch = V3::new_checked().ok_or(NotSupported { - isa, - reason: "AVX2/FMA unavailable on this CPU", - })?; - Ok(arch.run1(BuildAndErase(erase), query)) + T::build(isa, query, erase) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::multi_vector::{BoxErase, Chamfer, MaxSim, QueryMatRef}; + + /// Local helper trait — picks a sane test value of `T` from an `f32` + /// so both `f32` and `half::f16` parameterizations share the same data + /// generator. 
+ trait FromF32 { + fn from_f32(v: f32) -> Self; + } + + impl FromF32 for f32 { + fn from_f32(v: f32) -> Self { + v } - #[cfg(target_arch = "x86_64")] - MaxSimIsa::X86_64_V4 => { - let arch = V4::new_checked().ok_or(NotSupported { - isa, - reason: "AVX-512 unavailable on this CPU", - })?; - Ok(arch.run1(BuildAndErase(erase), query)) + } + + impl FromF32 for half::f16 { + fn from_f32(v: f32) -> Self { + diskann_wide::cast_f32_to_f16(v) } - #[cfg(not(target_arch = "x86_64"))] - MaxSimIsa::X86_64_V3 | MaxSimIsa::X86_64_V4 => Err(NotSupported { - isa, - reason: "x86_64 target only", - }), - #[cfg(target_arch = "aarch64")] - MaxSimIsa::Neon => { - let arch = Neon::new_checked().ok_or(NotSupported { - isa, - reason: "Neon unavailable on this CPU", - })?; - Ok(arch.run1(BuildAndErase(erase), query)) + } + + fn make_mat(data: &[T], nrows: usize, ncols: usize) -> MatRef<'_, Standard> { + MatRef::new(Standard::new(nrows, ncols).unwrap(), data).unwrap() + } + + fn make_test_data(len: usize, ceil: usize, shift: usize) -> Vec { + (0..len) + .map(|v| T::from_f32(((v + shift) % ceil) as f32)) + .collect() + } + + /// Shapes for the `chamfer_matches_fallback` / `max_sim_matches_fallback` + /// agreement checks: `(num_queries, num_docs, dim)`. + /// + /// Targets the factory wiring (query setup, score writeback) above the + /// kernel layer; exhaustive panel/remainder coverage is pinned in + /// `kernels::tiled_reduce::tests`. 
+ const TEST_CASES: &[(usize, usize, usize)] = &[ + (1, 1, 4), // Degenerate + (5, 3, 5), // Prime k; nq > 1 and nd > 1 exercise per-row writeback + (17, 4, 64), // A-panel remainder crossing both Scalar and V3 panel widths + (16, 6, 32), // B-remainder ≠ 1 (V3 b_remainder = 2) + ]; + + fn check_chamfer_matches(tol: f32, label: &str) + where + T: MaxSimElement + FromF32, + InnerProduct: for<'a, 'b> PureDistanceFunction<&'a [T], &'b [T], f32>, + { + for &(nq, nd, dim) in TEST_CASES { + let query_data = make_test_data::(nq * dim, dim, dim / 2); + let doc_data = make_test_data::(nd * dim, dim, dim); + + let query = make_mat(&query_data, nq, dim); + let doc = make_mat(&doc_data, nd, dim); + + let expected = Chamfer::evaluate(QueryMatRef::from(query), doc); + + let kernel = build_max_sim::(MaxSimIsa::Auto, query, BoxErase).unwrap(); + let mut scores = vec![0.0f32; nq]; + kernel.compute_max_sim(doc, &mut scores); + let actual: f32 = scores.iter().sum(); + + assert!( + (actual - expected).abs() < tol, + "{label}Chamfer mismatch for ({nq},{nd},{dim}): actual={actual}, expected={expected}", + ); + } + } + + fn check_max_sim_matches(tol: f32, label: &str) + where + T: MaxSimElement + FromF32, + InnerProduct: for<'a, 'b> PureDistanceFunction<&'a [T], &'b [T], f32>, + { + for &(nq, nd, dim) in TEST_CASES { + let query_data = make_test_data::(nq * dim, dim, dim / 2); + let doc_data = make_test_data::(nd * dim, dim, dim); + + let query = make_mat(&query_data, nq, dim); + let doc = make_mat(&doc_data, nd, dim); + + let mut expected_scores = vec![0.0f32; nq]; + let _ = MaxSim::new(&mut expected_scores) + .unwrap() + .evaluate(QueryMatRef::from(query), doc); + + let kernel = build_max_sim::(MaxSimIsa::Auto, query, BoxErase).unwrap(); + let mut actual_scores = vec![0.0f32; nq]; + kernel.compute_max_sim(doc, &mut actual_scores); + + for i in 0..nq { + assert!( + (actual_scores[i] - expected_scores[i]).abs() < tol, + "{label}MaxSim[{i}] mismatch for ({nq},{nd},{dim}): actual={}, 
expected={}", + actual_scores[i], + expected_scores[i], + ); + } } - #[cfg(not(target_arch = "aarch64"))] - MaxSimIsa::Neon => Err(NotSupported { - isa, - reason: "aarch64 target only", - }), - MaxSimIsa::Reference => Ok(erase.erase(ReferenceKernel::::new(query))), } + + #[test] + fn dimensions_f32() { + let data = vec![1.0f32; 5 * 8]; + let query = make_mat(&data, 5, 8); + let kernel = build_max_sim::(MaxSimIsa::Auto, query, BoxErase).unwrap(); + assert_eq!(kernel.nrows(), 5); + } + + #[test] + fn dimensions_f16() { + let data = vec![diskann_wide::cast_f32_to_f16(1.0); 5 * 8]; + let query = make_mat(data.as_slice(), 5, 8); + let kernel = build_max_sim::(MaxSimIsa::Auto, query, BoxErase).unwrap(); + assert_eq!(kernel.nrows(), 5); + } + + macro_rules! test_matches_fallback { + ($mod_name:ident, $ty:ty, $tol:expr, $label:literal) => { + mod $mod_name { + use super::*; + + #[test] + fn chamfer_matches_fallback() { + check_chamfer_matches::<$ty>($tol, $label); + } + + #[test] + fn max_sim_matches_fallback() { + check_max_sim_matches::<$ty>($tol, $label); + } + } + }; + } + + test_matches_fallback!(f32, f32, 1e-10, "f32 "); + test_matches_fallback!(f16, half::f16, 1e-10, "f16 "); } diff --git a/diskann-quantization/src/multi_vector/distance/isa.rs b/diskann-quantization/src/multi_vector/distance/isa.rs index 49768bc48..d4495dd55 100644 --- a/diskann-quantization/src/multi_vector/distance/isa.rs +++ b/diskann-quantization/src/multi_vector/distance/isa.rs @@ -44,9 +44,9 @@ impl std::fmt::Display for MaxSimIsa { } } -/// Returned by `build_max_sim_*` when the requested ISA cannot be produced on -/// the current host (e.g. x86_64 V4 requested on a non-AVX512 CPU, or Neon -/// requested on x86_64). +/// Returned by [`build_max_sim`](super::build_max_sim) when the requested +/// ISA cannot be produced on the current host (e.g. x86_64 V4 requested on +/// a non-AVX512 CPU, or Neon requested on x86_64). 
#[derive(Debug, Clone, Copy)] pub struct NotSupported { pub isa: MaxSimIsa, diff --git a/diskann-quantization/src/multi_vector/distance/mod.rs b/diskann-quantization/src/multi_vector/distance/mod.rs index 9afb070c5..d4bc2725d 100644 --- a/diskann-quantization/src/multi_vector/distance/mod.rs +++ b/diskann-quantization/src/multi_vector/distance/mod.rs @@ -8,12 +8,14 @@ //! - [`MaxSim`]: per-query-vector maximum similarities. //! - [`Chamfer`]: sum of MaxSim scores (asymmetric Chamfer distance). //! - [`MaxSimKernel`]: object-safe interface implemented by every concrete -//! kernel constructed through [`build_max_sim_f32`] / [`build_max_sim_f16`]. +//! kernel constructed through [`build_max_sim`]. //! - [`Erase`]: BYOTE visitor — caller decides how to type-erase the kernel. +//! - [`MaxSimElement`]: sealed trait gating which element types the factory +//! accepts. //! //! The fallback path uses a double-loop kernel over //! [`InnerProduct`](diskann_vector::distance::InnerProduct). The factory -//! functions return cache-tiled SIMD kernels selected by [`MaxSimIsa`]. +//! returns cache-tiled SIMD kernels selected by [`MaxSimIsa`]. //! //! # Example //! @@ -56,7 +58,7 @@ mod kernel; mod kernels; mod max_sim; -pub use factory::{build_max_sim_f16, build_max_sim_f32}; +pub use factory::{MaxSimElement, build_max_sim}; pub use fallback::QueryMatRef; pub use isa::{MaxSimIsa, NotSupported}; pub use kernel::{BoxErase, Erase, MaxSimKernel}; diff --git a/diskann-quantization/src/multi_vector/mod.rs b/diskann-quantization/src/multi_vector/mod.rs index d2ad0e7bc..edeca4ef0 100644 --- a/diskann-quantization/src/multi_vector/mod.rs +++ b/diskann-quantization/src/multi_vector/mod.rs @@ -22,7 +22,8 @@ //! | [`QueryMatRef`] | Query wrapper for asymmetric distances | //! | [`MaxSim`] | Per-query-vector max similarity computation | //! | [`Chamfer`] | Asymmetric Chamfer distance (sum of MaxSim) | -//! 
| [`MaxSimKernel`] | Object-safe kernel returned by [`build_max_sim_f32`] / [`build_max_sim_f16`] | +//! | [`MaxSimKernel`] | Object-safe kernel returned by [`build_max_sim`] | +//! | [`MaxSimElement`] | Sealed trait gating element types the factory accepts | //! | [`MaxSimIsa`] | ISA selector for the factory functions | //! | [`Erase`] | BYOTE visitor used by the factory | //! @@ -75,8 +76,8 @@ pub(crate) mod matrix; pub use block_transposed::{BlockTransposed, BlockTransposedMut, BlockTransposedRef}; pub use distance::{ - BoxErase, Chamfer, Erase, MaxSim, MaxSimError, MaxSimIsa, MaxSimKernel, NotSupported, - QueryMatRef, build_max_sim_f16, build_max_sim_f32, + BoxErase, Chamfer, Erase, MaxSim, MaxSimElement, MaxSimError, MaxSimIsa, MaxSimKernel, + NotSupported, QueryMatRef, build_max_sim, }; pub use matrix::{ Defaulted, LayoutError, Mat, MatMut, MatRef, NewCloned, NewMut, NewOwned, NewRef, Overflow, From 03d61197b127d11d21aea2228f2a26feca7d91b4 Mon Sep 17 00:00:00 2001 From: Suryansh Gupta Date: Tue, 19 May 2026 23:52:00 +0530 Subject: [PATCH 13/13] Fix after main merge --- .../src/backend/multi_vector/driver.rs | 27 ++++++++---------- diskann-benchmark/src/inputs/multi_vector.rs | 28 +++++++++---------- 2 files changed, 26 insertions(+), 29 deletions(-) diff --git a/diskann-benchmark/src/backend/multi_vector/driver.rs b/diskann-benchmark/src/backend/multi_vector/driver.rs index e59f24ac2..57446ae9b 100644 --- a/diskann-benchmark/src/backend/multi_vector/driver.rs +++ b/diskann-benchmark/src/backend/multi_vector/driver.rs @@ -16,7 +16,7 @@ use diskann_benchmark_runner::{ num::{relative_change, NonNegativeFinite}, percentiles, MicroSeconds, }, - Any, CheckDeserialization, Checker, Input, + Checker, Input, }; use diskann_quantization::multi_vector::{Mat, MatRef, MaxSimKernel, Standard}; use rand::{ @@ -42,33 +42,30 @@ pub(super) struct MultiVectorTolerance { pub(super) min_time_regression: NonNegativeFinite, } -impl CheckDeserialization for MultiVectorTolerance { - 
fn check_deserialization(&mut self, _checker: &mut Checker) -> Result<(), anyhow::Error> { - Ok(()) - } -} - impl Input for MultiVectorTolerance { + type Raw = Self; + fn tag() -> &'static str { "multi-vector-tolerance" } - fn try_deserialize( - serialized: &serde_json::Value, - checker: &mut Checker, - ) -> anyhow::Result { - checker.any(Self::deserialize(serialized)?) + fn from_raw(raw: Self::Raw, _checker: &mut Checker) -> anyhow::Result { + Ok(raw) } - fn example() -> anyhow::Result { + fn serialize(&self) -> anyhow::Result { + Ok(serde_json::to_value(self)?) + } + + fn example() -> Self { const EXAMPLE: NonNegativeFinite = match NonNegativeFinite::new(0.05) { Ok(v) => v, Err(_) => panic!("use a non-negative finite please"), }; - Ok(serde_json::to_value(MultiVectorTolerance { + MultiVectorTolerance { min_time_regression: EXAMPLE, - })?) + } } } diff --git a/diskann-benchmark/src/inputs/multi_vector.rs b/diskann-benchmark/src/inputs/multi_vector.rs index 9d863c13a..cbb1c255b 100644 --- a/diskann-benchmark/src/inputs/multi_vector.rs +++ b/diskann-benchmark/src/inputs/multi_vector.rs @@ -5,18 +5,10 @@ use std::num::NonZeroUsize; -use diskann_benchmark_runner::{utils::datatype::DataType, CheckDeserialization, Checker}; +use diskann_benchmark_runner::{utils::datatype::DataType, Checker, Input}; use diskann_quantization::multi_vector::MaxSimIsa; use serde::{Deserialize, Serialize}; -use crate::inputs::{as_input, Example}; - -////////////// -// Registry // -////////////// - -as_input!(MultiVectorOp); - //////////////// // Enum types // //////////////// @@ -100,13 +92,21 @@ impl MultiVectorOp { } } -impl CheckDeserialization for MultiVectorOp { - fn check_deserialization(&mut self, _checker: &mut Checker) -> Result<(), anyhow::Error> { - Ok(()) +impl Input for MultiVectorOp { + type Raw = Self; + + fn tag() -> &'static str { + Self::tag() + } + + fn from_raw(raw: Self::Raw, _checker: &mut Checker) -> anyhow::Result { + Ok(raw) + } + + fn serialize(&self) -> 
anyhow::Result { + Ok(serde_json::to_value(self)?) } -} -impl Example for MultiVectorOp { fn example() -> Self { const NUM_DOC_VECTORS: NonZeroUsize = NonZeroUsize::new(64).unwrap(); const DIM: NonZeroUsize = NonZeroUsize::new(128).unwrap();