diff --git a/Cargo.lock b/Cargo.lock index 078e1b29fa..7b6801958f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8746,6 +8746,14 @@ dependencies = [ "tracing-subscriber", ] +[[package]] +name = "ruvector-bet4-ivf-bench" +version = "0.1.0" +dependencies = [ + "rand 0.8.5", + "ruvector-rairs", +] + [[package]] name = "ruvector-cli" version = "2.2.3" diff --git a/Cargo.toml b/Cargo.toml index 38128585a2..d92de77db0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -233,6 +233,8 @@ members = [ "crates/ruvllm_retrieval_diffusion", # RAIRS IVF: Redundant Assignment + Amplified Inverse Residual (ADR-193) "crates/ruvector-rairs", + # BET 4 (SepRAG #534): LB-B&B IVF probing vs plain IVF nprobe + "crates/ruvector-bet4-ivf-bench", ] resolver = "2" diff --git a/crates/ruvector-bet4-ivf-bench/Cargo.toml b/crates/ruvector-bet4-ivf-bench/Cargo.toml new file mode 100644 index 0000000000..fdc1e82776 --- /dev/null +++ b/crates/ruvector-bet4-ivf-bench/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "ruvector-bet4-ivf-bench" +version = "0.1.0" +edition = "2021" +license = "MIT" +publish = false +description = "BET 4 (SepRAG #534): LB-ordered branch-and-bound IVF probing vs plain IVF nprobe" + +[dependencies] +ruvector-rairs = { path = "../ruvector-rairs" } +rand = "0.8" + +[lib] +crate-type = ["rlib"] diff --git a/crates/ruvector-bet4-ivf-bench/examples/ivf_pruning_sweep.rs b/crates/ruvector-bet4-ivf-bench/examples/ivf_pruning_sweep.rs new file mode 100644 index 0000000000..8691ccf4ac --- /dev/null +++ b/crates/ruvector-bet4-ivf-bench/examples/ivf_pruning_sweep.rs @@ -0,0 +1,198 @@ +//! BET 4 matched-recall sweep (M2/M3): LB-ordered branch-and-bound IVF probing vs the tuned plain +//! `IvfFlat` `nprobe` incumbent, on real 128-d arxiv embeddings AND a PCA-8 low-dim control. +//! +//! Three contenders share one index per `nclusters` (built once): plain `nprobe` (incumbent), +//! B&B in **LB-order** (the faithful BET-2 `RegionPruneIvf` kernel), and the **steelman** B&B — +//! 
centroid-distance order + LB-skip (the strongest version: if it can't beat `nprobe`, the bound +//! doesn't pay). Reports the exact-regime pruning fraction, matched-recall cost, and checks the +//! FROZEN gate (docs/plans/bet4-ivf-pruning/PRE-REGISTRATION.md) on the steelman ratio. +//! +//! Run: `cargo run --release -p ruvector-bet4-ivf-bench --example ivf_pruning_sweep -- [N]` + +use ruvector_bet4_ivf_bench::data::load_feat_csv; +use ruvector_bet4_ivf_bench::kernel::BnBIvf; +use ruvector_bet4_ivf_bench::oracle::{brute_force_topk, recall_at_k}; +use ruvector_bet4_ivf_bench::pca::project_topm; +use ruvector_rairs::SearchResult; +use std::time::Instant; + +const K: usize = 10; +const R_TARGET: f64 = 0.95; +const NCLUSTERS: [usize; 3] = [64, 256, 1024]; + +fn main() { + let args: Vec<String> = std::env::args().collect(); + let n_req: usize = args.get(1).and_then(|s| s.parse().ok()).unwrap_or(20_000); + let data = + std::env::var("BET4_DATA").unwrap_or_else(|_| "target/m1-data/node-feat-100k.csv".into()); + + let corpus = load_feat_csv(&data, n_req).unwrap_or_else(|e| { + eprintln!("failed to load {data}: {e}"); + std::process::exit(1); + }); + let n = corpus.len(); + let dim = corpus.first().map(|v| v.len()).unwrap_or(0); + println!("# BET4 sweep n={n} dim={dim} k={K} R_target={R_TARGET} data={data}\n"); + + run_regime("128-d (real arxiv features)", &corpus); + + println!("\n# Projecting to PCA-8 (low-dim control)…"); + let t = Instant::now(); + let corpus8 = project_topm(&corpus, 8, 60); + println!("# PCA done in {:?}\n", t.elapsed()); + run_regime("PCA-8 (low-dim control — bound should be TIGHT, B&B should WIN)", &corpus8); +} + +fn run_regime(label: &str, corpus: &[Vec<f32>]) { + let n = corpus.len(); + let dim = corpus[0].len(); + let nq = 200.min(n); + let queries: Vec<usize> = (0..nq).collect(); + let truth: Vec<Vec<usize>> = queries + .iter() + .map(|&q| brute_force_topk(corpus, &corpus[q], K)) + .collect(); + + println!("════ REGIME: {label} (dim={dim}) ════"); + let mut cells: Vec<Cell> = 
Vec::new(); + + for &nc in &NCLUSTERS { + let t_build = Instant::now(); + let idx = BnBIvf::build(corpus, nc, 15, 42); + let nc_eff = idx.num_lists(); + let build = t_build.elapsed(); + + // Exact-regime pruning fraction (LB-order full budget). + let mut pruned = 0.0; + for &q in &queries { + let (_r, _e, probed) = idx.search(&corpus[q], K, None); + pruned += (nc_eff - probed) as f64 / nc_eff as f64; + } + let prune_frac = pruned / nq as f64; + + let grid = knob_grid(nc_eff); + let plain = matched(&queries, corpus, &truth, &grid, |q, knob| { + let (r, ev, _) = idx.search_nprobe(q, K, knob); + (ids(&r), ev) + }); + let bnb_lb = matched(&queries, corpus, &truth, &grid, |q, knob| { + let (r, ev, _) = idx.search(q, K, Some(knob)); + (ids(&r), ev) + }); + let bnb_skip = matched(&queries, corpus, &truth, &grid, |q, knob| { + let (r, ev, _) = idx.search_bnb_skip(q, K, Some(knob)); + (ids(&r), ev) + }); + + let eval_ratio = plain.evals / bnb_skip.evals.max(1.0); + let wall_ratio = plain.wall_ns as f64 / bnb_skip.wall_ns.max(1) as f64; + + println!("\n## nclusters={nc_eff} (build {build:?}) exact-regime prune={:.1}%", prune_frac * 100.0); + print_row("plain nprobe (incumbent)", &plain); + print_row("B&B LB-order (BET-2 kernel)", &bnb_lb); + print_row("B&B steelman (cdist+LB-skip)", &bnb_skip); + println!( + " steelman vs incumbent: eval {eval_ratio:.2}x wall {wall_ratio:.2}x" + ); + + cells.push(Cell { nc: nc_eff, eval_ratio, wall_ratio, prune_frac }); + } + + verdict(label, &cells); +} + +struct Cell { + nc: usize, + eval_ratio: f64, + wall_ratio: f64, + prune_frac: f64, +} + +struct Matched { + knob: usize, + recall: f64, + evals: f64, + wall_ns: u128, +} + +fn print_row(name: &str, m: &Matched) { + println!( + " {name:<32} knob={:<4} recall={:.4} evals/q={:>8.0} wall/q={:>6}µs", + m.knob, + m.recall, + m.evals, + m.wall_ns / 1000 + ); +} + +/// First knob (ascending) whose mean recall ≥ `R_TARGET`, with its mean member-evals and wall-time; +/// falls back to the largest 
knob if none reaches target. +fn matched<F>( + queries: &[usize], + corpus: &[Vec<f32>], + truth: &[Vec<usize>], + grid: &[usize], + search: F, +) -> Matched +where + F: Fn(&[f32], usize) -> (Vec<usize>, usize), +{ + let mut last = Matched { knob: 0, recall: 0.0, evals: 0.0, wall_ns: 0 }; + for &knob in grid { + let t = Instant::now(); + let mut rec = 0.0; + let mut ev = 0usize; + for (qi, &q) in queries.iter().enumerate() { + let (got, e) = search(&corpus[q], knob); + ev += e; + rec += recall_at_k(&truth[qi], &got, K); + } + let wall_ns = t.elapsed().as_nanos() / queries.len() as u128; + last = Matched { + knob, + recall: rec / queries.len() as f64, + evals: ev as f64 / queries.len() as f64, + wall_ns, + }; + if last.recall >= R_TARGET { + return last; + } + } + last +} + +fn knob_grid(maxv: usize) -> Vec<usize> { + let mut g = Vec::new(); + let mut x = 1usize; + while x < maxv { + g.push(x); + x = ((x as f64) * 1.5).ceil() as usize; + } + g.push(maxv); + g.dedup(); + g +} + +fn ids(res: &[SearchResult]) -> Vec<usize> { + res.iter().map(|r| r.id).collect() +} + +fn verdict(label: &str, cells: &[Cell]) { + let all_win = cells.iter().all(|c| c.eval_ratio >= 2.0 && c.wall_ratio > 1.0); + let any_kill = cells.iter().any(|c| c.eval_ratio < 1.5 || c.wall_ratio < 1.0); + let v = if all_win { + "WIN (≥2× evals AND wall-clock win across all nclusters)" + } else if any_kill { + "KILL / NO-GO (<1.5× somewhere or wall reversed — bound too loose to pay)" + } else { + "QUALIFIED (1.5–2×, or mixed)" + }; + println!("\n ── verdict [{label}] ──"); + for c in cells { + println!( + " nclusters={:<5} steelman eval={:.2}x wall={:.2}x exact-prune={:.1}%", + c.nc, c.eval_ratio, c.wall_ratio, c.prune_frac * 100.0 + ); + } + println!(" => {v}"); +} diff --git a/crates/ruvector-bet4-ivf-bench/src/data.rs b/crates/ruvector-bet4-ivf-bench/src/data.rs new file mode 100644 index 0000000000..2d2ec1184c --- /dev/null +++ b/crates/ruvector-bet4-ivf-bench/src/data.rs @@ -0,0 +1,29 @@ +//! 
Loader for the aligned ogbn-arxiv 128-d node-feature CSV (row `i` = node `i`), the same +//! public corpus used by ADR-201/202/204. Data lives under `target/m1-data/` (gitignored). + +use std::fs::File; +use std::io::{BufRead, BufReader}; +use std::path::Path; + +/// Load up to `limit` rows of comma-separated f32 features. Blank lines are skipped. Each +/// returned row is one node's feature vector (all rows share the file's column count, 128 for +/// the arxiv features). +pub fn load_feat_csv<P: AsRef<Path>>(path: P, limit: usize) -> std::io::Result<Vec<Vec<f32>>> { + let reader = BufReader::new(File::open(path)?); + let mut out = Vec::with_capacity(limit); + for line in reader.lines() { + if out.len() >= limit { + break; + } + let line = line?; + if line.trim().is_empty() { + continue; + } + let row: Vec<f32> = line + .split(',') + .map(|s| s.trim().parse::<f32>().unwrap_or(0.0)) + .collect(); + out.push(row); + } + Ok(out) +} diff --git a/crates/ruvector-bet4-ivf-bench/src/kernel.rs b/crates/ruvector-bet4-ivf-bench/src/kernel.rs new file mode 100644 index 0000000000..04a18addcc --- /dev/null +++ b/crates/ruvector-bet4-ivf-bench/src/kernel.rs @@ -0,0 +1,234 @@ +//! `BnBIvf` — the BET 4 contender: an IVF index probed in **lower-bound order with +//! branch-and-bound early termination**, over the same `ruvector-rairs` k-means substrate as +//! the plain-`IvfFlat` incumbent. +//! +//! For a query `q` and cluster `c` with centroid `μ_c` and radius `r_c = max_{v∈c} ‖v−μ_c‖`, +//! the triangle inequality gives a lower bound on the distance to *any* member of `c`: +//! `LB(q,c) = max(0, ‖q−μ_c‖ − r_c)`. Probing clusters in ascending `LB` while tracking the +//! running k-th-best distance `τ`, we may stop the instant `LB(c) ≥ τ`: every not-yet-probed +//! cluster has an even larger `LB`, so none can contain a top-k point. That single break makes +//! full-budget B&B **exact** (recall → 1.0) yet lets it skip clusters a fixed `nprobe` would +//! scan. 
A `max_probe` cap turns it into an approximate knob (the analogue of `nprobe`) for the +//! matched-recall comparison. + +use crate::oracle::l2; +use ruvector_rairs::{kmeans, SearchResult}; +use std::cmp::Ordering; +use std::collections::BinaryHeap; + +/// IVF index supporting lower-bound-ordered branch-and-bound probing. +pub struct BnBIvf { + centroids: Vec<Vec<f32>>, + /// Per cluster: `(id, vector)` of its members. + lists: Vec<Vec<(usize, Vec<f32>)>>, + /// Per cluster: max member distance to its centroid (the B&B radius). + radii: Vec<f32>, +} + +/// Top-k accumulator element. `BinaryHeap` is a max-heap, so the **worst** (largest distance) +/// candidate sits on top and is the one evicted when a closer point arrives. +struct Cand { + dist: f32, + id: usize, +} +impl PartialEq for Cand { + fn eq(&self, o: &Self) -> bool { + self.dist == o.dist + } +} +impl Eq for Cand {} +impl PartialOrd for Cand { + fn partial_cmp(&self, o: &Self) -> Option<Ordering> { + Some(self.cmp(o)) + } +} +impl Ord for Cand { + fn cmp(&self, o: &Self) -> Ordering { + self.dist.total_cmp(&o.dist) + } +} + +/// Offer candidate `(id, d)` to a bounded top-`k` max-heap: insert while under capacity, else +/// replace the current worst iff `d` is closer. Shared by both probe strategies so they accumulate +/// results identically — only their cluster-visit order/stopping differs. +#[inline] +fn consider(heap: &mut BinaryHeap<Cand>, k: usize, id: usize, d: f32) { + if heap.len() < k { + heap.push(Cand { dist: d, id }); + } else if d < heap.peek().unwrap().dist { + heap.pop(); + heap.push(Cand { dist: d, id }); + } +} + +/// Drain a top-`k` heap into an ascending-distance result vector. +fn finalize(heap: BinaryHeap<Cand>) -> Vec<SearchResult> { + let mut res: Vec<SearchResult> = heap + .into_iter() + .map(|c| SearchResult { + id: c.id, + distance: c.dist, + }) + .collect(); + res.sort_by(|a, b| a.distance.total_cmp(&b.distance)); + res +} + +impl BnBIvf { + /// Build over `corpus` using `ruvector-rairs` k-means (`nclusters`, `max_iter`, `seed`). 
+ /// Using the same `(corpus, nclusters, max_iter, seed)` as `IvfFlat::train` yields identical + /// centroids — the shared-index guarantee the pre-registration requires. + pub fn build(corpus: &[Vec<f32>], nclusters: usize, max_iter: usize, seed: u64) -> Self { + assert!(!corpus.is_empty(), "empty corpus"); + let k = nclusters.min(corpus.len()).max(1); + let (centroids, assignments) = kmeans::train(corpus, k, max_iter, seed); + let kc = centroids.len(); + let mut lists: Vec<Vec<(usize, Vec<f32>)>> = vec![Vec::new(); kc]; + for (i, v) in corpus.iter().enumerate() { + lists[assignments[i]].push((i, v.clone())); + } + let radii: Vec<f32> = (0..kc) + .map(|c| { + lists[c] + .iter() + .map(|(_, v)| l2(v, &centroids[c])) + .fold(0.0f32, f32::max) + }) + .collect(); + Self { + centroids, + lists, + radii, + } + } + + /// Number of inverted lists (clusters). + pub fn num_lists(&self) -> usize { + self.centroids.len() + } + + /// Search for the top-`k` neighbours of `q`. + /// + /// `max_probe = None` runs full-budget B&B (**exact**); `Some(m)` probes at most `m` + /// clusters in lower-bound order (approximate, the `nprobe` analogue). Returns the top-k + /// (ascending distance), the number of **member** distance-evals charged, and the number of + /// clusters actually probed. The `nclusters` centroid evals (routing) are *not* folded into + /// the member count — the harness charges them separately and equally to both contenders. + pub fn search( + &self, + q: &[f32], + k: usize, + max_probe: Option<usize>, + ) -> (Vec<SearchResult>, usize, usize) { + let nclusters = self.centroids.len(); + // Routing: lower bound per cluster, then ascending-LB order. 
+ let mut order: Vec<(f32, usize)> = (0..nclusters) + .map(|c| { + let lb = (l2(q, &self.centroids[c]) - self.radii[c]).max(0.0); + (lb, c) + }) + .collect(); + order.sort_by(|a, b| a.0.total_cmp(&b.0)); + + let cap = max_probe.unwrap_or(nclusters).min(nclusters); + let mut heap: BinaryHeap<Cand> = BinaryHeap::with_capacity(k + 1); + let mut member_evals = 0usize; + let mut probed = 0usize; + + for (lb, c) in order { + if probed >= cap { + break; + } + // Branch-and-bound: once the heap is full and the best possible distance in this + // (and every later) cluster is no better than the current k-th best, stop. + if heap.len() == k { + let kth = heap.peek().unwrap().dist; + if lb >= kth { + break; + } + } + for (id, v) in &self.lists[c] { + member_evals += 1; + consider(&mut heap, k, *id, l2(q, v)); + } + probed += 1; + } + + (finalize(heap), member_evals, probed) + } + + /// The **steelman B&B**: visit clusters in centroid-distance order (the effective `nprobe` + /// ordering, so τ tightens fast), but **skip** scanning any cluster the lower bound proves + /// cannot hold a top-k point (`LB(q,c) ≥ τ`). Unlike [`search`](Self::search)'s global early + /// `break`, skipping is correctness-safe in *any* visit order (a skipped cluster genuinely + /// cannot contain a closer point); a global break would be unsound here because a later, + /// large-radius cluster can have a *smaller* LB than the current one. + /// + /// `max_probe` caps the number of clusters **considered** (the apples-to-apples budget against + /// `nprobe`); LB-skips save member scans within that budget. This is the strongest version of + /// the bet — if it cannot beat `nprobe`, the bound itself doesn't pay. Returns + /// `(top-k, member_evals, clusters_considered)`. 
+ pub fn search_bnb_skip( + &self, + q: &[f32], + k: usize, + max_probe: Option<usize>, + ) -> (Vec<SearchResult>, usize, usize) { + let nclusters = self.centroids.len(); + let mut order: Vec<(f32, usize)> = (0..nclusters) + .map(|c| (l2(q, &self.centroids[c]), c)) + .collect(); + order.sort_by(|a, b| a.0.total_cmp(&b.0)); + let cap = max_probe.unwrap_or(nclusters).min(nclusters); + + let mut heap: BinaryHeap<Cand> = BinaryHeap::with_capacity(k + 1); + let mut member_evals = 0usize; + let mut considered = 0usize; + for (dc, c) in order { + if considered >= cap { + break; + } + considered += 1; + if heap.len() == k { + let kth = heap.peek().unwrap().dist; + if (dc - self.radii[c]).max(0.0) >= kth { + continue; // LB-skip: provably cannot improve the top-k + } + } + for (id, v) in &self.lists[c] { + member_evals += 1; + consider(&mut heap, k, *id, l2(q, v)); + } + } + (finalize(heap), member_evals, considered) + } + + /// The **plain-IVF incumbent** strategy on this same shared index: visit the `nprobe` nearest + /// centroids (by centroid distance) and scan **all** their members — no lower-bound ordering, + /// no early termination. This is exactly `ruvector-rairs::IvfFlat::search`'s algorithm + /// (validated equal by `instrumented_nprobe_matches_rairs`), instrumented to count member + /// distance-evals and sharing B&B's centroids/lists so the comparison isolates the probe loop. 
+ pub fn search_nprobe( + &self, + q: &[f32], + k: usize, + nprobe: usize, + ) -> (Vec<SearchResult>, usize, usize) { + let nclusters = self.centroids.len(); + let mut cd: Vec<(f32, usize)> = (0..nclusters) + .map(|c| (l2(q, &self.centroids[c]), c)) + .collect(); + cd.sort_by(|a, b| a.0.total_cmp(&b.0)); + let np = nprobe.clamp(1, nclusters); + + let mut heap: BinaryHeap<Cand> = BinaryHeap::with_capacity(k + 1); + let mut member_evals = 0usize; + for &(_, c) in cd.iter().take(np) { + for (id, v) in &self.lists[c] { + member_evals += 1; + consider(&mut heap, k, *id, l2(q, v)); + } + } + (finalize(heap), member_evals, np) + } +} diff --git a/crates/ruvector-bet4-ivf-bench/src/lib.rs b/crates/ruvector-bet4-ivf-bench/src/lib.rs new file mode 100644 index 0000000000..c4cd77e46f --- /dev/null +++ b/crates/ruvector-bet4-ivf-bench/src/lib.rs @@ -0,0 +1,17 @@ +//! BET 4 (SepRAG, ruvnet/RuVector #534): does **lower-bound-ordered branch-and-bound** +//! IVF probing beat a tuned plain `IvfFlat` `nprobe` on unfiltered ANN over real 128-d +//! embeddings, at matched recall@10? +//! +//! This closes the BET 4 caveat left open by ADR-201: the region-pruning IVF kernel was +//! only ever run against ACORN (BET 2), never head-to-head against its natural incumbent — +//! plain IVF `nprobe`. The B&B kernel is rebuilt self-contained here (BET 2's lives only on +//! the #536 branch), over the same `ruvector-rairs` k-means substrate as the incumbent. +//! +//! Frozen gate: `docs/plans/bet4-ivf-pruning/PRE-REGISTRATION.md`. + +pub mod data; +pub mod kernel; +pub mod oracle; +pub mod pca; + +pub use kernel::BnBIvf; diff --git a/crates/ruvector-bet4-ivf-bench/src/oracle.rs b/crates/ruvector-bet4-ivf-bench/src/oracle.rs new file mode 100644 index 0000000000..5ddef5ee80 --- /dev/null +++ b/crates/ruvector-bet4-ivf-bench/src/oracle.rs @@ -0,0 +1,39 @@ +//! Brute-force exact kNN ground truth + recall, and the shared L2 helper. +//! +//! 
The triangle-inequality lower bound the kernel relies on holds for the **metric** L2, not +//! its square — so radii, centroid distances, and member distances all use true L2 (`sqrt`). +//! Keeping one `l2` here guarantees the bound and the ranking use an identical metric. + +/// Euclidean (L2) distance between two equal-length vectors. +#[inline] +pub fn l2(a: &[f32], b: &[f32]) -> f32 { + a.iter() + .zip(b) + .map(|(x, y)| { + let d = x - y; + d * d + }) + .sum::<f32>() + .sqrt() +} + +/// Exact top-`k` neighbour ids of `q` over `corpus` under L2 (ascending distance). +/// +/// `q` may itself be a corpus point; self (distance 0) is **not** excluded — it lands in both +/// the oracle set and any contender's result, so it cancels and does not bias recall. +pub fn brute_force_topk(corpus: &[Vec<f32>], q: &[f32], k: usize) -> Vec<usize> { + let mut scored: Vec<(f32, usize)> = corpus + .iter() + .enumerate() + .map(|(i, v)| (l2(q, v), i)) + .collect(); + scored.sort_by(|a, b| a.0.total_cmp(&b.0)); + scored.into_iter().take(k).map(|(_, i)| i).collect() +} + +/// recall@k = |truth_k ∩ got_k| / k. Tolerant of tie-reshuffling (set intersection, not order). +pub fn recall_at_k(truth: &[usize], got: &[usize], k: usize) -> f64 { + let t: std::collections::HashSet<usize> = truth.iter().take(k).copied().collect(); + let hits = got.iter().take(k).filter(|g| t.contains(g)).count(); + hits as f64 / k.max(1) as f64 +} diff --git a/crates/ruvector-bet4-ivf-bench/src/pca.rs b/crates/ruvector-bet4-ivf-bench/src/pca.rs new file mode 100644 index 0000000000..c6358ffd97 --- /dev/null +++ b/crates/ruvector-bet4-ivf-bench/src/pca.rs @@ -0,0 +1,73 @@ +//! Minimal top-`m` PCA via power iteration + deflation — for BET 4's **low-dimensional control**. +//! +//! Projecting the real arxiv features onto their top principal components gives the *same data* +//! at low intrinsic dimensionality, where the triangle-inequality cluster bound should be tight +//! 
and the B&B kernel is expected to WIN — proving the kernel/harness are sound and isolating +//! high-dimensional distance concentration as the cause of any 128-d NO-GO. No linalg dependency. + +/// Project `data` (n × dim) onto its top `m` principal components, returning n × m coordinates. +/// Data is mean-centered first; components found by power iteration with deflation (`iters` steps +/// each). f64 accumulation for numerical stability. +pub fn project_topm(data: &[Vec<f32>], m: usize, iters: usize) -> Vec<Vec<f32>> { + let n = data.len(); + if n == 0 { + return Vec::new(); + } + let dim = data[0].len(); + + let mut mean = vec![0.0f64; dim]; + for v in data { + for (d, &x) in v.iter().enumerate() { + mean[d] += x as f64; + } + } + for x in &mut mean { + *x /= n as f64; + } + let centered: Vec<Vec<f64>> = data + .iter() + .map(|v| (0..dim).map(|d| v[d] as f64 - mean[d]).collect()) + .collect(); + + let mut comps: Vec<Vec<f64>> = Vec::with_capacity(m.min(dim)); + for c in 0..m.min(dim) { + let mut v = vec![0.0f64; dim]; + v[c % dim] = 1.0; + for _ in 0..iters { + // u = Σ_i (x_i · v) x_i — covariance-times-v without forming the covariance matrix. + let mut u = vec![0.0f64; dim]; + for x in &centered { + let dot: f64 = x.iter().zip(&v).map(|(a, b)| a * b).sum(); + for (d, &xd) in x.iter().enumerate() { + u[d] += dot * xd; + } + } + // Deflate against already-found components (Gram–Schmidt). 
+ for prev in &comps { + let proj: f64 = u.iter().zip(prev).map(|(a, b)| a * b).sum(); + for (d, &pd) in prev.iter().enumerate() { + u[d] -= proj * pd; + } + } + let norm = u.iter().map(|x| x * x).sum::<f64>().sqrt(); + if norm < 1e-12 { + break; + } + for x in &mut u { + *x /= norm; + } + v = u; + } + comps.push(v); + } + + centered + .iter() + .map(|x| { + comps + .iter() + .map(|comp| x.iter().zip(comp).map(|(a, b)| a * b).sum::<f64>() as f32) + .collect() + }) + .collect() +} diff --git a/crates/ruvector-bet4-ivf-bench/tests/oracle_gate.rs b/crates/ruvector-bet4-ivf-bench/tests/oracle_gate.rs new file mode 100644 index 0000000000..675dd0beb8 --- /dev/null +++ b/crates/ruvector-bet4-ivf-bench/tests/oracle_gate.rs @@ -0,0 +1,102 @@ +//! M0 gate: full-budget `BnBIvf` must be **exact** — its top-10 must match the brute-force +//! oracle (recall ≈ 1.0) on a real arxiv slice. This certifies the branch-and-bound invariant +//! (ascending-LB order + `break` when `LB ≥ τ`) on real data before any matched-recall claim. + +use ruvector_bet4_ivf_bench::data::load_feat_csv; +use ruvector_bet4_ivf_bench::kernel::BnBIvf; +use ruvector_bet4_ivf_bench::oracle::{brute_force_topk, recall_at_k}; +use ruvector_rairs::{AnnIndex, IvfFlat}; + +/// Repo-root-relative path to the gitignored arxiv feature slice. 
+const DATA: &str = "../../target/m1-data/node-feat-2000.csv"; + +#[test] +fn bnb_full_budget_is_exact() { + let corpus = match load_feat_csv(DATA, 2000) { + Ok(c) if c.len() >= 500 => c, + _ => { + eprintln!("skipping bnb_full_budget_is_exact: {DATA} not available"); + return; + } + }; + let k = 10; + let idx = BnBIvf::build(&corpus, 64, 25, 42); + let nq = 100; + let mut acc = 0.0; + for q in 0..nq { + let truth = brute_force_topk(&corpus, &corpus[q], k); + let (res, _evals, _probed) = idx.search(&corpus[q], k, None); // None = full budget = exact + let got: Vec<usize> = res.iter().map(|r| r.id).collect(); + acc += recall_at_k(&truth, &got, k); + } + let recall = acc / nq as f64; + assert!( + recall >= 0.999, + "full-budget B&B must be exact (B&B invariant broken): recall@10={recall:.4}" + ); +} + +#[test] +fn capped_probe_reduces_member_evals() { + let corpus = match load_feat_csv(DATA, 2000) { + Ok(c) if c.len() >= 500 => c, + _ => { + eprintln!("skipping capped_probe_reduces_member_evals: {DATA} not available"); + return; + } + }; + let idx = BnBIvf::build(&corpus, 64, 25, 42); + let (_r_full, evals_full, _p) = idx.search(&corpus[0], 10, None); + let (_r_cap, evals_cap, probed_cap) = idx.search(&corpus[0], 10, Some(4)); + assert!(probed_cap <= 4, "cap must bound clusters probed"); + assert!( + evals_cap <= evals_full, + "capped probe should not cost more member-evals than full budget" + ); +} + +#[test] +fn instrumented_nprobe_matches_rairs() { + // The cost-measured incumbent (BnBIvf::search_nprobe) must be algorithmically identical to the + // real ruvector-rairs::IvfFlat at the same (nclusters, max_iter, seed, nprobe) — same k-means + // substrate => same centroids/lists => same results. This legitimises measuring the incumbent's + // member-evals on the shared index rather than driving rairs separately. 
+ let corpus = match load_feat_csv(DATA, 2000) { + Ok(c) if c.len() >= 500 => c, + _ => { + eprintln!("skipping instrumented_nprobe_matches_rairs: {DATA} not available"); + return; + } + }; + let (dim, k, nclusters, max_iter, seed, nprobe) = (corpus[0].len(), 10, 64, 25, 42u64, 8); + + let mine = BnBIvf::build(&corpus, nclusters, max_iter, seed); + let mut rairs = IvfFlat::new(dim, nclusters, max_iter, seed); + rairs.train(&corpus).unwrap(); + rairs.add(&corpus).unwrap(); + + let nq = 100; + let (mut r_mine, mut r_rairs) = (0.0, 0.0); + for q in 0..nq { + let truth = brute_force_topk(&corpus, &corpus[q], k); + let got_mine: Vec<usize> = mine + .search_nprobe(&corpus[q], k, nprobe) + .0 + .iter() + .map(|r| r.id) + .collect(); + let got_rairs: Vec<usize> = rairs + .search(&corpus[q], k, nprobe) + .unwrap() + .iter() + .map(|r| r.id) + .collect(); + r_mine += recall_at_k(&truth, &got_mine, k); + r_rairs += recall_at_k(&truth, &got_rairs, k); + } + let (r_mine, r_rairs) = (r_mine / nq as f64, r_rairs / nq as f64); + assert!( + (r_mine - r_rairs).abs() < 0.01, + "instrumented incumbent must match rairs IvfFlat: mine={r_mine:.4} rairs={r_rairs:.4}" + ); +} diff --git a/docs/adr/ADR-205-region-pruned-ivf-vs-plain-ivf-nprobe.md b/docs/adr/ADR-205-region-pruned-ivf-vs-plain-ivf-nprobe.md new file mode 100644 index 0000000000..f55f422651 --- /dev/null +++ b/docs/adr/ADR-205-region-pruned-ivf-vs-plain-ivf-nprobe.md @@ -0,0 +1,146 @@ +--- +adr: 205 +title: "Triangle-Inequality Cluster Pruning vs Tuned Plain IVF nprobe — Structural NO-GO" +status: proposed +date: 2026-06-05 +authors: [ofershaal, claude-flow] +related: [ADR-193, ADR-199, ADR-201] +tags: [ruvector, retrieval, ann, ivf, rairs, pruning, branch-and-bound, no-go] +--- + +# ADR-205 — Triangle-Inequality Cluster Pruning vs Tuned Plain IVF `nprobe` (Structural NO-GO) + +## Status + +**Proposed — NO-GO (robust, structural), 2026-06-05.** Closes the BET 4 caveat left open by +ADR-201: the region-pruning IVF kernel (`RegionPruneIvf`) 
was built and validated *exact* there but +only ever run as BET 2's mechanism **against ACORN** — never head-to-head against its natural +incumbent, **plain IVF `nprobe`**, on unfiltered ANN. This is that head-to-head. The gate was +**pre-registered and frozen before any run** (`docs/plans/bet4-ivf-pruning/PRE-REGISTRATION.md`). + +**Lower-bound branch-and-bound IVF probing provides essentially zero benefit over a tuned plain +`nprobe` — a flat 1.00× member-eval ratio in every cell, at both n=20k and n=50k, in both 128-d and +a PCA-8 low-dim control.** The cause is **structural, not dimensional**: the triangle-inequality +cluster bound can only prune *far* clusters, which a tuned `nprobe` already never visits — so the +bound is **redundant** with `nprobe`'s centroid-distance cutoff. High dimensionality only makes the +faithful BET-2 kernel (which probes in *LB order*) strictly **worse** (0.18–0.25×). + +## Context + +`ruvector-rairs::IvfFlat` (ADR-193) is plain IVF: k-means centroids + inverted lists; +`search(q, k, nprobe)` scans all members of the `nprobe` nearest-centroid lists. BET 4 asked whether +adding a triangle-inequality lower bound — `LB(q,c) = max(0, ‖q−μ_c‖ − r_c)`, `r_c` the cluster +radius — and probing with branch-and-bound (skip/stop on clusters that provably cannot hold a +top-k point) beats tuned `nprobe` at matched recall@10, on real 128-d arxiv embeddings. + +The kernel was rebuilt self-contained (`crates/ruvector-bet4-ivf-bench`), off clean `main`, over the +same `ruvector-rairs` k-means substrate as the incumbent (BET 2's kernel lives only on the #536 +branch). Two correctness gates passed before any claim: full-budget B&B is **exact** (recall ≥ 0.999 +vs brute force), and the instrumented incumbent **matches `IvfFlat`** within 0.01 recall at matched +params (so its measured cost is the real incumbent's). + +Three contenders share one index per `nclusters` (only the probe loop differs): +- **plain `nprobe`** — the incumbent. 
+- **B&B LB-order** — the faithful BET-2 `RegionPruneIvf`: probe in ascending `LB`, global `break` + when `LB ≥ τ` (exact at full budget). +- **B&B steelman** — centroid-distance order (the effective `nprobe` ordering, so τ tightens fast) + + per-cluster **LB-skip** (correctness-safe in any order). The *strongest* cluster-level B&B: if + it cannot beat `nprobe`, the bound does not pay. + +## Decision / Finding + +**NO-GO.** Cost at matched recall@10 = 0.95, 200 queries; member distance-evals per query +(steelman is the strongest contender, so it sets the verdict): + +**n = 50,000, 128-d (real arxiv features):** + +| nclusters | exact-prune | plain `nprobe` | B&B LB-order | **B&B steelman** | steelman ratio | +|---|---|---|---|---|---| +| 64 | 0.0% | 11,102 ev | 49,182 (recall 0.99) | **11,102** | **1.00×** | +| 256 | 4.7% | 7,890 ev | 49,979 (recall 1.00) | **7,890** | **1.00×** | +| 1024 | 13.1% | 5,682 ev | 45,373 (recall 1.00) | **5,682** | **1.00×** | + +**n = 50,000, PCA-8 (low-dim control — bound is tight here):** + +| nclusters | exact-prune | plain `nprobe` | **B&B steelman** | steelman ratio | +|---|---|---|---|---| +| 64 | 8.0% | 4,393 ev | **4,393** | **1.00×** | +| 256 | 45.1% | 1,835 ev | **1,835** | **1.00×** | +| 1024 | 82.5% | 731 ev | **731** | **1.00×** | + +n=20k reproduces identically (steelman 1.00× in all six cells). Wall-clock tracks the eval ratio +(0.94–1.02×) — no reversal, but no win either. + +**Mechanism (structural, the key result).** The true top-k neighbours live in the *nearest* +clusters; any method must scan those members to find them. The LB bound only lets B&B *skip far +clusters* — but a tuned `nprobe` already does not visit them. So at matched recall the steelman +scans **exactly** the members `nprobe` scans (the near clusters all have `LB < τ`, so nothing is +skipped inside the operating budget) → 1.00×, **in every dimension**. 
The win is not "hard"; it is +**structurally impossible** against a tuned incumbent, because the bound and `nprobe`'s +centroid-distance cutoff exploit the *same* locality. + +**Why the LB-order kernel is strictly worse (0.18–0.25×).** Ordering clusters by `LB = max(0, d − +r_c)` pushes any *large-radius* cluster toward `LB ≈ 0` regardless of how far its centroid is, so +B&B probes far, low-yield clusters early and needs ~all clusters to reach 0.95. LB-order is correct +for *exact* early termination but a poor *priority* for approximate probing — centroid distance is +better. High-dimensional concentration (large radii) makes this pathology severe. + +## The pre-registered low-dim control — an honest deviation + +The frozen pre-registration expected the **PCA-8 control to show B&B *winning*** ("tight bound ⇒ +B&B beats tuned `nprobe`; if it does not win even at 8-d, the implementation is suspect"). **It did +not** — the steelman is 1.00× at PCA-8 too. That expectation was built on a **false premise**: a +tight bound implies beating *full exact scan*, **not** beating *tuned `nprobe`*. The control still +did its real job two ways, so the 128-d NO-GO is **interpretable, not voided**: + +1. **The kernel is sound.** The exact-regime pruning fraction scales correctly and strongly with + dimension — 0–13% at 128-d vs 8–82.5% at PCA-8 (n=50k). The bound *does* prune hard when it can; + the harness measures it correctly. The implementation is not suspect. +2. **It replaced the predicted mechanism with a better one.** The control is what revealed the kill + is *structural redundancy* (dimension-independent), not *dimensional looseness*. The bound prunes + up to 82.5% of clusters vs full-scan at PCA-8 (n=50k, nclusters=1024) yet still ties `nprobe`, because `nprobe`'s tuning already + captures that same pruning. + +Recording the deviation — the control disproved my predicted sign and taught the real finding — is +the point, per the prove-not-hype protocol (cf. ADR-203's three documented deviations). 
+ +## Consequences + +**Positive (a clean, general kill).** +- **Companion to ADR-199.** Classical exact-pruning structures do not pay on embedding retrieval: + graph separators/contraction there (high treewidth), triangle-inequality cluster bounds here + (redundant with `nprobe`). The kills keep sharpening *where* these ideas work — and IVF `nprobe` + is simply already near-optimal at exploiting cluster locality. +- **No code to ship, and that is the right outcome.** `ruvector-rairs::IvfFlat` needs no B&B add-on; + the result protects it from a complexity-adding non-improvement. + +**Boundaries / honest caveats.** +- **Scope: cluster-level bounds vs tuned `nprobe`, recall@10 ≈ 0.95.** This does **not** speak to + finer techniques — IVFADC / product-quantized asymmetric distance, per-member bounds, or learned + routing — which prune *within* lists by a different mechanism and are outside the frozen claim. +- **The structural argument predicts the same sign at other recall targets** (neighbours still live + in the near clusters at R=0.99), but only R=0.95 was measured. +- **`nprobe` is the right incumbent precisely because it is already tuned.** Against an *untuned* + full-exact-scan baseline the bound wins (that is the exact-prune fraction) — but that baseline is + not what anyone ships. + +## Scoreboard + +**2 WINS** (ADR-200/202 reuse+periodic; ADR-204 incremental high-recall tier) / +**4 KILLS** (ADR-199 CCH-on-embeddings; ADR-201 filtered-ANN vs ACORN; ADR-203 KG-treewidth; +ADR-205 IVF cluster-pruning vs `nprobe`). + +## Next steps + +1. If IVF acceleration is ever revisited, the open lever is **within-list** pruning + (PQ/IVFADC asymmetric distance), a different mechanism than the cluster-level bound killed here. +2. None for this kernel — the structural redundancy is dimension-independent and reproduced at two + scales; further `n`/recall sweeps would only reconfirm. 
+ +## Alternatives considered + +- **B&B in LB order** (the faithful BET-2 kernel) — measured; strictly worse than `nprobe` + (0.18–0.25×) because LB is a poor approximate priority. +- **B&B steelman** (centroid order + LB-skip) — the strongest cluster-level variant; ties `nprobe` + (1.00×). Retained as the verdict-setting contender. +- **Within-list / PQ pruning** — not built; a different mechanism, noted as the only open lever. diff --git a/docs/plans/bet4-ivf-pruning/PRE-REGISTRATION.md b/docs/plans/bet4-ivf-pruning/PRE-REGISTRATION.md new file mode 100644 index 0000000000..706a7ad4ee --- /dev/null +++ b/docs/plans/bet4-ivf-pruning/PRE-REGISTRATION.md @@ -0,0 +1,136 @@ +# BET 4 — Pre-Registration (FROZEN): LB-ordered branch-and-bound IVF probing vs tuned plain `IvfFlat` + +**Status: FROZEN (2026-06-05, user-confirmed).** No gate, threshold, metric, dataset, or +control below may change after this commit. Deviations are limited to the explicitly +pre-authorised list at the end; any other change voids the run. + +Thread: SepRAG (ruvnet/RuVector issue #534). This closes the BET 4 caveat left open by ADR-201 +(#536): the region-pruning IVF kernel was built and validated *exact* there, but only ever run as +BET 2's mechanism **against ACORN** — never head-to-head against its own natural incumbent, **plain +IVF `nprobe` probing**. This is that head-to-head, on **unfiltered** ANN (no predicate — the +filtered question is BET 2, resolved NO-GO). + +Independent of #535/#537/#539: this branch (`feat/seprag-bet4-ivf-pruning`) is cut from **clean +main**. The incumbent (`ruvector-rairs::IvfFlat`) is on main; the B&B kernel (which lives only on +the BET 2 branch) is **rebuilt self-contained** here, so the result is valid regardless of any +other PR's fate. 
+ +## Claim (one claim, one number) + +> On unfiltered ANN over real **128-d** arxiv embeddings, **lower-bound-ordered branch-and-bound +> IVF probing** scans **≥ 2× fewer member distance-evals** than a **tuned plain `IvfFlat` +> `nprobe`**, at **matched recall@10**, **and wins on wall-clock**. + +## Incumbent (tuned, in-repo — no straw man) + +`ruvector-rairs::IvfFlat` (`crates/ruvector-rairs/src/ivf.rs`): k-means centroids + inverted lists; +`search(query, k, nprobe)` scans **all** members of the `nprobe` nearest-centroid lists, then +finalises top-k. Tuned = sweep `nclusters ∈ {64, 256, 1024}` × `nprobe ∈ [1, nclusters]` to its +best (recall, cost) frontier. **Both contenders share the same k-means centroids and seed** — only +the *probing strategy* differs, so the comparison isolates the strategy, not clustering luck. + +## Contender (the bet — rebuilt standalone) + +`BnBIvf` over the same centroids/lists: +- Precompute per-cluster radius `r_c = max_{v ∈ list_c} ‖v − centroid_c‖`. +- For a query `q`: compute `‖q − centroid_c‖` for all `c` (routing cost, charged); lower bound + `LB(q,c) = max(0, ‖q − centroid_c‖ − r_c)`. +- Probe clusters in **ascending `LB`** order, maintaining a running k-th-best distance `τ`; scan a + cluster's members (each a charged distance-eval), update `τ`; **break when `LB(c) ≥ τ`** (no + unscanned cluster can contain a top-k point → provably done). +- **Exact** at full budget (recall → 1.0). A `max_probe` cap (probe at most that many clusters) is + the approx knob used to hit a sub-1.0 recall target for the matched-recall comparison — the + analogue of `nprobe`. + +## Data + +`target/m1-data/node-feat-100k.csv` — ogbn-arxiv 128-d node features (public, aligned, the same +corpus used by ADR-201/202/204). N-sweep at **20,000 and 100,000**. Queries: 200 held-out points. +Ground truth: brute-force exact L2 kNN@10 recomputed on the corpus. 
+ +## Metrics + +- **Primary: member distance-evals at matched recall@10.** The count of query↔member L2 + evaluations (the dominant cost). Charged identically for both contenders. *Both* are additionally + charged the `nclusters` query↔centroid routing evals (equal for both); B&B's radius + bookkeeping is a build-time cost (reported separately, not hidden). +- **Secondary (honesty guard): wall-clock per query.** An eval win that **reverses on wall-clock** + is reported as **"inconclusive," never WIN** (ADR-201 precedent). +- **Reported regardless: exact-regime pruning fraction** — the mean % of clusters B&B skips at + recall → 1.0. The mechanistic explainer for whichever verdict lands. + +## Matched-recall protocol + +Pick recall target **R = 0.95**. Tune plain IVF `nprobe` (per `nclusters`) to the smallest value +reaching mean recall@10 ≥ R; record its member-evals. Cap `BnBIvf`'s `max_probe` to the smallest +value reaching ≥ R; record its member-evals. Compare. Repeat per `nclusters ∈ {64, 256, 1024}` and +per N ∈ {20k, 100k}. (Also report the **exact** regime R → 1.0: B&B full-budget vs `nprobe = +nclusters` full scan.) + +## Gate (FROZEN) + +| Verdict | Condition | +|---|---| +| **WIN** | member-scan reduction **≥ 2×** vs tuned `nprobe` at matched recall@10 (R = 0.95) **AND** wall-clock win **AND** holds across all three `nclusters` settings (at ≥ one N). | +| **KILL (NO-GO)** | reduction **< 1.5×** at matched recall **OR** wall-clock reverses. Interpretation: the triangle-inequality bound is too loose in 128-d (distance concentration) to pay. | +| **Qualified** | between 1.5× and 2×, or wins at some `nclusters`/N but not all → report as a **narrow/conditional edge** with the regime named (not a clean WIN). | +| **Report always** | exact-regime pruning fraction; the full (recall, member-evals, wall-clock) frontier per cell. | + +## Controls (the teeth — both mandatory) + +1. 
**Exact-vs-exact probe** (R → 1.0): `BnBIvf` full-budget vs `IvfFlat` `nprobe = nclusters` + (full scan). Directly measures whether the LB bound prunes **at all** in 128-d. If ~0% of + clusters are pruned here, that *mechanistically* predicts the KILL — and would make any + matched-recall WIN suspect (must be reconciled). +2. **Low-dimensional control:** rerun the entire protocol on a **low-intrinsic-dim** input — + PCA-project the arxiv features to **8-d** (retain the top-8 principal components). The bound is + expected to be tight here, so `BnBIvf` **should WIN** the low-d control. This proves the kernel + and harness are *sound* and isolates **high-d concentration** as the cause of any 128-d NO-GO — + BET 4's analogue of BET 3's roadNet control and BET 1's stale-index control. If the kernel does + **not** win even at 8-d, the implementation is suspect and the 128-d result is uninterpretable. + +## Adversarial checks (pre-committed) + +- **No free routing:** B&B is charged the `nclusters` centroid evals every query; the win must + survive that charge (it is identical for plain IVF, so it cancels, but it is *counted*, not + ignored). +- **Wall-clock guard** (above): eval win must not reverse on wall-clock. +- **Shared index:** identical centroids/seed/lists for both contenders; the *only* difference is + the probe loop. No re-clustering between contenders. +- **Pruning-fraction reconciliation:** a matched-recall WIN with ~0% exact-regime pruning is + internally inconsistent and must be explained before being reported as a WIN. + +## Honest prior (stated before any run, per protocol) + +I lean **NO-GO at 128-d.** Under distance concentration the per-cluster radius `r_c` tends to be +large relative to inter-centroid gaps, so `LB = max(0, d − r_c) ≈ 0` for most clusters → little +pruning → proving exactness scans nearly everything, costing more than a tuned `nprobe` that +accepts < 100% recall. 
That would be a clean kill, the IVF-level companion to ADR-199 (Euclidean +embedding geometry defeats classical pruning structures — separators there, triangle-inequality +cluster bounds here). A WIN would be a genuine shippable `IvfFlat` upgrade. Either outcome is a +tidy, **consumer-independent** finding — the reason this is the chosen next bet. + +## Pre-authorised deviations (anything else voids the run) + +- Substitute PCA-to-8-d with a synthetic low-d clustered set **only if** PCA is impractical to + implement cleanly; the *role* (a tight-bound low-d control) is fixed. +- Reduce N from 100k to a smaller second scale if 100k brute-force truth is prohibitively slow, + **provided** at least two distinct scales are reported and the larger is ≥ 50k. +- Adjust query count upward (≥ 200) for noise control; never below 200. +- Add `nclusters` settings; never drop one of {64, 256, 1024}. + +## Plan + +- **M0** — self-contained crate `crates/ruvector-bet4-ivf-bench` (deps: `ruvector-rairs`, `rand`): + data loader, `BnBIvf` kernel, brute-force oracle; **gate test** `BnBIvf` full-budget == oracle + (recall 1.0). clippy clean. +- **M1** — instrument member-eval + wall-clock counting on both contenders (shared index). +- **M2** — matched-recall sweep harness (`examples/ivf_pruning_sweep.rs`): the `nclusters` × N grid, + exact-regime probe, frontier print. +- **M3** — low-d (PCA-8) control; adversarial reconciliation; verdict against this gate. +- **M4** — ADR-205 (WIN, NO-GO, or qualified — honest, ADR-199/201 precedent); one PR at M4 linked + to #534; #534 scoreboard comment. + +--- + +**Frozen.** Build starts at M0 against this document; the gate is not revisited.