From 23615ae998e038e595b5be771c23b1cee83379f4 Mon Sep 17 00:00:00 2001
From: Ofer Shaal <oshaal@phase2technology.com>
Date: Thu, 4 Jun 2026 14:44:27 -0400
Subject: [PATCH 1/8] =?UTF-8?q?feat(filtered-bench):=20pre-register=20BET?=
 =?UTF-8?q?=202=E2=8A=974=20+=20M0=20substrate/oracle=20gate=20(issue=20#5?=
 =?UTF-8?q?34)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Region-pruned filtered ANN vs tuned ACORN. New self-contained crate
ruvector-filtered-bench, depending only on ruvector-acorn (incumbent + oracle)
and ruvector-rairs (IVF) — independent of ruvector-seprag/PR #535.

Pre-registration (docs/plans/bet2-filtered-ann/PRE-REGISTRATION.md) freezes a
selectivity-shaped win/kill gate before any contender runs: at correlation
rho>=0.7, contender A within 2% filtered-recall@10 of tuned ACORN at >=5x fewer
distance-evals/query at sel<=1% (>=2x at sel=5%), monotonic in selectivity;
graceful-degradation and wall-clock honesty guards; rho=0 recall-collapse kill
control.

M0 (plumbing, pre-freeze-safe):
- data.rs: aligned ogbn-arxiv feat/label/year loader.
- predicate.rs: rho-correlation knob holding selectivity exactly constant across
  rho, plus natural label/year predicate families.
- tests/oracle_gate.rs: exact_filtered_knn cross-checked against an independent
  brute force on a real arxiv slice (sel x rho grid). 5 tests green, clippy clean.
---
 Cargo.lock                                    |   9 +
 Cargo.toml                                    |   2 +
 crates/ruvector-filtered-bench/Cargo.toml     |  15 ++
 crates/ruvector-filtered-bench/src/data.rs    | 114 ++++++++++
 crates/ruvector-filtered-bench/src/lib.rs     |  26 +++
 .../ruvector-filtered-bench/src/predicate.rs  | 197 ++++++++++++++++++
 .../tests/oracle_gate.rs                      |  85 ++++++++
 .../bet2-filtered-ann/PRE-REGISTRATION.md     | 120 +++++++++++
 8 files changed, 568 insertions(+)
 create mode 100644 crates/ruvector-filtered-bench/Cargo.toml
 create mode 100644 crates/ruvector-filtered-bench/src/data.rs
 create mode 100644 crates/ruvector-filtered-bench/src/lib.rs
 create mode 100644 crates/ruvector-filtered-bench/src/predicate.rs
 create mode 100644 crates/ruvector-filtered-bench/tests/oracle_gate.rs
 create mode 100644 docs/plans/bet2-filtered-ann/PRE-REGISTRATION.md

diff --git a/Cargo.lock b/Cargo.lock
index 078e1b29fa..234c7f4b4b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -9309,6 +9309,15 @@ dependencies = [
  "uuid",
 ]
 
+[[package]]
+name = "ruvector-filtered-bench"
+version = "0.1.0"
+dependencies = [
+ "rand 0.8.5",
+ "ruvector-acorn",
+ "ruvector-rairs",
+]
+
 [[package]]
 name = "ruvector-fpga-transformer"
 version = "0.1.0"
diff --git a/Cargo.toml b/Cargo.toml
index 38128585a2..7e1fe37464 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -233,6 +233,8 @@ members = [
     "crates/ruvllm_retrieval_diffusion",
     # RAIRS IVF: Redundant Assignment + Amplified Inverse Residual (ADR-193)
     "crates/ruvector-rairs",
+    # BET 2 ⊗ BET 4: region-pruned filtered ANN vs ACORN (SepRAG issue #534, off main)
+    "crates/ruvector-filtered-bench",
 ]
 resolver = "2"
 
diff --git a/crates/ruvector-filtered-bench/Cargo.toml b/crates/ruvector-filtered-bench/Cargo.toml
new file mode 100644
index 0000000000..00f5260f41
--- /dev/null
+++ b/crates/ruvector-filtered-bench/Cargo.toml
@@ -0,0 +1,15 @@
+[package]
+name        = "ruvector-filtered-bench"
+version     = "0.1.0"
+edition     = "2021"
+description = "BET 2 ⊗ BET 4: region-pruned filtered ANN (IVF cluster-skip) vs tuned ACORN — pre-registered head-to-head on ogbn-arxiv. Self-contained; independent of ruvector-seprag/PR #535."
+authors     = ["ofershaal", "claude-flow"]
+license     = "MIT OR Apache-2.0"
+repository  = "https://github.com/ruvnet/ruvector"
+keywords    = ["ann", "filtered-search", "ivf", "acorn", "benchmark"]
+categories  = ["algorithms", "data-structures"]
+
+[dependencies]
+ruvector-acorn = { path = "../ruvector-acorn" }
+ruvector-rairs = { path = "../ruvector-rairs" }
+rand           = "0.8"
diff --git a/crates/ruvector-filtered-bench/src/data.rs b/crates/ruvector-filtered-bench/src/data.rs
new file mode 100644
index 0000000000..c280d512da
--- /dev/null
+++ b/crates/ruvector-filtered-bench/src/data.rs
@@ -0,0 +1,128 @@
+//! M0 — load aligned ogbn-arxiv features / labels / years.
+//!
+//! Row `i` of every file is node `i` (ogbn-arxiv node-index order), so the three
+//! arrays align by position. Features are pre-extracted to plain CSV (128 comma-
+//! separated f32 per line); labels/years are the gunzipped single-column files.
+//!
+//! One-time extraction (already done in `target/m1-data/`):
+//! ```text
+//! gunzip -kc target/m1-data/arxiv/raw/node-label.csv.gz > target/m1-data/node-label.csv
+//! gunzip -kc target/m1-data/arxiv/raw/node_year.csv.gz  > target/m1-data/node-year.csv
+//! # features: target/m1-data/node-feat-100k.csv (first 100k rows already extracted)
+//! ```
+
+use std::path::Path;
+
+/// Expands to `<workspace root>/target/m1-data/<file>` as a `&'static str`.
+///
+/// `cargo test` runs tests with the *package* directory as the working directory, so a
+/// bare relative `target/m1-data/...` path does not resolve there and the data-gated tests
+/// skip silently. Anchoring on `CARGO_MANIFEST_DIR` (this crate sits two levels below the
+/// workspace root) makes the paths independent of the working directory.
+macro_rules! data_path {
+    ($file:literal) => {
+        concat!(env!("CARGO_MANIFEST_DIR"), "/../../target/m1-data/", $file)
+    };
+}
+
+/// Default feature file: first 100k rows, 128 comma-separated `f32` per line.
+pub const FEAT_100K: &str = data_path!("node-feat-100k.csv");
+/// Default label file: one integer class id per line.
+pub const LABELS: &str = data_path!("node-label.csv");
+/// Default year file: one integer year per line.
+pub const YEARS: &str = data_path!("node-year.csv");
+
+/// An aligned ogbn-arxiv slice: `feats[i]`, `labels[i]`, `years[i]` all describe node `i`.
+#[derive(Clone)]
+pub struct Dataset {
+    pub feats: Vec<Vec<f32>>,
+    pub labels: Vec<u32>,
+    pub years: Vec<i32>,
+    pub dim: usize,
+}
+
+impl Dataset {
+    pub fn len(&self) -> usize {
+        self.feats.len()
+    }
+    pub fn is_empty(&self) -> bool {
+        self.feats.is_empty()
+    }
+
+    /// Load `max_n` aligned rows (capped by the shortest file). Panics on malformed or empty
+    /// input, ragged rows, or an out-of-range label/year: this is a harness, so it fails loud.
+    pub fn load(
+        feat_path: impl AsRef<Path>,
+        label_path: impl AsRef<Path>,
+        year_path: impl AsRef<Path>,
+        max_n: usize,
+    ) -> Dataset {
+        let mut feats = read_feats(feat_path.as_ref(), max_n);
+        let labels = read_ints(label_path.as_ref(), max_n);
+        let years = read_ints(year_path.as_ref(), max_n);
+
+        // Truncate all three to the common minimum so alignment is exact.
+        let n = feats.len().min(labels.len()).min(years.len());
+        assert!(n > 0, "empty dataset after load");
+        feats.truncate(n);
+        let dim = feats[0].len();
+        let uniform = feats.iter().all(|v| v.len() == dim);
+        assert!(uniform, "ragged feature rows — dim must be constant");
+
+        // Checked narrowing: `as` would wrap a negative label into a huge class id (and
+        // truncate an oversized year) silently; corrupt input must fail loud instead.
+        let label = |v: i64| u32::try_from(v).unwrap_or_else(|_| panic!("label {v} not a u32"));
+        let year = |v: i64| i32::try_from(v).unwrap_or_else(|_| panic!("year {v} not an i32"));
+        let labels = labels.into_iter().take(n).map(label).collect();
+        let years = years.into_iter().take(n).map(year).collect();
+        Dataset { feats, labels, years, dim }
+    }
+
+    /// Convenience: load the standard in-repo 100k arxiv slice.
+    pub fn load_arxiv(max_n: usize) -> Dataset {
+        Dataset::load(FEAT_100K, LABELS, YEARS, max_n)
+    }
+}
+
+fn read_feats(path: &Path, max_n: usize) -> Vec<Vec<f32>> {
+    let raw = std::fs::read_to_string(path)
+        .unwrap_or_else(|e| panic!("read features {}: {e}", path.display()));
+    raw.lines()
+        .take(max_n)
+        .map(|line| {
+            line.split(',')
+                .map(|f| f.trim().parse::<f32>().expect("parse feature f32"))
+                .collect::<Vec<f32>>()
+        })
+        .collect()
+}
+
+fn read_ints(path: &Path, max_n: usize) -> Vec<i64> {
+    let raw = std::fs::read_to_string(path)
+        .unwrap_or_else(|e| panic!("read ints {}: {e}", path.display()));
+    raw.lines()
+        .take(max_n)
+        .map(|line| line.trim().parse::<i64>().expect("parse int"))
+        .collect()
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn loads_aligned_slice() {
+        // Small slice keeps the test fast; skips cleanly if data isn't extracted.
+        if !Path::new(FEAT_100K).exists() {
+            eprintln!("skip: {FEAT_100K} not extracted");
+            return;
+        }
+        let ds = Dataset::load_arxiv(2000);
+        assert_eq!(ds.len(), 2000);
+        assert_eq!(ds.labels.len(), 2000);
+        assert_eq!(ds.years.len(), 2000);
+        assert_eq!(ds.dim, 128);
+        assert!(ds.labels.iter().all(|&l| l < 40), "arxiv has 40 subject labels");
+        assert!(ds.years.iter().all(|&y| (1900..=2025).contains(&y)));
+    }
+}
diff --git a/crates/ruvector-filtered-bench/src/lib.rs b/crates/ruvector-filtered-bench/src/lib.rs
new file mode 100644
index 0000000000..b3206e97aa
--- /dev/null
+++ b/crates/ruvector-filtered-bench/src/lib.rs
@@ -0,0 +1,26 @@
+//! BET 2 ⊗ BET 4 — Region-pruned filtered ANN vs tuned ACORN.
+//!
+//! Pre-registered head-to-head (see `docs/plans/bet2-filtered-ann/PRE-REGISTRATION.md`):
+//! does IVF **cluster-skip** pruning beat predicate-agnostic ACORN on *correlated*
+//! predicates at low selectivity, by ≥5× distance-evals/query at equal (±2%) recall?
+//!
+//! This crate is **self-contained**: it depends only on `ruvector-acorn` (the incumbent and
+//! the `exact_filtered_knn` oracle) and `ruvector-rairs` (the IVF substrate). It has no
+//! dependency on `ruvector-seprag` (PR #535), so it ships as an independent PR.
+//!
+//! ## Module map (filled across milestones)
+//! - `data` (M0)      — load ogbn-arxiv features / labels / years.
+//! - `predicate` (M0) — predicate families + ρ-correlation knob + selectivity targeting.
+//! - `prune` (M2)     — contender A: region-pruned IVF filtered search + eval counters.
+
+pub mod data;
+pub mod predicate;
+
+// Re-export the substrate + incumbent + oracle so the head-to-head harness has one entry
+// point and the dependency graph is exercised at build time.
+pub use ruvector_acorn::{recall_at_k, AcornIndexGamma, FilteredIndex, FlatFilteredIndex};
+pub use ruvector_rairs::ivf::IvfFlat;
+
+/// Exact filtered k-NN oracle (brute force) — ground truth for every contender.
+/// Thin re-export of the in-repo incumbent's oracle to keep one source of truth.
+pub use ruvector_acorn::graph::exact_filtered_knn;
diff --git a/crates/ruvector-filtered-bench/src/predicate.rs b/crates/ruvector-filtered-bench/src/predicate.rs
new file mode 100644
index 0000000000..0abe48be58
--- /dev/null
+++ b/crates/ruvector-filtered-bench/src/predicate.rs
@@ -0,0 +1,197 @@
+//! M0 — predicate families, the ρ-correlation knob, and selectivity targeting.
+//!
+//! A [`Predicate`] is a boolean membership mask over node ids `[0, n)`. The harness
+//! passes it to ACORN / the oracle as `|id| pred.test(id)`.
+//!
+//! ## The ρ-knob (the controlled instrument)
+//!
+//! [`correlated`] builds a predicate of an *exact* target selectivity whose correlation
+//! with embedding geometry is tunable: ρ=1 is a tight, structurally-clustered set (built
+//! from subject-label classes, which occupy regions of the embedding space); ρ=0 is a
+//! random set of the same size (ACORN's home turf, the kill control); intermediate ρ
+//! replaces a fraction `1−ρ` of structured members with random non-members. Selectivity is
+//! held fixed across ρ so cost differences are attributable to correlation, not set size.
+
+use rand::seq::SliceRandom;
+use rand::Rng;
+
+/// A predicate over node ids, plus the construction parameters that produced it.
+#[derive(Clone)]
+pub struct Predicate {
+    mask: Vec<bool>,
+    /// Number of matching nodes (`mask` trues).
+    pub n_match: usize,
+    /// Requested selectivity (matches/n); see [`Predicate::selectivity`] for the realized value.
+    pub target_sel: f64,
+    /// Construction correlation knob in `[0,1]` (1 = structured, 0 = random). `NaN` for
+    /// natural-family predicates where ρ is not a construction parameter.
+    pub rho: f64,
+}
+
+impl Predicate {
+    #[inline]
+    pub fn test(&self, id: u32) -> bool {
+        self.mask[id as usize]
+    }
+
+    /// `Fn(u32) -> bool` view for ACORN / oracle APIs.
+    pub fn as_fn(&self) -> impl Fn(u32) -> bool + Copy + '_ {
+        move |id| self.mask[id as usize]
+    }
+
+    /// Realized selectivity = matches / n.
+    pub fn selectivity(&self) -> f64 {
+        self.n_match as f64 / self.mask.len() as f64
+    }
+
+    pub fn len(&self) -> usize {
+        self.mask.len()
+    }
+    pub fn is_empty(&self) -> bool {
+        self.mask.is_empty()
+    }
+
+    fn from_mask(mask: Vec<bool>, target_sel: f64, rho: f64) -> Predicate {
+        let n_match = mask.iter().filter(|&&b| b).count();
+        Predicate { mask, n_match, target_sel, rho }
+    }
+}
+
+/// Natural categorical predicate: nodes whose subject label equals `class`.
+pub fn from_label(labels: &[u32], class: u32) -> Predicate {
+    let mask = labels.iter().map(|&l| l == class).collect::<Vec<_>>();
+    let sel = mask.iter().filter(|&&b| b).count() as f64 / labels.len() as f64;
+    Predicate::from_mask(mask, sel, f64::NAN)
+}
+
+/// Natural ordinal predicate: nodes with `year >= y`.
+pub fn year_ge(years: &[i32], y: i32) -> Predicate {
+    let mask = years.iter().map(|&yr| yr >= y).collect::<Vec<_>>();
+    let sel = mask.iter().filter(|&&b| b).count() as f64 / years.len() as f64;
+    Predicate::from_mask(mask, sel, f64::NAN)
+}
+
+/// The controlled instrument: a predicate of exact selectivity `target_sel` with tunable
+/// geometric correlation `rho ∈ [0,1]`.
+///
+/// - `seed_class_rank` selects which size-ranked label class seeds the structured set
+///   (0 = largest); rotating it lets M3 average over several regions to remove
+///   region-specific bias.
+/// - The structured pool is the union of label classes (in size order from the seed),
+///   truncated to exactly `m = round(target_sel · n)` members. `keep = round(rho · m)` of
+///   those are retained; the remaining `m − keep` are random non-members, so |set| = m for
+///   every ρ. `m` is clamped to `[1, n]`; empty `labels` gives an empty predicate (no panic).
+pub fn correlated(
+    labels: &[u32],
+    target_sel: f64,
+    rho: f64,
+    seed_class_rank: usize,
+    rng: &mut impl Rng,
+) -> Predicate {
+    let n = labels.len();
+    // Lower bound is `n.min(1)`, not `1`: `clamp(1, 0)` would panic on empty `labels`.
+    let m = ((target_sel * n as f64).round() as usize).clamp(n.min(1), n);
+    let rho = rho.clamp(0.0, 1.0);
+
+    // Label classes sorted by descending size; rotate by seed_class_rank.
+    let n_classes = (labels.iter().copied().max().unwrap_or(0) as usize) + 1;
+    let mut counts = vec![0usize; n_classes];
+    for &l in labels {
+        counts[l as usize] += 1;
+    }
+    let mut class_order: Vec<u32> = (0..n_classes as u32).collect();
+    class_order.sort_by_key(|&c| std::cmp::Reverse(counts[c as usize]));
+    // `n_classes >= 1`, so `class_order` is never empty and the modulus is well-defined.
+    let rot = seed_class_rank % class_order.len();
+    class_order.rotate_left(rot);
+
+    // Accumulate node ids class-by-class until the pool reaches m, then truncate.
+    let mut structured: Vec<u32> = Vec::with_capacity(m);
+    'fill: for &c in &class_order {
+        for (id, &l) in labels.iter().enumerate() {
+            if l == c {
+                structured.push(id as u32);
+                if structured.len() >= m {
+                    break 'fill;
+                }
+            }
+        }
+    }
+
+    let keep = ((rho * m as f64).round() as usize).min(structured.len());
+    let mut mask = vec![false; n];
+    for &id in &structured[..keep] {
+        mask[id as usize] = true;
+    }
+
+    // Fill the rest with random non-members so realized selectivity == m/n exactly.
+    let need = m - keep;
+    if need > 0 {
+        let mut pool: Vec<u32> = (0..n as u32).filter(|&id| !mask[id as usize]).collect();
+        let (picked, _) = pool.partial_shuffle(rng, need);
+        for &id in picked.iter() {
+            mask[id as usize] = true;
+        }
+    }
+
+    Predicate::from_mask(mask, target_sel, rho)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use rand::rngs::StdRng;
+    use rand::SeedableRng;
+
+    fn synth_labels(n: usize, n_classes: u32) -> Vec<u32> {
+        // Deterministic block labels: class regions are contiguous id ranges (a proxy for
+        // geometric clustering, sufficient to test the ρ mechanism).
+        (0..n).map(|i| (i as u32 * n_classes / n as u32).min(n_classes - 1)).collect()
+    }
+
+    #[test]
+    fn selectivity_is_exact_across_rho() {
+        let labels = synth_labels(10_000, 8);
+        let mut rng = StdRng::seed_from_u64(1);
+        for &rho in &[0.0, 0.3, 0.7, 1.0] {
+            let p = correlated(&labels, 0.05, rho, 0, &mut rng);
+            assert_eq!(p.n_match, 500, "exact selectivity must hold for ρ={rho}");
+            assert!((p.selectivity() - 0.05).abs() < 1e-9);
+        }
+    }
+
+    #[test]
+    fn rho1_is_structured_rho0_is_spread() {
+        // ρ=1 concentrates in few classes; ρ=0 spreads across all. Use distinct-class
+        // count of the matched set as a cheap structure proxy.
+        let labels = synth_labels(10_000, 8);
+        let mut rng = StdRng::seed_from_u64(2);
+        let distinct = |p: &Predicate| {
+            let mut s = std::collections::HashSet::new();
+            for id in 0..labels.len() as u32 {
+                if p.test(id) {
+                    s.insert(labels[id as usize]);
+                }
+            }
+            s.len()
+        };
+        let p1 = correlated(&labels, 0.05, 1.0, 0, &mut rng);
+        let p0 = correlated(&labels, 0.05, 0.0, 0, &mut rng);
+        assert!(
+            distinct(&p1) < distinct(&p0),
+            "ρ=1 should span fewer classes ({}) than ρ=0 ({})",
+            distinct(&p1),
+            distinct(&p0)
+        );
+    }
+
+    #[test]
+    fn from_label_matches_count() {
+        let labels = vec![0u32, 1, 1, 2, 1];
+        let p = from_label(&labels, 1);
+        assert_eq!(p.n_match, 3);
+        // labels = [0,1,1,2,1] → ids 1,2,4 match; 0,3 do not.
+        assert!(p.test(1) && p.test(2) && p.test(4));
+        assert!(!p.test(0) && !p.test(3));
+    }
+}
diff --git a/crates/ruvector-filtered-bench/tests/oracle_gate.rs b/crates/ruvector-filtered-bench/tests/oracle_gate.rs
new file mode 100644
index 0000000000..89c6412a45
--- /dev/null
+++ b/crates/ruvector-filtered-bench/tests/oracle_gate.rs
@@ -0,0 +1,85 @@
+//! M0 gate — "trust the oracle."
+//!
+//! Every contender (A/B/C/D) is scored against `ruvector-acorn::exact_filtered_knn`. If that
+//! oracle is wrong, every downstream recall number is meaningless. This test cross-checks it
+//! against a **fully independent** brute-force filtered k-NN (separate distance code, separate
+//! sort) on a real ogbn-arxiv slice, exercising the whole data → predicate → oracle path.
+//!
+//! Skips cleanly when the arxiv data isn't extracted (CI without the dataset).
+
+use ruvector_filtered_bench::data::{Dataset, FEAT_100K};
+use ruvector_filtered_bench::exact_filtered_knn;
+use ruvector_filtered_bench::predicate;
+
+use rand::rngs::StdRng;
+use rand::{Rng, SeedableRng};
+use std::path::Path;
+
+/// Independent brute force: no shared code with the oracle. Plain scalar L2, stable sort by
+/// (distance, id) so ties (which don't occur on real float embeddings) are still deterministic.
+fn independent_filtered_knn(
+    feats: &[Vec<f32>],
+    labels_mask: &dyn Fn(u32) -> bool,
+    query: &[f32],
+    k: usize,
+) -> Vec<u32> {
+    let mut scored: Vec<(f64, u32)> = (0..feats.len() as u32)
+        .filter(|&id| labels_mask(id))
+        .map(|id| {
+            let d: f64 = feats[id as usize]
+                .iter()
+                .zip(query)
+                .map(|(a, b)| {
+                    let diff = (*a - *b) as f64;
+                    diff * diff
+                })
+                .sum();
+            (d, id)
+        })
+        .collect();
+    scored.sort_by(|a, b| a.0.total_cmp(&b.0).then(a.1.cmp(&b.1)));
+    scored.into_iter().take(k).map(|(_, id)| id).collect()
+}
+
+#[test]
+fn oracle_matches_independent_brute_force() {
+    if !Path::new(FEAT_100K).exists() {
+        eprintln!("skip: arxiv data not extracted ({FEAT_100K})");
+        return;
+    }
+    let ds = Dataset::load_arxiv(3000);
+    let k = 10;
+    let mut rng = StdRng::seed_from_u64(42);
+
+    // Sweep a few selectivities; each must keep #matches >= k (the M0 selectivity floor).
+    for &sel in &[0.02_f64, 0.05, 0.20] {
+        for &rho in &[0.0_f64, 1.0] {
+            let pred = predicate::correlated(&ds.labels, sel, rho, 0, &mut rng);
+            assert!(
+                pred.n_match >= k,
+                "selectivity floor violated: sel={sel} ρ={rho} → only {} matches < k={k}",
+                pred.n_match
+            );
+            let pf = pred.as_fn();
+
+            // 8 random queries drawn from the corpus.
+            for _ in 0..8 {
+                let qi = rng.gen_range(0..ds.len());
+                let q = &ds.feats[qi];
+
+                let oracle = exact_filtered_knn(&ds.feats, q, k, pf);
+                let truth = independent_filtered_knn(&ds.feats, &pf, q, k);
+
+                assert_eq!(
+                    oracle, truth,
+                    "oracle disagrees with independent brute force (sel={sel} ρ={rho} q={qi})"
+                );
+                // Every returned id must actually satisfy the predicate.
+                assert!(
+                    oracle.iter().all(|&id| pf(id)),
+                    "oracle returned a non-matching id (sel={sel} ρ={rho})"
+                );
+            }
+        }
+    }
+}
diff --git a/docs/plans/bet2-filtered-ann/PRE-REGISTRATION.md b/docs/plans/bet2-filtered-ann/PRE-REGISTRATION.md
new file mode 100644
index 0000000000..b0c0309b40
--- /dev/null
+++ b/docs/plans/bet2-filtered-ann/PRE-REGISTRATION.md
@@ -0,0 +1,120 @@
+# BET 2 ⊗ BET 4 — Region-Pruned Filtered ANN vs tuned ACORN
+
+**Status:** Pre-registered (gate frozen before any run) · **Date:** 2026-06-04 ·
+**Research line:** SepRAG (ruvnet/RuVector issue #534) · **Self-contained:** depends only on
+crates already on `main` (`ruvector-acorn`, `ruvector-rairs`) — **independent of PR #535.** ·
+**Builds on (by reference, not by compile):** ADR-200 (BET 1 WIN), ADR-193 (`ruvector-rairs`
+IVF), ADR-199 (CCH NO-GO → why IVF, not separators) ·
+**Outcome ADR:** ADR-201 (written from the result — WIN *or* NO-GO).
+
+> This document is the **pre-registration**, committed before the harness runs. A loss is an
+> acceptable, reportable outcome (cf. ADR-199). Editing the gate after seeing results voids
+> the bet. Plumbing (M0) may be built before freeze; contender runs (M1+) may not.
+
+## Prove-not-hype protocol (mandatory — all five)
+
+(1) **One claim, one number.** (2) **Beat the strongest in-repo incumbent, tuned.**
+(3) **Public data + ground truth.** (4) **Pre-register WIN *and* KILL.** (5) **Adversarial check.**
+
+## Thesis (one claim, one number)
+
+> For predicates whose membership **correlates with embedding-cluster structure** (ρ ≥ 0.7) at
+> **selectivity ≤ 1%**, IVF **region-pruned** filtered search reaches **filtered-recall@10
+> within 2%** of tuned ACORN at **≥ 5× fewer distance-evaluations per query** — and the cost
+> advantage **grows monotonically as selectivity falls** (the mechanism signature).
+
+Primary cost = **distance-evals/query** (hardware-independent, as ADR-200). Wall-clock is
+reported and acts as an honesty guard (below).
+
+## Why this scope is the honest one (central insight)
+
+ACORN (SIGMOD 2024, arXiv:2403.04871; `ruvector-acorn::AcornIndexGamma`) is
+**predicate-agnostic by design**: a denser γ·M graph + expand-all-neighbors traversal stay
+navigable *through* predicate-failing nodes, computing a distance for every expanded node,
+pass or fail. So ACORN's per-query distance count is **flat-to-rising as selectivity drops** —
+and ACORN **owns** the uncorrelated case. Attacking it there is a guaranteed loss.
+
+Region-pruning wins the opposite case: when the predicate correlates with cluster membership,
+whole clusters with zero matches are skipped, and a cheap O(1) predicate test gates the
+expensive 128-d distance — so A pays distance-evals only for `routing (≈√n centroids) +
+actual matches in probed clusters`, which **shrinks as selectivity drops**. That asymmetry is
+the entire bet, and it is the production-RAG metadata-filter case (`tenant_id`, `doc_type`,
+`language`, `year≥Y`, `category=X`).
+
+On embeddings the pruning kernel **cannot** live on graph separators (ADR-199: embedding
+graphs are high-treewidth → CCH contraction blew up). Its only viable, treewidth-immune
+substrate is the **IVF hierarchy** (`ruvector-rairs`) — i.e. BET 4. **BET 2 (benchmark +
+incumbent) and BET 4 (mechanism) are one experiment.**
+
+## Data & predicates (real, public — ogbn-arxiv)
+
+n ≈ 169,343, 128-d features (`target/m1-data/arxiv/raw/`, in hand). Oracle =
+`ruvector-acorn::exact_filtered_knn`.
+
+| Predicate | Correlation ρ | Source |
+|---|---|---|
+| Subject-area label = c (one of 40) | **high** | `node-label.csv.gz` |
+| Year ≥ Y / year ∈ [a,b] | **medium** | `node_year.csv.gz` |
+| Random Bernoulli(p), equal selectivity | **ρ = 0 (kill control)** | synthetic |
+
+**Correlation knob ρ:** interpolate a real label predicate toward a random one of equal
+selectivity by shuffling a fraction `1−ρ` of membership. Sweep ρ ∈ {0, 0.3, 0.5, 0.7, 1.0}.
+**Selectivity sweep:** {0.1, 0.5, 1, 5, 10, 30}% (sub-10% is where post-filter collapses).
+
+## Contenders
+
+| ID | Index | Role |
+|---|---|---|
+| **A** | IVF region-pruned filtered search (`rairs::IvfFlat` + per-cluster match-count pruning, predicate-gated distance) | **the bet** |
+| **B** | `AcornIndexGamma`, tuned (γ∈{2,3}, ef∈{64,128,200}; best cost at equal recall) | strong incumbent |
+| **D** | ACORN + predicate-aware entry points | adversarial "tune harder" (rule #5) |
+| **C** | flat / post-filter | floor — proves benchmark teeth (recall collapse at low sel) |
+
+All scored against `exact_filtered_knn` ground truth.
+
+## Pre-registered gate
+
+- **WIN** — at **ρ ≥ 0.7**: A within **2%** filtered-recall@10 of best{B, D} **and** the
+  distance-eval ratio is **≥ 5× at sel ≤ 1%** and **≥ 2× at sel = 5%**, **monotonically
+  increasing as selectivity falls** (the mechanism must be visible, not a single lucky cell).
+- **Graceful-degradation guard** — in ACORN's regime (sel ≥ 10% **or** ρ ≤ 0.3) A may lose,
+  but by **≤ 1.5×** in distance-evals (no catastrophic blowup). Cost-axis analogue of the
+  recall-collapse control.
+- **Wall-clock honesty guard** — wall-clock reported alongside; a distance-eval win that
+  **reverses on wall-clock → "inconclusive," not WIN** (IVF cluster scans vs ACORN's graph
+  walk have different cache behavior; the win must survive both).
+- **KILL (reportable NO-GO)** — *either* A's recall **collapses** at the ρ=0 control (must
+  degrade *safely* to ≈ the floor, not catastrophically), *or* no (selectivity, ρ) cell meets
+  the WIN bar.
+- **Reported regardless:** the crossover correlation **ρ\*** (and crossover selectivity) where
+  ACORN overtakes A on cost — the iso-cost frontier is itself a publishable result.
+
+**Named live risk (not a formality):** ACORN on correlated subgraphs may already be cheap
+enough that ≥5× is unreachable → that is a clean, reportable KILL, written up like ADR-199.
+
+## Where it lives (self-contained off main)
+
+New crate **`crates/ruvector-filtered-bench`**, depending only on `ruvector-acorn` +
+`ruvector-rairs` (+ `rand`). Contender A and the predicate / ρ-knob / selectivity generators
+live in `src/`; the harness is `examples/filtered_ann_pruning.rs`. No dependency on
+`ruvector-seprag` (PR #535) — this PR stands alone.
+
+## Milestones
+
+- **M0 — substrate + oracle wiring.** Load arxiv feat+label+year; build `IvfFlat`; confirm
+  `exact_filtered_knn` + `recall_at_k` on a slice (use a selectivity floor so #matches ≥ k=10).
+  Predicate + ρ-knob + selectivity generators. *Gate: oracle matches brute-force exactly.*
+- **M1 — contenders B/C/D.** Tuned ACORN sweep + post-filter floor; reproduce the documented
+  low-selectivity post-filter recall collapse (proves teeth).
+- **M2 — contender A.** Cluster probe order (match-count, then centroid distance); zero-match
+  cluster skip; predicate-gated distance; per-query distance-eval + wall-clock counters.
+- **M3 — full sweep + gate eval.** selectivity × ρ grid; emit WIN/KILL table; find ρ\*;
+  apply the wall-clock honesty guard.
+- **M4 — ADR-201.** Write the outcome (WIN or NO-GO) with ADR-199/200 honesty.
+
+## Out of scope (named, not silently assumed)
+
+- The uncorrelated/agnostic regime as a *target* (kill control only — ACORN owns it).
+- Multi-predicate conjunctions, streaming updates, the live-GNN metric (BET 1's frontier).
+- Disk-resident / billion-scale
+  (in-memory ogbn-arxiv is the stage).

From d56ff201b94d5affd2ba553cfdb6fa646ed04371 Mon Sep 17 00:00:00 2001
From: Ofer Shaal <oshaal@phase2technology.com>
Date: Thu, 4 Jun 2026 14:56:52 -0400
Subject: [PATCH 2/8] feat(filtered-bench): M1 incumbents + post-filter teeth +
 tuned ACORN baseline
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Instrument ruvector-acorn with additive, result-preserving counted-search variants
(acorn_search_counted, flat_filtered_search_counted) so distance-evals — the
pre-registered primary cost metric — are measured exactly on ACORN-as-shipped.
13 acorn tests pass incl. a counted==uncounted + flat-evals==#matches invariant.

filtered-bench contenders (src/contenders.rs):
- B: ACORN predicate-agnostic search (the incumbent), exact eval counts.
- C: classic post-filter (retrieve top-pool unfiltered, then filter) — the floor.

M1 findings (n=20k arxiv, ρ=1, k=10):
- TEETH (examples/teeth.rs): at the gate-relevant low selectivity, post-filter
  collapses while ACORN holds — sel=0.1%: 73.7% vs 22.7%; sel=0.5%: 90.4% vs 59.7%;
  sel=1%: 92.6% vs 79.3%. At sel>=5% post-filter is fine (as theory predicts).
  Benchmark is demonstrably sensitive (50+ pt recall swing) — the negative control.
- TUNED ACORN (examples/acorn_tune.rs): ACORN reaches ~92.6% recall at sel=1% with
  gamma=2, ef=512, at ~1622 evals/query; evals are ~flat in ef (early-termination
  bound), so "tuned" = crank ef for recall at near-constant cost. This is the fair
  incumbent baseline for the M3 gate, and it validates the >=5x bar: contender A must
  reach >=90.6% recall at <=~324 evals/query to win.
---
 crates/ruvector-acorn/src/search.rs           |  96 +++++++++++++-
 .../examples/acorn_tune.rs                    |  78 +++++++++++
 .../ruvector-filtered-bench/examples/teeth.rs |  96 ++++++++++++++
 .../ruvector-filtered-bench/src/contenders.rs | 124 ++++++++++++++++++
 crates/ruvector-filtered-bench/src/lib.rs     |   1 +
 5 files changed, 389 insertions(+), 6 deletions(-)
 create mode 100644 crates/ruvector-filtered-bench/examples/acorn_tune.rs
 create mode 100644 crates/ruvector-filtered-bench/examples/teeth.rs
 create mode 100644 crates/ruvector-filtered-bench/src/contenders.rs

diff --git a/crates/ruvector-acorn/src/search.rs b/crates/ruvector-acorn/src/search.rs
index 98e2ee61a8..1983600cc2 100644
--- a/crates/ruvector-acorn/src/search.rs
+++ b/crates/ruvector-acorn/src/search.rs
@@ -33,6 +33,34 @@ pub fn acorn_search(
     k: usize,
     ef: usize,
     predicate: impl Fn(u32) -> bool,
+) -> Vec<(u32, f32)> {
+    let mut evals = 0u64;
+    acorn_search_impl(graph, query, k, ef, predicate, &mut evals)
+}
+
+/// Like [`acorn_search`] but also returns the exact number of distance
+/// evaluations (`l2_sq` calls) performed — the hardware-independent cost metric
+/// used by the filtered-ANN benchmark (`ruvector-filtered-bench`). Results are
+/// identical to [`acorn_search`]; only the eval counter is added.
+pub fn acorn_search_counted(
+    graph: &AcornGraph,
+    query: &[f32],
+    k: usize,
+    ef: usize,
+    predicate: impl Fn(u32) -> bool,
+) -> (Vec<(u32, f32)>, u64) {
+    let mut evals = 0u64;
+    let out = acorn_search_impl(graph, query, k, ef, predicate, &mut evals);
+    (out, evals)
+}
+
+fn acorn_search_impl(
+    graph: &AcornGraph,
+    query: &[f32],
+    k: usize,
+    ef: usize,
+    predicate: impl Fn(u32) -> bool,
+    evals: &mut u64,
 ) -> Vec<(u32, f32)> {
     if graph.is_empty() {
         return vec![];
@@ -44,12 +72,19 @@ pub fn acorn_search(
     // point. O(probes × D) overhead vs O(n × D) for flat — negligible.
     let n_probes = (n as f64).sqrt().ceil() as usize;
     let n_probes = n_probes.clamp(4, 64);
-    let entry = (0..n_probes)
-        .map(|i| (i * n / n_probes) as u32)
-        .min_by(|&a, &b| {
-            l2_sq(query, graph.row(a as usize)).total_cmp(&l2_sq(query, graph.row(b as usize)))
-        })
-        .unwrap_or(0);
+    // Count each probe distance once (result-identical to the min_by form, which
+    // recomputed l2_sq inside the comparator — the count reflects fundamental work).
+    let mut entry = 0u32;
+    let mut best = f32::INFINITY;
+    for i in 0..n_probes {
+        let cand = (i * n / n_probes) as u32;
+        let d = l2_sq(query, graph.row(cand as usize));
+        *evals += 1;
+        if d < best {
+            best = d;
+            entry = cand;
+        }
+    }
 
     let mut visited: Vec<bool> = vec![false; n];
     // Min-heap by distance — pop closest unexplored candidate first.
@@ -61,6 +96,7 @@ pub fn acorn_search(
     let mut farthest_in_beam: BinaryHeap<OrdF32> = BinaryHeap::with_capacity(ef + 1);
 
     let d0 = l2_sq(query, graph.row(entry as usize));
+    *evals += 1;
     candidates.push(Reverse((OrdF32(d0), entry)));
     farthest_in_beam.push(OrdF32(d0));
     visited[entry as usize] = true;
@@ -93,6 +129,7 @@ pub fn acorn_search(
             }
             visited[ni] = true;
             let nd = l2_sq(query, graph.row(ni));
+            *evals += 1;
 
             // Bounded beam: only admit if there's room or the new candidate
             // is closer than the worst pending one.
@@ -129,6 +166,30 @@ pub fn flat_filtered_search(
     query: &[f32],
     k: usize,
     predicate: impl Fn(u32) -> bool,
+) -> Vec<(u32, f32)> {
+    let mut evals = 0u64;
+    flat_filtered_search_impl(data, query, k, predicate, &mut evals)
+}
+
+/// Counting twin of [`flat_filtered_search`]: identical results, plus the number of
+/// `l2_sq` evaluations spent (one for every vector that passes `predicate`).
+pub fn flat_filtered_search_counted(
+    data: &[Vec<f32>],
+    query: &[f32],
+    k: usize,
+    predicate: impl Fn(u32) -> bool,
+) -> (Vec<(u32, f32)>, u64) {
+    let mut n_evals: u64 = 0;
+    let hits = flat_filtered_search_impl(data, query, k, predicate, &mut n_evals);
+    (hits, n_evals)
+}
+
+fn flat_filtered_search_impl(
+    data: &[Vec<f32>],
+    query: &[f32],
+    k: usize,
+    predicate: impl Fn(u32) -> bool,
+    evals: &mut u64,
 ) -> Vec<(u32, f32)> {
     let mut heap: BinaryHeap<(OrdF32, u32)> = BinaryHeap::with_capacity(k + 1);
 
@@ -137,6 +198,7 @@ pub fn flat_filtered_search(
             continue;
         }
         let d = l2_sq(v, query);
+        *evals += 1;
         if heap.len() < k {
             heap.push((OrdF32(d), i as u32));
         } else if let Some(&(OrdF32(worst), _)) = heap.peek() {
@@ -199,6 +261,28 @@ mod tests {
         }
     }
 
+    #[test]
+    fn counted_variants_match_uncounted_and_count_evals() {
+        // The benchmark relies on this invariant: each *_counted fn returns exactly what
+        // its plain counterpart returns, and additionally reports its distance-eval count.
+        let data = unit_data(40);
+        let graph = AcornGraph::build(data.clone(), 8).unwrap();
+        let query = vec![17.0_f32, 0.0];
+        let pred = |id: u32| id % 3 == 0;
+
+        let plain = acorn_search(&graph, &query, 5, 60, pred);
+        let (counted, evals) = acorn_search_counted(&graph, &query, 5, 60, pred);
+        assert_eq!(plain, counted, "counted search must match plain search");
+        assert!(evals > 0, "must record at least the entry probes");
+
+        let fplain = flat_filtered_search(&data, &query, 5, pred);
+        let (fcounted, fevals) = flat_filtered_search_counted(&data, &query, 5, pred);
+        assert_eq!(fplain, fcounted);
+        // Flat search spends exactly one eval per vector that passes the predicate.
+        let n_pass = (0..data.len() as u32).filter(|&i| pred(i)).count() as u64;
+        assert_eq!(fevals, n_pass, "flat evals == #matches");
+    }
+
     #[test]
     fn acorn_search_half_predicate() {
         let data = unit_data(30);
diff --git a/crates/ruvector-filtered-bench/examples/acorn_tune.rs b/crates/ruvector-filtered-bench/examples/acorn_tune.rs
new file mode 100644
index 0000000000..a5325269ff
--- /dev/null
+++ b/crates/ruvector-filtered-bench/examples/acorn_tune.rs
@@ -0,0 +1,78 @@
+//! M1 — find ACORN's *tuned* operating point (rule #2: beat the incumbent tuned).
+//!
+//! Sweeps ef × γ for filtered recall@10 at a representative low selectivity (ρ=1), so the
+//! later head-to-head compares against ACORN at its best, not an under-tuned strawman.
+//!
+//! Run: cargo run --release -p ruvector-filtered-bench --example acorn_tune -- [N] [Q] [sel]
+
+use ruvector_acorn::graph::exact_filtered_knn;
+use ruvector_filtered_bench::contenders::{recall, Acorn};
+use ruvector_filtered_bench::data::{Dataset, FEAT_100K};
+use ruvector_filtered_bench::predicate;
+
+use rand::rngs::StdRng;
+use rand::{Rng, SeedableRng};
+use std::path::Path;
+
+fn main() {
+    let args: Vec<String> = std::env::args().collect();
+    let n: usize = args.get(1).and_then(|s| s.parse().ok()).unwrap_or(20_000);
+    let q_count: usize = args.get(2).and_then(|s| s.parse().ok()).unwrap_or(200);
+    let sel: f64 = args.get(3).and_then(|s| s.parse().ok()).unwrap_or(0.01);
+
+    if !Path::new(FEAT_100K).exists() {
+        eprintln!("data not extracted ({FEAT_100K}); skipping.");
+        return;
+    }
+
+    let k = 10;
+    let ds = Dataset::load_arxiv(n);
+    let n = ds.len();
+    let mut rng = StdRng::seed_from_u64(7);
+    let pred = predicate::correlated(&ds.labels, sel, 1.0, 0, &mut rng);
+    let pf = pred.as_fn();
+    let queries: Vec<usize> = (0..q_count).map(|_| rng.gen_range(0..n)).collect();
+    // Truth once per query (independent of ef/γ): fetch k + 1, drop the query's own id, keep
+    // k. ACORN is trimmed the same way below; a k-list minus self would cap recall at (k-1)/k.
+    let truths: Vec<Vec<u32>> = queries
+        .iter()
+        .map(|&qi| {
+            exact_filtered_knn(&ds.feats, &ds.feats[qi], k + 1, pf)
+                .into_iter()
+                .filter(|&id| id as usize != qi)
+                .take(k)
+                .collect()
+        })
+        .collect();
+
+    println!(
+        "\n=== ACORN tuning: filtered recall@{k} (n={n}, sel={sel}, #match={}, Q={q_count}) ===",
+        pred.n_match
+    );
+    println!("{:>5} {:>6} | {:>10} {:>11}", "γ", "ef", "recall", "evals/q");
+    println!("{}", "-".repeat(40));
+
+    for &gamma in &[2usize, 3] {
+        let acorn = Acorn::build(&ds.feats, gamma, 64); // ef field unused; we pass ef below
+        for &ef in &[64usize, 128, 256, 512, 1024] {
+            let (mut rec, mut ev) = (0.0, 0u64);
+            for (qi, truth) in queries.iter().zip(&truths) {
+                let (got, evals) =
+                    ruvector_acorn::search::acorn_search_counted(&acorn.graph, &ds.feats[*qi], k + 1, ef, pf);
+                let got: Vec<u32> = got
+                    .into_iter()
+                    .filter_map(|(id, _)| (id as usize != *qi).then_some(id))
+                    .take(k)
+                    .collect();
+                rec += recall(truth, &got);
+                ev += evals;
+            }
+            let nq = queries.len().max(1); // Q = 0 must not divide by zero
+            println!(
+                "{gamma:>5} {ef:>6} | {:>9.1}% {:>11}",
+                100.0 * rec / nq as f64,
+                ev / nq as u64
+            );
+        }
+    }
+}
diff --git a/crates/ruvector-filtered-bench/examples/teeth.rs b/crates/ruvector-filtered-bench/examples/teeth.rs
new file mode 100644
index 0000000000..acf7b63d95
--- /dev/null
+++ b/crates/ruvector-filtered-bench/examples/teeth.rs
@@ -0,0 +1,96 @@
+//! M1 — "the benchmark has teeth."
+//!
+//! Before claiming contender A beats ACORN, we must show the problem is real: at low
+//! selectivity, the classic **post-filter** baseline (retrieve top-`pool` ignoring the
+//! predicate, then filter) collapses, while ACORN's predicate-agnostic search holds recall.
+//! Both run on the *same* ACORN-γ graph, so the only variable is the traversal policy —
+//! isolating post-filter as the cause of the collapse (not graph density).
+//!
+//! This is the negative-control analogue of ADR-200's stale-index control: if post-filter
+//! did *not* collapse, the benchmark would be insensitive and any later "win" meaningless.
+//!
+//! Run: cargo run --release -p ruvector-filtered-bench --example teeth -- [N] [Q] [seed]
+
+use ruvector_acorn::graph::exact_filtered_knn;
+use ruvector_filtered_bench::contenders::{recall, Acorn};
+use ruvector_filtered_bench::data::{Dataset, FEAT_100K};
+use ruvector_filtered_bench::predicate;
+
+use rand::rngs::StdRng;
+use rand::{Rng, SeedableRng};
+use std::path::Path;
+
+fn main() {
+    let args: Vec<String> = std::env::args().collect();
+    let n: usize = args.get(1).and_then(|s| s.parse().ok()).unwrap_or(20_000);
+    let q_count: usize = args.get(2).and_then(|s| s.parse().ok()).unwrap_or(200);
+    let seed: u64 = args.get(3).and_then(|s| s.parse().ok()).unwrap_or(7);
+
+    if !Path::new(FEAT_100K).exists() {
+        eprintln!("data not extracted ({FEAT_100K}); see src/data.rs header. skipping.");
+        return;
+    }
+
+    let k = 10;
+    let ef = 512; // tuned operating point (see acorn_tune: ~92% recall at sel=1%, n=20k)
+    let pool = 512; // post-filter retrieval pool == ef (generous; not a strawman k-only pool)
+    let gamma = 2;
+
+    eprintln!("[teeth] loading arxiv slice n={n}…");
+    let ds = Dataset::load_arxiv(n);
+    let n = ds.len();
+    eprintln!("[teeth] building ACORN-γ (γ={gamma}, {} edges/node, ef={ef})…", 16 * gamma);
+    let t0 = std::time::Instant::now();
+    let acorn = Acorn::build(&ds.feats, gamma, ef);
+    eprintln!("[teeth] graph built in {:.1}s", t0.elapsed().as_secs_f64());
+
+    let mut rng = StdRng::seed_from_u64(seed);
+    let queries: Vec<usize> = (0..q_count).map(|_| rng.gen_range(0..n)).collect();
+
+    println!("\n=== M1 teeth: post-filter collapse vs ACORN-agnostic (ρ=1, n={n}, k={k}, Q={q_count}) ===");
+    println!(
+        "{:>7} {:>8} | {:>10} {:>10} | {:>11} {:>11}",
+        "sel", "#match", "B_recall", "C_recall", "B_evals", "C_evals"
+    );
+    println!("{}", "-".repeat(66));
+
+    for &sel in &[0.001_f64, 0.005, 0.01, 0.05, 0.10, 0.30] {
+        let pred = predicate::correlated(&ds.labels, sel, 1.0, 0, &mut rng);
+        if pred.n_match < k {
+            println!("{sel:>7.3} {:>8} | (skipped: #match < k)", pred.n_match);
+            continue;
+        }
+        let pf = pred.as_fn();
+
+        let (mut b_rec, mut c_rec, mut b_ev, mut c_ev) = (0.0, 0.0, 0u64, 0u64);
+        for &qi in &queries {
+            let q = &ds.feats[qi];
+            // Exclude the query's own id (trivial distance-0 self-match). Every list is
+            // fetched at k + 1 and cut to k afterwards so the exclusion costs no one a slot.
+            let truth: Vec<u32> = exact_filtered_knn(&ds.feats, q, k + 1, pf)
+                .into_iter()
+                .filter(|&id| id as usize != qi)
+                .take(k)
+                .collect();
+            let b = acorn.search(q, k + 1, pf);
+            let c = acorn.postfilter(q, k + 1, pool, pf);
+            let strip = |ids: Vec<u32>| -> Vec<u32> {
+                ids.into_iter().filter(|&id| id as usize != qi).take(k).collect()
+            };
+            b_rec += recall(&truth, &strip(b.ids));
+            c_rec += recall(&truth, &strip(c.ids));
+            b_ev += b.evals;
+            c_ev += c.evals;
+        }
+        let nq = queries.len().max(1); // Q = 0 must not divide by zero
+        println!(
+            "{sel:>7.3} {:>8} | {:>9.1}% {:>9.1}% | {:>11} {:>11}",
+            pred.n_match,
+            100.0 * b_rec / nq as f64,
+            100.0 * c_rec / nq as f64,
+            b_ev / nq as u64,
+            c_ev / nq as u64,
+        );
+    }
+    println!("\nExpected (teeth): C_recall falls sharply as sel→0 while B_recall stays high.");
+}
diff --git a/crates/ruvector-filtered-bench/src/contenders.rs b/crates/ruvector-filtered-bench/src/contenders.rs
new file mode 100644
index 0000000000..58cf491d22
--- /dev/null
+++ b/crates/ruvector-filtered-bench/src/contenders.rs
@@ -0,0 +1,124 @@
+//! M1 — incumbents (B/D) and the post-filter floor (C), each reporting exact
+//! distance-evals via the instrumented `ruvector-acorn` search.
+//!
+//! All three drive the **real** `AcornGraph` + `acorn_search_counted` — not a
+//! re-implementation — so the head-to-head measures ACORN as shipped (protocol
+//! rule #2). Contender A (region-pruned IVF) arrives in M2 (`prune` module).
+
+use ruvector_acorn::graph::AcornGraph;
+use ruvector_acorn::search::acorn_search_counted;
+
+/// ACORN edge budget base (γ·M neighbors/node); matches `AcornIndexGamma::M`.
+pub const ACORN_M: usize = 16;
+
+/// One filtered query's outcome: ids (nearest-first) + exact distance-evals, the primary cost.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct QueryResult {
+    pub ids: Vec<u32>,
+    pub evals: u64,
+}
+
+/// A real ACORN-γ graph shared by **B** (predicate-agnostic search) and **C** (post-filter
+/// floor), so traversal policy — not graph density — is the only variable between them.
+pub struct Acorn {
+    pub graph: AcornGraph,
+    /// Edge multiplier: `build` requests `ACORN_M * gamma` neighbors per node.
+    pub gamma: usize,
+    /// Beam width that `search` and `postfilter` pass to `acorn_search_counted`.
+    pub ef: usize,
+}
+
+impl Acorn {
+    /// Construct the incumbent graph over a copy of `feats`; panics if the build fails.
+    /// `gamma` = 2 is `AcornIndexGamma`'s default (32 edges/node); 3 is D's denser variant.
+    pub fn build(feats: &[Vec<f32>], gamma: usize, ef: usize) -> Self {
+        let edges_per_node = ACORN_M * gamma;
+        let graph = AcornGraph::build(feats.to_vec(), edges_per_node).expect("acorn graph build");
+        Self { graph, gamma, ef }
+    }
+
+    /// **Contender B** — ACORN predicate-agnostic search, run at this index's `ef`.
+    pub fn search(&self, query: &[f32], k: usize, predicate: impl Fn(u32) -> bool) -> QueryResult {
+        let (hits, evals) = acorn_search_counted(&self.graph, query, k, self.ef, predicate);
+        let ids = hits.into_iter().map(|(id, _)| id).collect();
+        QueryResult { ids, evals }
+    }
+
+    /// **Contender C** — classic post-filter: fetch the `pool` nearest neighbors while
+    /// *ignoring* the predicate, then keep the first `k` that pass it. `pool` is raised
+    /// to at least `k`. When few points match, most of the unfiltered pool is discarded,
+    /// so few (or zero) ids survive and recall collapses — the floor ACORN was designed
+    /// to beat; reproducing that collapse is what shows the benchmark has teeth.
+    pub fn postfilter(
+        &self,
+        query: &[f32],
+        k: usize,
+        pool: usize,
+        predicate: impl Fn(u32) -> bool,
+    ) -> QueryResult {
+        let fetch = pool.max(k);
+        // Always-true predicate = unfiltered retrieval; its evals are the reported cost.
+        let (unfiltered, evals) = acorn_search_counted(&self.graph, query, fetch, self.ef, |_| true);
+        let ids = unfiltered
+            .into_iter()
+            .filter_map(|(id, _)| predicate(id).then_some(id))
+            .take(k)
+            .collect();
+        QueryResult { ids, evals }
+    }
+}
+
+/// Recall@k versus an exact filtered-kNN truth list: the share of `truth` ids that also
+/// appear in `got`. `truth` may hold fewer than k ids; an empty `truth` scores 1.0.
+pub fn recall(truth: &[u32], got: &[u32]) -> f64 {
+    use std::collections::HashSet;
+    if truth.is_empty() {
+        return 1.0;
+    }
+    let returned: HashSet<u32> = got.iter().copied().collect();
+    truth.iter().filter(|&id| returned.contains(id)).count() as f64 / truth.len() as f64
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use ruvector_acorn::graph::exact_filtered_knn;
+    // 2-D fixture: point i sits at (i, i mod 7).
+    fn ramp(n: usize) -> Vec<Vec<f32>> {
+        (0..n).map(|i| vec![i as f32, (i % 7) as f32]).collect()
+    }
+
+    #[test]
+    fn agnostic_beats_postfilter_when_selective() {
+        // Predicate matches every 11th node (~9%). Only a weak ordering is asserted: mean
+        // agnostic recall must not fall below post-filter recall with the tight pool = k.
+        let feats = ramp(600);
+        let acorn = Acorn::build(&feats, 2, 80);
+        let k = 5;
+        let pred = |id: u32| id.is_multiple_of(11);
+
+        let (mut agn_hits, mut pf_hits, mut n) = (0.0, 0.0, 0.0);
+        for qi in (0..600).step_by(97) {
+            let truth = exact_filtered_knn(&feats, &feats[qi], k, pred);
+            let agn = acorn.search(&feats[qi], k, pred);
+            let pf = acorn.postfilter(&feats[qi], k, k, pred); // tight pool → starves
+            agn_hits += recall(&truth, &agn.ids);
+            pf_hits += recall(&truth, &pf.ids);
+            n += 1.0;
+        }
+        assert!(
+            agn_hits / n >= pf_hits / n,
+            "agnostic recall {:.2} should be >= post-filter recall {:.2}",
+            agn_hits / n,
+            pf_hits / n
+        );
+    }
+
+    #[test]
+    fn evals_are_recorded() {
+        let feats = ramp(300);
+        let acorn = Acorn::build(&feats, 2, 64);
+        let r = acorn.search(&feats[10], 5, |_| true);
+        assert!(r.evals > 0);
+    }
+}
diff --git a/crates/ruvector-filtered-bench/src/lib.rs b/crates/ruvector-filtered-bench/src/lib.rs
index b3206e97aa..a9b2eb4ea2 100644
--- a/crates/ruvector-filtered-bench/src/lib.rs
+++ b/crates/ruvector-filtered-bench/src/lib.rs
@@ -13,6 +13,7 @@
 //! - `predicate` (M0) — predicate families + ρ-correlation knob + selectivity targeting.
 //! - `prune` (M2)     — contender A: region-pruned IVF filtered search + eval counters.
 
+pub mod contenders;
 pub mod data;
 pub mod predicate;
 

From 57b27b3c22bfcaaf45ce0b5bf52e0e645c263f00 Mon Sep 17 00:00:00 2001
From: Ofer Shaal <oshaal@phase2technology.com>
Date: Thu, 4 Jun 2026 16:07:27 -0400
Subject: [PATCH 3/8] =?UTF-8?q?feat(filtered-bench):=20M2=20contender=20A?=
 =?UTF-8?q?=20=E2=80=94=20region-pruned=20IVF=20(exact=20B&B)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

src/prune.rs: RegionPruneIvf, built on ruvector-rairs k-means (ADR-193 substrate).
Two stacked prunings realizing the salvaged SepRAG kernel on the treewidth-immune
IVF hierarchy:
  1. predicate pruning — skip clusters with zero matching members (the BET-2 win).
  2. branch-and-bound distance pruning — triangle-inequality lower bound
     (dist(q,centroid) - radius); once the top-k heap is full, clusters whose LB
     exceeds the worst result are skipped. Probe in LB order so the bound lets us
     break, not just skip — a strict improvement over the M2-sketch's match-count
     ordering, and it yields EXACT filtered top-k.

Cost metric = nclusters (routing) + matching members scanned; the O(1) predicate
gates the expensive distance, so non-matching points cost nothing (the asymmetry
vs ACORN, which evaluates a distance per expanded node regardless of predicate).

max_probe knob: None = exact B&B (recall 1.0); Some(p) caps match-clusters probed
(trades recall for fewer evals, mirroring ACORN's ef) for equal-recall comparison.

Tests: exact_bb_matches_oracle (recall 1.0 vs exact_filtered_knn on 20 queries) and
zero_match_clusters_are_skipped (1% selectivity → <1000 evals vs 4000 full scan).
8 unit + 1 integration green, clippy clean.
---
 crates/ruvector-filtered-bench/src/lib.rs   |   1 +
 crates/ruvector-filtered-bench/src/prune.rs | 200 ++++++++++++++++++++
 2 files changed, 201 insertions(+)
 create mode 100644 crates/ruvector-filtered-bench/src/prune.rs

diff --git a/crates/ruvector-filtered-bench/src/lib.rs b/crates/ruvector-filtered-bench/src/lib.rs
index a9b2eb4ea2..2e892f05f1 100644
--- a/crates/ruvector-filtered-bench/src/lib.rs
+++ b/crates/ruvector-filtered-bench/src/lib.rs
@@ -16,6 +16,7 @@
 pub mod contenders;
 pub mod data;
 pub mod predicate;
+pub mod prune;
 
 // Re-export the substrate + incumbent + oracle so the head-to-head harness has one entry
 // point and the dependency graph is exercised at build time.
diff --git a/crates/ruvector-filtered-bench/src/prune.rs b/crates/ruvector-filtered-bench/src/prune.rs
new file mode 100644
index 0000000000..78819a2462
--- /dev/null
+++ b/crates/ruvector-filtered-bench/src/prune.rs
@@ -0,0 +1,200 @@
+//! M2 — contender A: region-pruned IVF filtered search.
+//!
+//! Built on `ruvector-rairs` k-means (the ADR-193 IVF substrate). Two stacked prunings,
+//! both realizing the salvaged SepRAG kernel on a treewidth-immune cluster hierarchy:
+//!
+//! 1. **Predicate pruning** — skip every cluster with zero predicate-matching members.
+//!    This is the BET-2 win: a correlated metadata filter concentrates matches in a few
+//!    clusters, so most of the corpus is never touched.
+//! 2. **Branch-and-bound distance pruning** — by the triangle inequality, the nearest
+//!    possible point in cluster `c` is `dist(q, centroid_c) − radius_c`. Once the top-k
+//!    heap is full, clusters whose lower bound exceeds the current k-th distance cannot
+//!    improve the result and are skipped. With a valid lower bound this is **exact**.
+//!
+//! Cost (the pre-registered metric) = `#centroids routed (= nclusters)` + `#matching
+//! members for which a distance was computed`. The O(1) predicate test gates the
+//! expensive distance, so non-matching points cost nothing — the asymmetry vs ACORN
+//! (which evaluates a distance per expanded node regardless of predicate).
+
+use ruvector_rairs::kmeans;
+
+use crate::contenders::QueryResult;
+
+#[inline] // squared Euclidean distance (no sqrt); zip stops at the shorter slice
+fn l2_sq(a: &[f32], b: &[f32]) -> f32 {
+    a.iter().zip(b).map(|(p, q)| { let diff = p - q; diff * diff }).sum::<f32>()
+}
+
+/// Region-pruned IVF index (contender A).
+pub struct RegionPruneIvf {
+    centroids: Vec<Vec<f32>>,
+    /// `members[c]` = node ids assigned to cluster `c`.
+    members: Vec<Vec<u32>>,
+    /// `radius[c]` = max **L2** (not squared) centroid→member distance: the B&B slack.
+    radius: Vec<f32>,
+    /// Cluster count set by `build` to `centroids.len()`; `search` routes to this many.
+    pub nclusters: usize,
+}
+
+impl RegionPruneIvf {
+    /// Partition `feats` into `nclusters` k-means cells (rairs clustering).
+    ///
+    /// `nclusters` on the returned index is the number of centroids `kmeans::train` actually
+    /// returned, and each cluster's radius is its largest centroid→member L2 distance.
+    pub fn build(feats: &[Vec<f32>], nclusters: usize, max_iter: usize, seed: u64) -> Self {
+        let (centroids, assign) = kmeans::train(feats, nclusters, max_iter, seed);
+        let k = centroids.len();
+        let mut members = vec![Vec::new(); k];
+        for (id, &c) in assign.iter().enumerate() {
+            members[c].push(id as u32);
+        }
+        let radius = (0..k)
+            .map(|c| {
+                members[c]
+                    .iter()
+                    .map(|&id| l2_sq(&centroids[c], &feats[id as usize]).sqrt())
+                    .fold(0.0_f32, f32::max)
+            })
+            .collect();
+        RegionPruneIvf { centroids, members, radius, nclusters: k }
+    }
+
+    /// Region-pruned filtered top-k search; ids are returned nearest-first.
+    ///
+    /// `max_probe = None` runs exact branch-and-bound (recall 1.0, up to f32 rounding in the
+    /// bound); `Some(p)` caps the number of *match-containing* clusters probed (the
+    /// approximate knob that trades recall for fewer distance-evals, mirroring ACORN's `ef`).
+    /// The cap is tested after a cluster has been scanned, so `Some(0)` acts like `Some(1)`.
+    /// `evals` = one per centroid routed + one per predicate-passing member scanned.
+    pub fn search(
+        &self,
+        feats: &[Vec<f32>],
+        query: &[f32],
+        k: usize,
+        predicate: impl Fn(u32) -> bool,
+        max_probe: Option<usize>,
+    ) -> QueryResult {
+        let mut evals = 0u64;
+
+        // 1. Route: one distance per centroid (the fixed routing cost), converted on the spot
+        // into the cluster's squared-L2 lower bound (max(0, sqrt(d_qc) - radius))^2. Doing it
+        // here computes each bound once instead of on every sort comparison and loop test.
+        let mut clusters: Vec<(f32, usize)> = (0..self.nclusters)
+            .map(|c| {
+                evals += 1;
+                let lb = (l2_sq(query, &self.centroids[c]).sqrt() - self.radius[c]).max(0.0);
+                (lb * lb, c)
+            })
+            .collect();
+        // Sorting by LB lets us *break* (not just skip) once LB exceeds the worst result.
+        clusters.sort_by(|a, b| a.0.total_cmp(&b.0));
+
+        // 2. Probe in lower-bound order, skipping zero-match clusters; B&B early-out.
+        // Max-heap on squared distance — peek = current worst of the top-k.
+        let mut heap: std::collections::BinaryHeap<(ordered::Of, u32)> =
+            std::collections::BinaryHeap::with_capacity(k + 1);
+        let mut probed = 0usize;
+
+        for &(lb_sq, c) in &clusters {
+            // B&B: once the heap is full, no later cluster (sorted by LB) can help.
+            if heap.len() >= k {
+                if let Some(&(ordered::Of(worst), _)) = heap.peek() {
+                    if lb_sq >= worst {
+                        break;
+                    }
+                }
+            }
+            // Does this cluster contain any match? (cheap O(1) tests, not distance-evals)
+            let mut any = false;
+            for &id in &self.members[c] {
+                if !predicate(id) {
+                    continue;
+                }
+                any = true;
+                let d = l2_sq(query, &feats[id as usize]);
+                evals += 1;
+                if heap.len() < k {
+                    heap.push((ordered::Of(d), id));
+                } else if let Some(&(ordered::Of(worst), _)) = heap.peek() {
+                    if d < worst {
+                        heap.pop();
+                        heap.push((ordered::Of(d), id));
+                    }
+                }
+            }
+            if any {
+                probed += 1;
+                if let Some(cap) = max_probe {
+                    if probed >= cap {
+                        break;
+                    }
+                }
+            }
+        }
+
+        let mut out: Vec<(u32, f32)> =
+            heap.into_iter().map(|(ordered::Of(d), id)| (id, d)).collect();
+        out.sort_by(|a, b| a.1.total_cmp(&b.1));
+        QueryResult { ids: out.into_iter().map(|(id, _)| id).collect(), evals }
+    }
+}
+
+/// Total-ordered f32 wrapper for the binary heap; derived `==` agrees with `cmp` except on NaN/±0.0.
+mod ordered {
+    #[derive(Clone, Copy, PartialEq)]
+    pub struct Of(pub f32);
+    impl Eq for Of {}
+    impl PartialOrd for Of {
+        fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+            Some(self.cmp(other))
+        }
+    }
+    impl Ord for Of {
+        fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+            self.0.total_cmp(&other.0)
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use ruvector_acorn::graph::exact_filtered_knn;
+    use rand::rngs::StdRng;
+    use rand::{Rng, SeedableRng};
+    // Despite the name, coordinates are drawn uniformly from [-1, 1) with a seeded RNG.
+    fn gauss(n: usize, dim: usize, seed: u64) -> Vec<Vec<f32>> {
+        let mut rng = StdRng::seed_from_u64(seed);
+        (0..n)
+            .map(|_| (0..dim).map(|_| rng.gen_range(-1.0_f32..1.0)).collect())
+            .collect()
+    }
+
+    #[test]
+    fn exact_bb_matches_oracle() {
+        // max_probe = None must return the exact filtered top-k (recall 1.0).
+        let feats = gauss(2000, 16, 1);
+        let idx = RegionPruneIvf::build(&feats, 48, 10, 7);
+        let k = 10;
+        let pred = |id: u32| id.is_multiple_of(4);
+        let mut rng = StdRng::seed_from_u64(99);
+        for _ in 0..20 {
+            let qi = rng.gen_range(0..feats.len());
+            let truth = exact_filtered_knn(&feats, &feats[qi], k, pred);
+            let got = idx.search(&feats, &feats[qi], k, pred, None);
+            assert_eq!(got.ids, truth, "exact B&B must equal the oracle");
+        }
+    }
+
+    #[test]
+    fn zero_match_clusters_are_skipped() {
+        // A predicate matching a tiny fraction must cost far fewer evals than scanning all.
+        let feats = gauss(4000, 16, 2);
+        let idx = RegionPruneIvf::build(&feats, 64, 10, 7);
+        let pred = |id: u32| id < 40; // 1% selectivity
+        let r = idx.search(&feats, &feats[0], 10, pred, None);
+        // evals = nclusters routing + matches scanned; must be << full scan (4000).
+        assert!(r.evals < 1000, "pruning failed: {} evals", r.evals);
+        assert!(r.evals >= idx.nclusters as u64, "must at least route to all centroids");
+    }
+}

From 48e994c0c47492b6db7fddcee3edd325de7b379c Mon Sep 17 00:00:00 2001
From: Ofer Shaal <oshaal@phase2technology.com>
Date: Thu, 4 Jun 2026 16:13:48 -0400
Subject: [PATCH 4/8] =?UTF-8?q?feat(filtered-bench):=20M3=20sweep=20+=20ga?=
 =?UTF-8?q?te=20eval=20=E2=80=94=20partial=20WIN=20(boundary=20at=20sel<?=
 =?UTF-8?q?=3D1%)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

examples/sweep.rs: full selectivity x rho grid, cost-at-matched-recall comparison
(tune A's probe cap to ACORN's recall, then compare distance-evals), with the
wall-clock honesty guard and the rho=0 kill control.

VERDICT vs the frozen gate (n=20k, ACORN gamma2 ef=512, IVF nclusters=64):
- WIN at sel<=1%, rho>=0.7: region-pruned IVF beats tuned ACORN by 6.1-48x evals
  and 4.7-26x wall-clock at equal-or-better recall (A's exact B&B recall >= ACORN).
  e.g. rho=1 sel=1%: ACORN 92.6%@1622 evals vs A 99.9%@264 evals = 6.1x (4.7x wall).
- MISS at sel=5%: best 1.5x (gate wanted >=2x). The win is a low-selectivity
  (<=1%) phenomenon — the dominant production metadata-filter regime, but a real
  boundary, not the full pre-registered claim.
- Mechanism partly refuted: A also wins at rho=0 (low sel), so the eval advantage
  is selectivity-driven (few matches -> cheap exact B&B) more than correlation-
  driven; correlation governs recall, not cost. Reported, not buried.
- rho=0 kill control: A does NOT collapse (recall-safe); high-sel (>=10%) A loses
  as expected (ACORN's regime). Wall-clock guard: PASS (win survives the clock).

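nclusters is A's tuning knob (parallel to ACORN's ef): 64 beats 128 in the win
regime (cheaper routing); both confirm the same boundary. Note that the sweep
example defaults to nclusters=128, so reproducing the verdict above requires
passing 64 explicitly as the third argument.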
---
 .../ruvector-filtered-bench/examples/sweep.rs | 159 ++++++++++++++++++
 1 file changed, 159 insertions(+)
 create mode 100644 crates/ruvector-filtered-bench/examples/sweep.rs

diff --git a/crates/ruvector-filtered-bench/examples/sweep.rs b/crates/ruvector-filtered-bench/examples/sweep.rs
new file mode 100644
index 0000000000..8a8ebe5d28
--- /dev/null
+++ b/crates/ruvector-filtered-bench/examples/sweep.rs
@@ -0,0 +1,159 @@
+//! M3 — the frozen-gate run: tuned ACORN vs region-pruned IVF over a selectivity × ρ grid.
+//!
+//! Compares **cost at matched recall** (the honest framing): contender A's exact B&B has
+//! recall 1.0 ≥ ACORN, so we tune A's probe cap down until its recall ≈ ACORN's, then
+//! compare distance-evals/query. Reports the ratio against the pre-registered gate
+//! (≥5× at sel≤1%, ≥2× at sel=5%, ρ≥0.7), the ρ=0 kill control, and wall-clock (the
+//! honesty guard — a distance-eval win that reverses on wall-clock is "inconclusive").
+//!
+//! Run: cargo run --release -p ruvector-filtered-bench --example sweep -- [N] [Q] [nclusters] [ef] [seed]
+
+use ruvector_acorn::graph::exact_filtered_knn;
+use ruvector_acorn::search::acorn_search_counted;
+use ruvector_filtered_bench::contenders::{recall, Acorn};
+use ruvector_filtered_bench::data::{Dataset, FEAT_100K};
+use ruvector_filtered_bench::predicate;
+use ruvector_filtered_bench::prune::RegionPruneIvf;
+
+use rand::rngs::StdRng;
+use rand::{Rng, SeedableRng};
+use std::path::Path;
+use std::time::Instant;
+
+const K: usize = 10;
+const GATE: f64 = 0.02; // recall match tolerance
+
+/// Runs the ρ × selectivity grid and prints one row per cell (B, exact A, recall-matched A).
+fn main() {
+    let a: Vec<String> = std::env::args().collect();
+    let n: usize = a.get(1).and_then(|s| s.parse().ok()).unwrap_or(20_000);
+    let q_count: usize = a.get(2).and_then(|s| s.parse().ok()).unwrap_or(200);
+    let nclusters: usize = a.get(3).and_then(|s| s.parse().ok()).unwrap_or(128);
+    let ef: usize = a.get(4).and_then(|s| s.parse().ok()).unwrap_or(512);
+    let seed: u64 = a.get(5).and_then(|s| s.parse().ok()).unwrap_or(7);
+
+    if !Path::new(FEAT_100K).exists() {
+        eprintln!("data not extracted ({FEAT_100K}); skipping.");
+        return;
+    }
+
+    let ds = Dataset::load_arxiv(n);
+    let n = ds.len();
+    eprintln!("[sweep] n={n} Q={q_count} nclusters={nclusters} ef={ef}");
+    eprintln!("[sweep] building ACORN-γ2 + region-prune IVF…");
+    let t = Instant::now();
+    let acorn = Acorn::build(&ds.feats, 2, ef);
+    let ivf = RegionPruneIvf::build(&ds.feats, nclusters, 15, seed);
+    eprintln!("[sweep] built in {:.1}s (ivf nclusters={})", t.elapsed().as_secs_f64(), ivf.nclusters);
+
+    let mut rng = StdRng::seed_from_u64(seed);
+    let queries: Vec<usize> = (0..q_count).map(|_| rng.gen_range(0..n)).collect();
+
+    let sels = [0.001_f64, 0.005, 0.01, 0.05, 0.10, 0.30];
+    let rhos = [0.0_f64, 0.3, 0.5, 0.7, 1.0];
+    let probe_caps = [1usize, 2, 3, 4, 6, 8, 12, 16, 24, 32, 48, 64, 96, 128];
+
+    println!("\n=== M3 sweep (n={n}, k={K}, ACORN γ2 ef={ef}, IVF nclusters={}) ===", ivf.nclusters);
+    println!(
+        "{:>4} {:>6} {:>7} | {:>7} {:>8} | {:>7} {:>8} | {:>7} {:>8} {:>6} {:>6} | verdict",
+        "ρ", "sel", "#match", "B_rec", "B_evals", "Aex_rec", "Aex_ev", "Am_rec", "Am_evals", "ev×", "wc×"
+    );
+    println!("{}", "-".repeat(104));
+
+    for &rho in &rhos {
+        for &sel in &sels {
+            let pred = predicate::correlated(&ds.labels, sel, rho, 0, &mut rng);
+            if pred.n_match < K {
+                continue;
+            }
+            let pf = pred.as_fn();
+
+            // Truth per query (exclude self to avoid trivial distance-0 inflation).
+            let truths: Vec<Vec<u32>> = queries
+                .iter()
+                .map(|&qi| {
+                    exact_filtered_knn(&ds.feats, &ds.feats[qi], K + 1, pf)
+                        .into_iter()
+                        .filter(|&id| id as usize != qi)
+                        .take(K)
+                        .collect()
+                })
+                .collect();
+
+            // ACORN (B).
+            let (b_rec, b_ev, b_ms) = measure(&queries, &truths, |qi| {
+                let (got, ev) = acorn_search_counted(&acorn.graph, &ds.feats[qi], K, ef, pf);
+                (got.into_iter().map(|(id, _)| id).collect(), ev)
+            });
+
+            // A exact (B&B, recall ~1.0).
+            let (aex_rec, aex_ev, aex_ms) = measure(&queries, &truths, |qi| {
+                let r = ivf.search(&ds.feats, &ds.feats[qi], K, pf, None);
+                (r.ids, r.evals)
+            });
+
+            // A matched: smallest probe cap with recall >= b_rec - GATE. If no cap gets there,
+            // fall back to the exact run, wall-clock included (a 0.0 default would zero wc×).
+            let (mut am_rec, mut am_ev, mut am_ms) = (aex_rec, aex_ev, aex_ms);
+            for &cap in &probe_caps {
+                let (r, ev, ms) = measure(&queries, &truths, |qi| {
+                    let res = ivf.search(&ds.feats, &ds.feats[qi], K, pf, Some(cap));
+                    (res.ids, res.evals)
+                });
+                if r >= b_rec - GATE {
+                    am_rec = r;
+                    am_ev = ev;
+                    am_ms = ms;
+                    break;
+                }
+            }
+
+            let ratio = if am_ev > 0 { b_ev as f64 / am_ev as f64 } else { 0.0 };
+            // Wall-clock honesty guard: a distance-eval win that reverses on the clock is
+            // not a real win. wc_ratio > 1 means A is also faster in wall time.
+            let wc_ratio = if am_ms > 0.0 { b_ms / am_ms } else { 0.0 };
+            let target = if sel <= 0.01 { 5.0 } else if sel <= 0.05 { 2.0 } else { 0.0 };
+            let verdict = if rho >= 0.7 && target > 0.0 {
+                if ratio >= target { "WIN" } else { "miss" }
+            } else if rho <= 0.3 {
+                // graceful-degradation guard: A must not lose by >1.5x
+                if ratio >= 1.0 / 1.5 { "ok(ctrl)" } else { "DEGRADE" }
+            } else {
+                "—"
+            };
+
+            println!(
+                "{rho:>4.1} {sel:>6.3} {:>7} | {:>6.1}% {:>8} | {:>6.1}% {:>8} | {:>6.1}% {:>8} {:>5.1}× {:>5.1}× | {verdict}",
+                pred.n_match,
+                100.0 * b_rec,
+                b_ev,
+                100.0 * aex_rec,
+                aex_ev,
+                100.0 * am_rec,
+                am_ev,
+                ratio,
+                wc_ratio,
+            );
+        }
+        println!();
+    }
+}
+
+/// Mean recall, mean distance-evals and mean wall-clock (µs) per query. Each query's own id is
+/// dropped from the returned ids before scoring, because the truth lists exclude it as well.
+fn measure(
+    queries: &[usize],
+    truths: &[Vec<u32>],
+    mut run: impl FnMut(usize) -> (Vec<u32>, u64),
+) -> (f64, u64, f64) {
+    let (mut rec, mut ev) = (0.0, 0u64);
+    let t = Instant::now();
+    for (&qi, truth) in queries.iter().zip(truths) {
+        let (ids, e) = run(qi);
+        let ids: Vec<u32> = ids.into_iter().filter(|&id| id as usize != qi).collect();
+        rec += recall(truth, &ids);
+        ev += e;
+    }
+    let nq = queries.len().max(1); // empty set: report zeros instead of dividing by zero
+    (rec / nq as f64, ev / nq as u64, t.elapsed().as_secs_f64() * 1e6 / nq as f64)
+}

From 43d62c07809a051026886cbae84810c54e06580f Mon Sep 17 00:00:00 2001
From: Ofer Shaal <oshaal@phase2technology.com>
Date: Thu, 4 Jun 2026 16:34:44 -0400
Subject: [PATCH 5/8] =?UTF-8?q?feat(filtered-bench):=20M3=20adversarial=20?=
 =?UTF-8?q?check=20(contender=20D)=20=E2=80=94=20win=20largely=20fails=20t?=
 =?UTF-8?q?he=20gate?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds predicate-aware-entry ACORN (the rule-#5 "tune harder" adversary):
- ruvector-acorn: acorn_search_seeded_counted (beam starts from caller seeds instead
  of multi-probe entry); acorn_search_impl refactored to take Option<seeds>, existing
  fns pass None — 13 acorn tests still green (behavior preserved).
- contenders.rs: Acorn::search_predicate_entry — stride-sample probes, test the predicate
  on each (uncounted, i.e. treated as free), distance-eval only the matching probes, and
  seed the beam from the nearest matches.
- examples/adversarial.rs: A vs best-of(vanilla-B, predicate-entry-D) at matched recall.

FINDING (rule #5 changed the verdict): predicate-aware entry slashes ACORN's cost at
HIGH correlation (rho=1 sel=0.1%: 3753 -> 203 evals), collapsing A's advantage from
44.7x (vs vanilla) to 2.4x — BELOW the pre-registered 5x bar. A vs best ACORN:
  rho=1.0: 2.4x / 2.3x / 1.9x (sel .001/.005/.01) — MISS at the 5x bar.
  rho=0.7: 38.8x / 14.6x / 6.5x — WIN (D's seeding is weak at moderate correlation,
           where matches are scattered so a seeded walk still wanders).
So A and predicate-entry-ACORN exploit the SAME structure and converge (~2x) at high
correlation; A's clean win is NOT robust to a properly-tuned ACORN. Honest verdict:
largely a KILL at the pre-registered bar, with a narrower conditional edge at rho~0.7.
Caveat favoring A: D's seeding leans on ~16k "free" predicate tests (the eval metric
ignores the O(1) predicate scan); at scale that scan isn't free, restoring some edge.
---
 crates/ruvector-acorn/src/search.rs           |  80 +++++++---
 .../examples/adversarial.rs                   | 144 ++++++++++++++++++
 .../ruvector-filtered-bench/src/contenders.rs |  49 +++++-
 3 files changed, 247 insertions(+), 26 deletions(-)
 create mode 100644 crates/ruvector-filtered-bench/examples/adversarial.rs

diff --git a/crates/ruvector-acorn/src/search.rs b/crates/ruvector-acorn/src/search.rs
index 1983600cc2..64e2d8879c 100644
--- a/crates/ruvector-acorn/src/search.rs
+++ b/crates/ruvector-acorn/src/search.rs
@@ -35,7 +35,7 @@ pub fn acorn_search(
     predicate: impl Fn(u32) -> bool,
 ) -> Vec<(u32, f32)> {
     let mut evals = 0u64;
-    acorn_search_impl(graph, query, k, ef, predicate, &mut evals)
+    acorn_search_impl(graph, query, k, ef, predicate, &mut evals, None)
 }
 
 /// Like [`acorn_search`] but also returns the exact number of distance
@@ -50,7 +50,24 @@ pub fn acorn_search_counted(
     predicate: impl Fn(u32) -> bool,
 ) -> (Vec<(u32, f32)>, u64) {
     let mut evals = 0u64;
-    let out = acorn_search_impl(graph, query, k, ef, predicate, &mut evals);
+    let out = acorn_search_impl(graph, query, k, ef, predicate, &mut evals, None);
+    (out, evals)
+}
+
+/// ACORN search seeded from caller-supplied entry nodes instead of the default
+/// multi-probe entry — the substrate for contender D (predicate-aware entry). The beam
+/// starts from `seeds` (each costs one distance-eval, counted); everything else is the
+/// identical predicate-agnostic traversal. Returns results + exact eval count.
+pub fn acorn_search_seeded_counted(
+    graph: &AcornGraph,
+    query: &[f32],
+    k: usize,
+    ef: usize,
+    predicate: impl Fn(u32) -> bool,
+    seeds: &[u32],
+) -> (Vec<(u32, f32)>, u64) {
+    let mut evals = 0u64;
+    let out = acorn_search_impl(graph, query, k, ef, predicate, &mut evals, Some(seeds));
     (out, evals)
 }
 
@@ -61,6 +78,7 @@ fn acorn_search_impl(
     ef: usize,
     predicate: impl Fn(u32) -> bool,
     evals: &mut u64,
+    seeds: Option<&[u32]>,
 ) -> Vec<(u32, f32)> {
     if graph.is_empty() {
         return vec![];
@@ -68,24 +86,6 @@ fn acorn_search_impl(
     let n = graph.len();
     let ef = ef.max(k);
 
-    // Multi-probe entry: sample evenly-spaced nodes to find a good starting
-    // point. O(probes × D) overhead vs O(n × D) for flat — negligible.
-    let n_probes = (n as f64).sqrt().ceil() as usize;
-    let n_probes = n_probes.clamp(4, 64);
-    // Count each probe distance once (result-identical to the min_by form, which
-    // recomputed l2_sq inside the comparator — the count reflects fundamental work).
-    let mut entry = 0u32;
-    let mut best = f32::INFINITY;
-    for i in 0..n_probes {
-        let cand = (i * n / n_probes) as u32;
-        let d = l2_sq(query, graph.row(cand as usize));
-        *evals += 1;
-        if d < best {
-            best = d;
-            entry = cand;
-        }
-    }
-
     let mut visited: Vec<bool> = vec![false; n];
     // Min-heap by distance — pop closest unexplored candidate first.
     let mut candidates: BinaryHeap<Reverse<(OrdF32, u32)>> = BinaryHeap::with_capacity(ef + 1);
@@ -95,11 +95,41 @@ fn acorn_search_impl(
     // candidate, used to gate eviction when the frontier exceeds ef.
     let mut farthest_in_beam: BinaryHeap<OrdF32> = BinaryHeap::with_capacity(ef + 1);
 
-    let d0 = l2_sq(query, graph.row(entry as usize));
-    *evals += 1;
-    candidates.push(Reverse((OrdF32(d0), entry)));
-    farthest_in_beam.push(OrdF32(d0));
-    visited[entry as usize] = true;
+    // Initial frontier: caller-supplied predicate-aware seeds (contender D), else the
+    // standard multi-probe entry. Multi-probe distances are counted once (result-identical
+    // to the original min_by form, which recomputed l2_sq inside the comparator).
+    let seed_ids: Vec<u32> = match seeds {
+        Some(s) if !s.is_empty() => s.iter().copied().filter(|&id| (id as usize) < n).collect(),
+        _ => {
+            let n_probes = (n as f64).sqrt().ceil() as usize;
+            let n_probes = n_probes.clamp(4, 64);
+            let mut entry = 0u32;
+            let mut best = f32::INFINITY;
+            for i in 0..n_probes {
+                let cand = (i * n / n_probes) as u32;
+                let d = l2_sq(query, graph.row(cand as usize));
+                *evals += 1;
+                if d < best {
+                    best = d;
+                    entry = cand;
+                }
+            }
+            vec![entry]
+        }
+    };
+    for &s in &seed_ids {
+        if visited[s as usize] {
+            continue;
+        }
+        let d = l2_sq(query, graph.row(s as usize));
+        *evals += 1;
+        candidates.push(Reverse((OrdF32(d), s)));
+        farthest_in_beam.push(OrdF32(d));
+        visited[s as usize] = true;
+    }
+    if candidates.is_empty() {
+        return vec![];
+    }
 
     while let Some(Reverse((OrdF32(curr_d), curr))) = candidates.pop() {
         // Pop curr's mirror entry from the farthest-tracker. Since the two
diff --git a/crates/ruvector-filtered-bench/examples/adversarial.rs b/crates/ruvector-filtered-bench/examples/adversarial.rs
new file mode 100644
index 0000000000..b362391fff
--- /dev/null
+++ b/crates/ruvector-filtered-bench/examples/adversarial.rs
@@ -0,0 +1,144 @@
+//! M3 adversarial check (protocol rule #5) — does predicate-aware-entry ACORN (contender D)
+//! erase region-pruning's win in its own regime?
+//!
+//! For the win cells (ρ≥0.7, sel≤5%) it reports vanilla ACORN (B), the best
+//! predicate-aware-entry ACORN (D, over a probe-budget sweep, tuned to match B's recall at
+//! fewest evals), and contender A matched to the same recall. The headline ratio is A vs the
+//! **cheaper** ACORN variant — so the win must survive the strongest ACORN we can build.
+//!
+//! Run: cargo run --release -p ruvector-filtered-bench --example adversarial -- [N] [Q] [nclusters]
+
+use ruvector_acorn::graph::exact_filtered_knn;
+use ruvector_acorn::search::acorn_search_counted;
+use ruvector_filtered_bench::contenders::{recall, Acorn};
+use ruvector_filtered_bench::data::{Dataset, FEAT_100K};
+use ruvector_filtered_bench::predicate;
+use ruvector_filtered_bench::prune::RegionPruneIvf;
+
+use rand::rngs::StdRng;
+use rand::{Rng, SeedableRng};
+use std::path::Path;
+
+const K: usize = 10;
+const GATE: f64 = 0.02;
+
+/// For each win cell (ρ≥0.7, sel≤5%), prints A's eval ratio vs vanilla B and vs min(B, D).
+fn main() {
+    let a: Vec<String> = std::env::args().collect();
+    let n: usize = a.get(1).and_then(|s| s.parse().ok()).unwrap_or(20_000);
+    let q_count: usize = a.get(2).and_then(|s| s.parse().ok()).unwrap_or(200);
+    let nclusters: usize = a.get(3).and_then(|s| s.parse().ok()).unwrap_or(64);
+    let (ef, seed) = (512, 7);
+
+    if !Path::new(FEAT_100K).exists() {
+        eprintln!("data not extracted ({FEAT_100K}); skipping.");
+        return;
+    }
+
+    let ds = Dataset::load_arxiv(n);
+    let n = ds.len();
+    let acorn = Acorn::build(&ds.feats, 2, ef);
+    let ivf = RegionPruneIvf::build(&ds.feats, nclusters, 15, seed);
+    let mut rng = StdRng::seed_from_u64(seed);
+    let queries: Vec<usize> = (0..q_count).map(|_| rng.gen_range(0..n)).collect();
+
+    let probe_budgets = [256usize, 1024, 4096, 16384];
+    let a_caps = [1usize, 2, 3, 4, 6, 8, 12, 16, 24, 32, 48, 64, 96, 128];
+
+    println!("\n=== M3 adversarial: A vs best-of(vanilla-B, predicate-entry-D) (n={n}, nclusters={}) ===", ivf.nclusters);
+    println!(
+        "{:>4} {:>6} | {:>7} {:>7} | {:>7} {:>7} {:>7} | {:>7} {:>7} | {:>6} {:>6} | verdict",
+        "ρ", "sel", "B_rec", "B_ev", "D_rec", "D_ev", "D_pb", "Am_rec", "Am_ev", "vsB", "vsBest"
+    );
+    println!("{}", "-".repeat(100));
+
+    for &rho in &[0.7_f64, 1.0] {
+        for &sel in &[0.001_f64, 0.005, 0.01, 0.05] {
+            let pred = predicate::correlated(&ds.labels, sel, rho, 0, &mut rng);
+            if pred.n_match < K {
+                continue;
+            }
+            let pf = pred.as_fn();
+            let truths: Vec<Vec<u32>> = queries
+                .iter()
+                .map(|&qi| {
+                    exact_filtered_knn(&ds.feats, &ds.feats[qi], K + 1, pf)
+                        .into_iter()
+                        .filter(|&id| id as usize != qi)
+                        .take(K)
+                        .collect()
+                })
+                .collect();
+
+            // B — vanilla ACORN.
+            let (b_rec, b_ev) = mean(&queries, &truths, |qi| {
+                let (g, e) = acorn_search_counted(&acorn.graph, &ds.feats[qi], K, ef, pf);
+                (g.into_iter().map(|(id, _)| id).collect(), e)
+            });
+
+            // D — predicate-aware entry; pick cheapest probe budget reaching B's recall.
+            let (mut d_rec, mut d_ev, mut d_pb) = (0.0, u64::MAX, 0usize);
+            for &pb in &probe_budgets {
+                let (r, e) = mean(&queries, &truths, |qi| {
+                    let res = acorn.search_predicate_entry(&ds.feats[qi], K, pf, pb, 4);
+                    (res.ids, res.evals)
+                });
+                if r >= b_rec - GATE && e < d_ev {
+                    d_rec = r;
+                    d_ev = e;
+                    d_pb = pb;
+                }
+            }
+            if d_ev == u64::MAX {
+                // none matched B's recall; report the highest-budget point.
+                let (r, e) = mean(&queries, &truths, |qi| {
+                    let res = acorn.search_predicate_entry(&ds.feats[qi], K, pf, 16384, 4);
+                    (res.ids, res.evals)
+                });
+                d_rec = r;
+                d_ev = e;
+                d_pb = 16384;
+            }
+
+            // A — matched to vanilla B's recall; uncapped exact B&B is the measured last resort.
+            let (mut am_rec, mut am_ev) = (0.0, 0u64);
+            for cap in a_caps.iter().copied().map(Some).chain(std::iter::once(None)) {
+                let (r, e) = mean(&queries, &truths, |qi| {
+                    let res = ivf.search(&ds.feats, &ds.feats[qi], K, pf, cap);
+                    (res.ids, res.evals)
+                });
+                am_rec = r;
+                am_ev = e;
+                if r >= b_rec - GATE {
+                    break;
+                }
+            }
+
+            let best_acorn = b_ev.min(d_ev);
+            let vs_b = if am_ev > 0 { b_ev as f64 / am_ev as f64 } else { 0.0 };
+            let vs_best = if am_ev > 0 { best_acorn as f64 / am_ev as f64 } else { 0.0 };
+            let target = if sel <= 0.01 { 5.0 } else { 2.0 };
+            let verdict = if vs_best >= target { "WIN" } else { "miss" };
+
+            println!(
+                "{rho:>4.1} {sel:>6.3} | {:>6.1}% {:>7} | {:>6.1}% {:>7} {:>7} | {:>6.1}% {:>7} | {:>5.1}× {:>5.1}× | {verdict}",
+                100.0 * b_rec, b_ev,
+                100.0 * d_rec, d_ev, d_pb,
+                100.0 * am_rec, am_ev,
+                vs_b, vs_best,
+            );
+        }
+    }
+}
+
+/// Mean recall and mean distance-evals per query (self-hits dropped); zeros for an empty set.
+fn mean(queries: &[usize], truths: &[Vec<u32>], mut run: impl FnMut(usize) -> (Vec<u32>, u64)) -> (f64, u64) {
+    let (mut rec, mut ev) = (0.0, 0u64);
+    for (&qi, truth) in queries.iter().zip(truths) {
+        let (ids, e) = run(qi);
+        let ids: Vec<u32> = ids.into_iter().filter(|&id| id as usize != qi).collect();
+        rec += recall(truth, &ids);
+        ev += e;
+    }
+    (rec / queries.len().max(1) as f64, ev / queries.len().max(1) as u64)
+}
diff --git a/crates/ruvector-filtered-bench/src/contenders.rs b/crates/ruvector-filtered-bench/src/contenders.rs
index 58cf491d22..b76daa43ee 100644
--- a/crates/ruvector-filtered-bench/src/contenders.rs
+++ b/crates/ruvector-filtered-bench/src/contenders.rs
@@ -6,11 +6,16 @@
 //! rule #2). Contender A (region-pruned IVF) arrives in M2 (`prune` module).
 
 use ruvector_acorn::graph::AcornGraph;
-use ruvector_acorn::search::acorn_search_counted;
+use ruvector_acorn::search::{acorn_search_counted, acorn_search_seeded_counted};
 
 /// ACORN edge budget base (γ·M neighbors/node); matches `AcornIndexGamma::M`.
 pub const ACORN_M: usize = 16;
 
+#[inline] // squared Euclidean distance over the zipped prefix of the two slices
+fn l2_sq(a: &[f32], b: &[f32]) -> f32 {
+    a.iter().zip(b.iter()).map(|(p, q)| p - q).map(|d| d * d).sum()
+}
+
 /// Outcome of one filtered query: the returned ids (nearest-first) and the exact
 /// number of distance evaluations spent — the pre-registered primary cost metric.
 pub struct QueryResult {
@@ -66,6 +71,48 @@ impl Acorn {
             .collect();
         QueryResult { ids, evals }
     }
+
+    /// **Contender D** — ACORN with *predicate-aware entry* (the adversarial "tune harder"
+    /// variant, rule #5). Stride-samples `max_entry_probes` nodes, tests the predicate on
+    /// each (O(1), uncounted — symmetric with how contender A gates distances), and
+    /// distance-evaluates only the *matching* probes to pick the `n_seeds` nearest matching
+    /// seeds. The agnostic beam then starts inside the matching region instead of walking to
+    /// it from a random entry. Standard-ACORN fallback if no probe matches or the graph is empty.
+    ///
+    /// Cost = (matching probes distance-evaluated) + seeded-search evals. At very low
+    /// selectivity a bounded sample usually finds no match → D degenerates to B.
+    pub fn search_predicate_entry(
+        &self,
+        query: &[f32],
+        k: usize,
+        predicate: impl Fn(u32) -> bool,
+        max_entry_probes: usize,
+        n_seeds: usize,
+    ) -> QueryResult {
+        let n = self.graph.len();
+        let probes = if n == 0 { 0 } else { max_entry_probes.clamp(1, n) }; // clamp panics if min > max
+        let mut evals = 0u64;
+        let mut seeds: Vec<(f32, u32)> = Vec::new();
+        for i in 0..probes {
+            let id = (i * n / probes) as u32;
+            if !predicate(id) {
+                continue;
+            }
+            let d = l2_sq(query, self.graph.row(id as usize));
+            evals += 1;
+            seeds.push((d, id));
+        }
+        if seeds.is_empty() {
+            // No matching seed in the sample → standard ACORN entry.
+            let (got, ev) = acorn_search_counted(&self.graph, query, k, self.ef, predicate);
+            return QueryResult { ids: got.into_iter().map(|(id, _)| id).collect(), evals: evals + ev };
+        }
+        seeds.sort_by(|a, b| a.0.total_cmp(&b.0));
+        seeds.truncate(n_seeds.max(1));
+        let seed_ids: Vec<u32> = seeds.iter().map(|&(_, id)| id).collect();
+        let (got, ev) = acorn_search_seeded_counted(&self.graph, query, k, self.ef, predicate, &seed_ids);
+        QueryResult { ids: got.into_iter().map(|(id, _)| id).collect(), evals: evals + ev }
+    }
 }
 
 /// Recall@k against an exact filtered-kNN truth set: fraction of the true top-k

From 5523706553c384704a31d2458f04e8066fd9103e Mon Sep 17 00:00:00 2001
From: Ofer Shaal <oshaal@phase2technology.com>
Date: Thu, 4 Jun 2026 16:37:28 -0400
Subject: [PATCH 6/8] =?UTF-8?q?docs(adr):=20ADR-201=20=E2=80=94=20region-p?=
 =?UTF-8?q?runed=20filtered=20ANN=20vs=20ACORN,=20qualified=20NO-GO=20(M4)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Writes up the BET 2 ⊗ BET 4 outcome with ADR-199/200 honesty. Verdict: region-pruned
IVF beats VANILLA ACORN 6-48x evals (4.7-26x wall-clock) at sel<=1%, but the
pre-registered >=5x WIN does NOT survive the rule-#5 adversarial check — giving ACORN
a predicate-aware entry collapses the gap to ~2x at high correlation (rho=1), below the
bar. A retains a narrow conditional edge at moderate correlation (rho~0.7, 6-39x) plus
an at-scale caveat (D's seeding leans on a ~full predicate scan the eval metric treats
as free). Net: the bet does not cleanly pay; the clean win was an artifact of an
under-equipped incumbent. Central lesson: a filtered-ANN cost claim is meaningless
without a predicate-aware-entry baseline.

Also strips a stray tag from the pre-registration doc (non-semantic).
---
 ...201-region-pruned-filtered-ann-vs-acorn.md | 156 ++++++++++++++++++
 .../bet2-filtered-ann/PRE-REGISTRATION.md     |   1 -
 2 files changed, 156 insertions(+), 1 deletion(-)
 create mode 100644 docs/adr/ADR-201-region-pruned-filtered-ann-vs-acorn.md

diff --git a/docs/adr/ADR-201-region-pruned-filtered-ann-vs-acorn.md b/docs/adr/ADR-201-region-pruned-filtered-ann-vs-acorn.md
new file mode 100644
index 0000000000..5f29465e38
--- /dev/null
+++ b/docs/adr/ADR-201-region-pruned-filtered-ann-vs-acorn.md
@@ -0,0 +1,156 @@
+---
+adr: 201
+title: "Region-Pruned IVF for Filtered ANN vs ACORN: Qualified NO-GO"
+status: proposed
+date: 2026-06-04
+authors: [ofershaal, claude-flow]
+related: [ADR-193, ADR-196, ADR-199, ADR-200]
+tags: [ruvector, retrieval, ann, filtered-search, acorn, ivf, region-pruning, no-go]
+---
+
+# ADR-201 — Region-Pruned IVF for Filtered ANN vs ACORN: Qualified NO-GO
+
+## Status
+
+**Proposed — qualified NO-GO at the pre-registered bar (2026-06-04).** BET 2 ⊗ BET 4 of the
+SepRAG exploration (issue #534): does region-pruned IVF search beat the in-repo `ruvector-acorn`
+incumbent on *correlated* filtered queries? Pre-registration:
+[`docs/plans/bet2-filtered-ann/PRE-REGISTRATION.md`](../plans/bet2-filtered-ann/PRE-REGISTRATION.md).
+
+Region-pruning beats *vanilla* ACORN by 6–48× distance-evals (and 4.7–26× wall-clock) at
+selectivity ≤ 1%. **But the pre-registered ≥5× WIN does not survive the mandatory adversarial
+check (protocol rule #5):** giving ACORN a *predicate-aware entry* — a simple, known enhancement
+— collapses the advantage to **~2× at high correlation (ρ=1), below the 5× bar.** A retains a
+real but **narrow, conditional** edge at *moderate* correlation (ρ≈0.7, 6–39×) and very low
+selectivity, plus an at-scale metric caveat that favours it. Net: the bet **does not cleanly
+pay**; the clean win was an artifact of an under-equipped incumbent.
+
+## Context
+
+Filtered ANN ("nearest among items matching predicate X") is a real flat-ANN weakness: a
+post-filter graph walk starves at low selectivity. `ruvector-acorn` (SIGMOD 2024,
+arXiv:2403.04871) fixes this with a denser γ·M graph + predicate-agnostic traversal, and is the
+strong in-repo incumbent. The hypothesis (BET 2 ⊗ BET 4): when the predicate **correlates** with
+embedding-cluster structure (the production metadata-filter case — `tenant`, `doc_type`, `year`,
+`category`), an IVF hierarchy can **skip whole clusters with zero matches** and beat ACORN on
+cost. On embeddings the pruning kernel cannot use graph separators (high treewidth, [ADR-199]),
+so the substrate is the treewidth-immune IVF hierarchy (`ruvector-rairs`, [ADR-193]) — BET 4 is
+the mechanism, BET 2 the benchmark.
+
+## Method
+
+Self-contained crate `ruvector-filtered-bench` (depends only on `ruvector-acorn` +
+`ruvector-rairs`; independent of [ADR-200]/PR #535). Real ogbn-arxiv (n=20k slice, 128-d, 40
+subject labels). Ground truth = `ruvector-acorn::exact_filtered_knn`. Cost = **distance-evals/
+query** (hardware-independent), with wall-clock as an honesty guard. Predicates built by a
+ρ-correlation knob holding selectivity *exactly* constant across ρ (shuffle a fraction 1−ρ of a
+structured label-class set), so cost deltas are attributable to correlation, not set size.
+
+Contenders, all scored against the same oracle, all reporting **exact** distance-evals (ACORN
+was instrumented with additive, result-preserving `*_counted` search variants):
+- **A** — region-pruned IVF (`prune::RegionPruneIvf`): k-means partition + two stacked prunings
+  — skip zero-match clusters (predicate) and a triangle-inequality branch-and-bound on cluster
+  radius (exact). The salvaged separator-tree B&B kernel ([ADR-196]) on the IVF hierarchy.
+- **B** — tuned vanilla ACORN (γ=2, ef swept; ef=512 ≈ 92% recall at sel=1%).
+- **C** — post-filter floor (retrieve top-pool unfiltered, then filter).
+- **D** — ACORN with predicate-aware entry (the rule-#5 "tune harder" adversary): sample probes,
+  predicate-test free, distance-eval only matching probes, seed the beam from the nearest match.
+
+## Evidence
+
+### The benchmark has teeth (negative control, M1)
+
+Post-filter (C) vs agnostic ACORN (B) on the *same* graph, ρ=1, recall@10:
+
+| sel | B (agnostic) | C (post-filter) |
+|---|---|---|
+| 0.1% | 73.7% | **22.7%** |
+| 0.5% | 90.4% | **59.7%** |
+| 1% | 92.6% | 79.3% |
+| ≥5% | (converge) | (fine) |
+
+A 50+ point swing at low selectivity → the benchmark can distinguish methods (it is not
+insensitive). Tuned ACORN reaches ~92.6% recall @ ~1622 evals/query at sel=1%; its eval count is
+~flat in ef (early-termination-bound), so "tuned" = crank ef for recall at near-constant cost.
+
+### A vs vanilla ACORN — large win (M3 sweep, nclusters=64, cost at matched recall)
+
+| ρ | sel | ACORN-B evals | A evals | ev-ratio | wall-clock ratio |
+|---|---|---|---|---|---|
+| 1.0 | 0.1% | 3753 | 145 | 25.9× | 22.5× |
+| 1.0 | 0.5% | 2152 | 164 | 13.1× | 8.3× |
+| 1.0 | 1% | 1622 | 264 | 6.1× | 4.7× |
+| 1.0 | 5% | 955 | 628 | 1.5× | 1.6× |
+| 0.7 | 1% | 1710 | 189 | 9.0× | 6.4× |
+
+A's exact B&B has recall ≥ ACORN (≈1.0). Win is monotonic in selectivity and **selectivity-
+driven** (it also holds at ρ=0 in the sparse regime — partially refuting the pre-registered
+*correlation* mechanism: correlation governs recall quality, not the eval win). sel=5% already
+misses the ≥2× sub-bar.
+
+### A vs **predicate-aware-entry** ACORN — the win collapses (M3 adversarial, rule #5)
+
+| ρ | sel | vanilla B | **tuned D** | A | A vs **best ACORN** |
+|---|---|---|---|---|---|
+| 1.0 | 0.1% | 3753 | **203** | 84 | **2.4× — MISS** |
+| 1.0 | 0.5% | 2152 | **377** | 164 | **2.3× — MISS** |
+| 1.0 | 1% | 1622 | **508** | 264 | **1.9× — MISS** |
+| 0.7 | 0.1% | 4009 | 3100 | 80 | 38.8× — WIN |
+| 0.7 | 1% | 1769 | 1388 | 214 | 6.5× — WIN |
+
+**Predicate-aware entry cuts ACORN's cost up to ~18× at high correlation** (3753→203 evals),
+because seeding the beam at any matching node lands it inside the tight match cluster, finishing
+in a few hops. A and D then exploit the *same* structure and converge to within ~2×. The win
+**inverts with correlation**: A beats D decisively (6–39×) only at *moderate* ρ=0.7, where D's
+sampled seed often lands on a scattered random match and the walk still wanders.
+
+## Decision / Finding
+
+**Qualified NO-GO at the pre-registered ≥5× bar.** Region-pruned IVF does *not* cleanly beat a
+properly-tuned ACORN. The headline 6–48× win is against *vanilla* ACORN; once ACORN is given a
+predicate-aware entry (a simple, standard enhancement), the gap at high correlation falls to
+~2×, below the bar. The pre-registered WIN required ≥5× at sel≤1% for ρ≥0.7 — met at ρ=0.7,
+**failed at ρ=1.0** — so the conjunction does not hold.
+
+What *did* hold, honestly:
+- A's **exact** recall (1.0) dominates ACORN's ~92% — a quality, not cost, advantage.
+- A retains a **6–39× cost edge at moderate correlation (ρ≈0.7) and sel≤1%**, where ACORN's
+  predicate-aware seeding is ineffective.
+- **At-scale caveat (favours A):** D's seeding leans on predicate-testing ~16k nodes that the
+  distance-eval metric counts as free (O(1) predicate vs 128-d distance). At billion-scale a near-
+  full predicate scan per query is *not* free; that cost would partially restore A's edge. The
+  metric flatters D in exactly the regime where D wins.
+
+## Consequences
+
+- **Do not productionize region-pruned IVF as a general ACORN replacement.** The clean win was an
+  artifact of benchmarking an under-equipped incumbent — caught only by the rule-#5 adversarial
+  check, which is the central lesson: *a filtered-ANN cost claim is meaningless without a
+  predicate-aware-entry baseline.*
+- The B&B region-pruning kernel is **correct and exact** (validated vs the oracle) and remains a
+  reusable asset; its cost advantage is real but narrow and regime-dependent.
+- The honest open question worth a follow-up: at **large n**, where D's per-query predicate scan
+  is genuinely costly, does A's edge re-open? That is the only condition under which this bet
+  could flip to a WIN, and it is not yet tested.
+
+## Boundaries / not proven
+
+- Single dataset (ogbn-arxiv), n=20k, k=10, 200 queries (per-point noise ~±1%).
+- Label-derived correlation as a proxy for production metadata filters.
+- ACORN's lite single-layer graph is weak in dense regions (recall non-monotonic at high
+  selectivity); the comparison is fair (both use it) but absolute recalls are modest there.
+- D's seed-finding is one realization of predicate-aware entry; a smarter one could differ.
+- The at-scale (large-n) regime — where the verdict might flip — is unmeasured.
+
+## Next steps
+
+1. **Large-n re-test** (n ≥ 10⁵–10⁶, ≥500 queries): the one condition that could flip the
+   verdict (D's predicate scan stops being free). If A's edge re-opens there, revisit.
+2. Otherwise, close BET 2 ⊗ BET 4 as a qualified NO-GO and retain the exact B&B kernel as a
+   validated asset for the narrow ρ≈0.7 / very-low-selectivity regime.
+
+## Alternatives considered
+
+- **Region-pruning on graph separators** (not IVF) — rejected upstream ([ADR-199]: embedding
+  graphs are high-treewidth).
+- **Believing the vanilla-ACORN win** — rejected: it does not survive the adversarial check.
diff --git a/docs/plans/bet2-filtered-ann/PRE-REGISTRATION.md b/docs/plans/bet2-filtered-ann/PRE-REGISTRATION.md
index b0c0309b40..affe25ec2d 100644
--- a/docs/plans/bet2-filtered-ann/PRE-REGISTRATION.md
+++ b/docs/plans/bet2-filtered-ann/PRE-REGISTRATION.md
@@ -117,4 +117,3 @@ live in `src/`; the harness is `examples/filtered_ann_pruning.rs`. No dependency
 - The uncorrelated/agnostic regime as a *target* (kill control only — ACORN owns it).
 - Multi-predicate conjunctions, streaming updates, the live-GNN metric (BET 1's frontier).
 - Disk-resident / billion-scale (in-memory ogbn-arxiv is the stage).
-</content>

From dfb15d3737f00f143c05d0c964a40ca58afcb573 Mon Sep 17 00:00:00 2001
From: Ofer Shaal <oshaal@phase2technology.com>
Date: Thu, 4 Jun 2026 16:47:03 -0400
Subject: [PATCH 7/8] =?UTF-8?q?docs(adr):=20ADR-201=20next-steps=20?=
 =?UTF-8?q?=E2=80=94=20multi-predicate=20conjunction=20hypothesis?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The experiment's own evidence points to two flip conditions (conjunctions where
ACORN's predicate-seeding degrades but cluster-skip composes; large-n where the
predicate scan stops being free) and the open BET 4 standalone baseline.
---
 ...201-region-pruned-filtered-ann-vs-acorn.md | 22 +++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/docs/adr/ADR-201-region-pruned-filtered-ann-vs-acorn.md b/docs/adr/ADR-201-region-pruned-filtered-ann-vs-acorn.md
index 5f29465e38..0c5cbf94e0 100644
--- a/docs/adr/ADR-201-region-pruned-filtered-ann-vs-acorn.md
+++ b/docs/adr/ADR-201-region-pruned-filtered-ann-vs-acorn.md
@@ -144,10 +144,24 @@ What *did* hold, honestly:
 
 ## Next steps
 
-1. **Large-n re-test** (n ≥ 10⁵–10⁶, ≥500 queries): the one condition that could flip the
-   verdict (D's predicate scan stops being free). If A's edge re-opens there, revisit.
-2. Otherwise, close BET 2 ⊗ BET 4 as a qualified NO-GO and retain the exact B&B kernel as a
-   validated asset for the narrow ρ≈0.7 / very-low-selectivity regime.
+Two conditions, both surfaced by this experiment's own evidence, could flip the verdict to a
+scoped WIN — they are the honest follow-ups, not the result:
+
+1. **Multi-predicate conjunctions (the strongest lead).** Under `X ∧ Y ∧ Z`, region-pruning's
+   cluster-skip **composes** (skip clusters with zero conjunction-matches), while ACORN's
+   predicate-aware entry (contender D) **degrades sharply** — a *sampled* seed satisfying *all*
+   conjuncts becomes exponentially unlikely as the conjunction tightens, so D regresses toward
+   vanilla ACORN's cost while A stays cheap. This is precisely the regime where A could beat a
+   tuned ACORN *even at high correlation*. This ADR's experiments hold one predicate fixed;
+   conjunctions were out of scope. Highest-leverage next bet, and it reuses this harness.
+2. **Large-n re-test** (n ≥ 10⁵–10⁶, ≥500 queries): D's seeding leans on a ~full predicate scan
+   the distance-eval metric treats as free; at scale that scan is genuinely costly, which could
+   re-open A's edge. Add a predicate-scan cost term and/or measure wall-clock at n=10⁶.
+3. **(Lower priority) BET 4 standalone:** the IVF region-pruning kernel was validated as BET 2's
+   *mechanism* but never run vs the original *plain-IVF-probe* baseline. The kernel is exact; the
+   standalone "beats plain IVF" head-to-head is technically still open.
+4. If none of the above re-open it, close BET 2 ⊗ BET 4 as a qualified NO-GO and retain the
+   exact B&B kernel as a validated asset for the narrow ρ≈0.7 / very-low-selectivity regime.
 
 ## Alternatives considered
 

From 7c10ade929e9cb0a0c58f8f737b122cbc9a01f81 Mon Sep 17 00:00:00 2001
From: Ofer Shaal <oshaal@phase2technology.com>
Date: Thu, 4 Jun 2026 17:03:59 -0400
Subject: [PATCH 8/8] =?UTF-8?q?docs(adr):=20retract=20conjunction=20lead?=
 =?UTF-8?q?=20in=20ADR-201=20(scrutiny=20=E2=80=94=20it=20doesn't=20hold)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A conjunction is a single O(1) boolean predicate of selectivity ≈ product; in the
distance-eval metric it reduces to (selectivity, scatter) — both already swept. The
'exponentially-unlikely seed' reasoning was wrong (testing a conjunction is O(1)).
Residual leads downgraded to narrow/speculative (predicate-eval cost, large-n).
Recommend closing BET 2 ⊗ BET 4; thread value is BET 1 productionization + BET 3.
---
 ...201-region-pruned-filtered-ann-vs-acorn.md | 40 ++++++++++++-------
 1 file changed, 26 insertions(+), 14 deletions(-)

diff --git a/docs/adr/ADR-201-region-pruned-filtered-ann-vs-acorn.md b/docs/adr/ADR-201-region-pruned-filtered-ann-vs-acorn.md
index 0c5cbf94e0..6d68671aec 100644
--- a/docs/adr/ADR-201-region-pruned-filtered-ann-vs-acorn.md
+++ b/docs/adr/ADR-201-region-pruned-filtered-ann-vs-acorn.md
@@ -144,24 +144,36 @@ What *did* hold, honestly:
 
 ## Next steps
 
-Two conditions, both surfaced by this experiment's own evidence, could flip the verdict to a
-scoped WIN — they are the honest follow-ups, not the result:
-
-1. **Multi-predicate conjunctions (the strongest lead).** Under `X ∧ Y ∧ Z`, region-pruning's
-   cluster-skip **composes** (skip clusters with zero conjunction-matches), while ACORN's
-   predicate-aware entry (contender D) **degrades sharply** — a *sampled* seed satisfying *all*
-   conjuncts becomes exponentially unlikely as the conjunction tightens, so D regresses toward
-   vanilla ACORN's cost while A stays cheap. This is precisely the regime where A could beat a
-   tuned ACORN *even at high correlation*. This ADR's experiments hold one predicate fixed;
-   conjunctions were out of scope. Highest-leverage next bet, and it reuses this harness.
+**Retraction (2026-06-04, post-verdict scrutiny).** An earlier draft of this section named
+*multi-predicate conjunctions* as "the strongest lead," on the reasoning that a seed satisfying
+all conjuncts is "exponentially unlikely" to sample. **That reasoning is wrong and is retracted.**
+A conjunction `X ∧ Y ∧ Z` is a single boolean predicate of selectivity ≈ the product, evaluated in
+**O(1)** by both A and ACORN-D. In the distance-eval metric a conjunction is therefore invisible
+*as* a conjunction — only its **selectivity** and **geometric scatter** matter, and both axes are
+already swept here (the selectivity sweep × the ρ-knob). ACORN-D finds a conjunction-seed by
+sampling at exactly the rate it finds any seed of that selectivity. The multi-modal rescue also
+fails: top-k nearest matches are almost always local to one mode, so D's seed lands correctly.
+**Conjunctions do not favour region-pruning in this cost model.**
+
+The honest residual leads (both narrow):
+
+1. **Predicate-evaluation cost** (a *different* cost axis, excluded here). ACORN's agnostic
+   traversal tests the predicate on **every** expanded node (~1600/query); A tests it on far
+   fewer (probed-cluster members) and can precompute per-attribute per-cluster bitmaps. When
+   predicate evaluation is *expensive* (many attributes, costly lookups — and conjunctions
+   amplify this), A's asymmetry could matter. But for cheap metadata predicates this term is
+   small vs a 128-d distance evaluation, so the regime is narrow. Testing it would require a predicate-eval cost model.
 2. **Large-n re-test** (n ≥ 10⁵–10⁶, ≥500 queries): D's seeding leans on a ~full predicate scan
-   the distance-eval metric treats as free; at scale that scan is genuinely costly, which could
-   re-open A's edge. Add a predicate-scan cost term and/or measure wall-clock at n=10⁶.
+   this metric treats as free; at scale that scan is genuinely costly, which *could* re-open A's
+   edge. The most concrete remaining check.
 3. **(Lower priority) BET 4 standalone:** the IVF region-pruning kernel was validated as BET 2's
    *mechanism* but never run vs the original *plain-IVF-probe* baseline. The kernel is exact; the
    standalone "beats plain IVF" head-to-head is technically still open.
-4. If none of the above re-open it, close BET 2 ⊗ BET 4 as a qualified NO-GO and retain the
-   exact B&B kernel as a validated asset for the narrow ρ≈0.7 / very-low-selectivity regime.
+
+**Recommendation:** treat BET 2 ⊗ BET 4 as **closed** (qualified NO-GO). The residual leads are
+narrow/speculative; the SepRAG thread's remaining value is productionizing BET 1 (the proven WIN,
+[ADR-200]) and exploring BET 3 (multi-hop KG, a different mechanism). Retain the exact B&B kernel
+as a validated asset for the narrow ρ≈0.7 / very-low-selectivity regime.
 
 ## Alternatives considered