prostomarkeloff · prostomarkeloff · May 30, 2026 · May 30, 2026
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 name = "difflib-fast"
-version = "0.2.0"
-description = "Fast, byte-for-byte exact difflib Ratcliff–Obershelp (gestalt) similarity ratio + single-linkage clustering, via a suffix automaton."
+version = "0.3.0"
+description = "Fast, byte-for-byte exact difflib Ratcliff–Obershelp (gestalt) similarity ratio + single-linkage clustering (suffix automaton), plus an exact all-pairs weighted-cosine similarity join (L2AP, CPU+GPU)."
 keywords = ["difflib", "similarity", "ratcliff-obershelp", "suffix-automaton", "fuzzy"]
 categories = ["algorithms", "text-processing"]
 edition = "2021"
@@ -48,6 +48,10 @@ gpu = ["dep:metal"]
 # lengths. Costs ~10-20% wall when active (relaxed-ordered atomics on the hot path); the
 # `cfg(feature)` gate compiles to a no-op in default builds so production stays untouched.
 instrument = []
+# `profiling` marks `simjoin`'s hot phases `#[inline(never)]` so the sampler (samply) attributes
+# self-time per phase (candidate-gen vs verify vs index-suffix) instead of one inlined `cosine_join`
+# blob. Pure observability — with the feature off, codegen is identical to a default build. Never
+# ship it.
+profiling = []
 
 # `objc`'s `sel!`/`msg_send!` macros (pulled in transitively by the `metal` crate under the `gpu`
 # feature) expand to `cfg(feature = "cargo-clippy")` checks — whitelist that cfg so the lint stays
@@ -67,6 +71,11 @@ pedantic = { level = "deny", priority = -1 }
 name = "bench"
 required-features = ["bench"]
 
+# GPU-vs-CPU throughput experiment for the simjoin verify step (Apple Metal). Needs the `gpu` feature.
+[[example]]
+name = "simjoin_gpu_bench"
+required-features = ["gpu"]
+
 # Keep symbols + line tables in release so an external sampler (samply) can resolve frames.
 [profile.release]
 debug = "line-tables-only"

diff --git a/README.md b/README.md
@@ -42,7 +42,7 @@ assert_eq!(ratio("the quick brown fox", "the quick brown dog"), 0.89473684210526
 ```
 
 ```toml
-difflib-fast = "0.1"
+difflib-fast = "0.3"
 ```
 
 ---
@@ -140,7 +140,7 @@ default (the GPU paths remain opt-in via `DFGPU_RATIO_MANY_THRESHOLD` / `DFGPU_M
 the feature off, on non-macOS, or with no Metal device, every call quietly runs on CPU.
 
 ```toml
-difflib-fast = { version = "0.1", features = ["gpu"] }   # macOS only
+difflib-fast = { version = "0.3", features = ["gpu"] }   # macOS only
 ```
 
 ---
@@ -218,7 +218,7 @@ pick the wheel for you — grab the one for your platform from the
 
 ```bash
 # macOS Apple Silicon — swap the filename for your platform (see below):
-pip install https://github.com/prostomarkeloff/difflib-fast/releases/download/v0.2.0/difflib_fast-0.2.0-cp39-abi3-macosx_11_0_arm64.whl
+pip install https://github.com/prostomarkeloff/difflib-fast/releases/download/v0.3.0/difflib_fast-0.3.0-cp39-abi3-macosx_11_0_arm64.whl
 ```
 
 | platform | wheel suffix |
@@ -298,6 +298,48 @@ default, so list both).
 
 ---
 
+## Also: exact cosine similarity join (`simjoin`)
+
+The same "exact, or it's a bug" discipline, pointed at a different metric. **`simjoin`** is an exact
+all-pairs **weighted-cosine** similarity join over sparse non-negative vectors — *every* pair with
+`cos ≥ t`, no LSH, no approximation — on the provably-SOTA **L2AP** algorithm (inverted index +
+Cauchy–Schwarz prefix pruning; Anastasiu & Karypis, ICDE'14). It's the principled exact replacement for
+"shingle candidates → verify" near-duplicate detection: documents = functions, dimensions = canonical
+lines, weights = IDF — i.e. **exact Type-3 code-clone detection**.
+
+```python
+import difflib_fast as df
+
+# documents as token lists → TF-IDF in Rust → every pair with cosine ≥ 0.8
+docs = [["def _fn(_v0):", "return _v0 + 1"],
+        ["def _fn(_v0):", "return _v0 + 1"],   # an exact clone of doc 0
+        ["import os", "import sys"]]
+df.cosine_join(docs, 0.8)          # → [(0, 1, 1.0)]   tuples are (j, i, cos), j < i
+df.cosine_join(docs, 0.8, "gpu")   # same join, the dot-products run on the Metal GPU
+```
+
+Three backends, one argument (`concurrency=`) — all auto-parallel across every core (rayon, GIL
+released, exactly like `ratio`):
+
+| `concurrency` | how | result |
+|---|---|---|
+| `"cpu"` | L2AP on all cores | exact `f64` |
+| `"gpu+cpu"` | CPU prunes ~99% of candidates, GPU verifies the rest (f32 filter), CPU re-scores survivors exactly | **byte-identical to `"cpu"`** |
+| `"gpu"` | CPU prunes, GPU verifies, emit the f32 score | ε-exact (≤ 1 differing pair per **millions**) |
+
+On the real **top-300 PyPI** corpus (287,408 functions, 3.1M clone pairs found) the verify is
+memory-**bandwidth**-bound, and the Apple GPU's memory-level parallelism wins it: **53 GB/s** of
+random-gather sparse dot-products vs the CPU's 22 GB/s, so the GPU backends run the whole join
+**~1.8–2× faster than the (already L2AP-tuned) CPU** — byte-for-byte identical with `"gpu+cpu"`,
+ε-exact with `"gpu"`. Brute force would be ~4·10¹⁰ pairs
+(hours); this is seconds. `CosineJoiner(docs)` is the stateful handle (build corpus + GPU upload once,
+sweep thresholds); full numbers in [`benchmarks.md`](benchmarks.md#6-similarity-join-simjoin).
+
+In Rust: `difflib_fast::simjoin::{Corpus, cosine_join, cosine_join_with, CosineJoiner}` (GPU backends
+behind the `gpu` feature). Same correctness gate as the rest of the crate — the indexed join is
+asserted **bit-identical to an O(n²) brute-force oracle** on hundreds of fuzzed corpora.
+
+---
+
 ## How it works
 
 The metric is **Ratcliff–Obershelp** (Ratcliff & Obershelp, 1988) computed over a **suffix automaton**

diff --git a/benchmarks.md b/benchmarks.md
@@ -2,7 +2,9 @@
 
 Exact **Ratcliff–Obershelp** (Python `difflib.SequenceMatcher(..., autojunk=False).ratio()`) throughput
 and head-to-head vs every other implementation we could find. `pairs/s` = pairwise `ratio` decisions
-per second. Speedups (`Nx`) are **difflib-fast ÷ competitor**.
+per second. Speedups (`Nx`) are **difflib-fast ÷ competitor**. §1–5 + Landscape cover the RO path;
+[**§6**](#6-similarity-join-simjoin) benches the crate's other exact capability — `simjoin`, an
+all-pairs weighted-cosine similarity join (L2AP) — on a real 287k-function corpus, CPU vs GPU.
 
 ## Setup
 
@@ -270,3 +272,60 @@ exists to close.
 The libraries that beat difflib-fast outright on raw speed (RapidFuzz, strsim) do so by computing a
 **different metric** (Indel/Levenshtein), not difflib's ratio — so they aren't drop-in replacements for
 `difflib.ratio()`.
+
+---
+
+## 6. Similarity join (`simjoin`) — exact weighted-cosine
+
+A different capability in the same crate, held to the same exactness bar. **`simjoin`** is an exact
+all-pairs **weighted-cosine** similarity join over sparse non-negative vectors (every pair with
+`cos ≥ t`), on the SOTA **L2AP** algorithm — inverted index + Cauchy–Schwarz prefix-prune (Anastasiu &
+Karypis, ICDE'14). The exact replacement for shingle-candidate + verify near-duplicate detection
+(functions × IDF-weighted canonical lines = exact Type-3 clone detection). Correctness gate: the indexed
+join is **bit-identical to an `O(n²)` brute-force oracle** on fuzzed corpora — the same
+"two implementations, one answer" discipline as the RO path.
+
+**Corpus:** the real **top-300 PyPI** Type-3 snapshot — **287,408 functions**, 1.53M distinct canonical
+lines, mean 11.4 lines/function. Brute force is `287k²/2 ≈ 4·10¹⁰` pairs (hours); the join runs in
+seconds. M3 Pro (6 P + 6 E cores), 12 threads.
+
+### 6a. Full join — three backends
+
+`cosine_join` across its three `Concurrency` backends. `"cpu"` and `"gpu+cpu"` are **byte-for-byte
+identical**; `"gpu"` (pure f32) differs by ≤ 1 pair in millions — cosine is a sum of non-negative
+products, so there's no cancellation, f32 is ~1e-6 accurate, and only threshold-boundary pairs can flip:
+
+| threshold | pairs found | `cpu` (exact f64) | `gpu+cpu` (exact f64) | `gpu` (f32) |
+|---|---|---|---|---|
+| 0.8 | 3,115,369 | 2.9–3.5 s | **1.6–2.0 s · ~1.8×** | 1.6 s · 1.9× |
+| 0.7 | 4,427,097 | 4.6–5.2 s | 2.4 s · 1.9× | **2.3 s · 2.0×** |
+
+(`gpu` differed from exact by **1 pair of 3,115,369** at t=0.8, **0 of 4,427,097** at t=0.7; max cosine
+gap on shared pairs 8.8e-7.)
+
+### 6b. Why the GPU wins — the verify is bandwidth-bound
+
+The join's dominant cost is the **verify**: ~10⁸ candidate pairs, each an `O(nnz)` sparse dot that
+gathers two random CSR rows. With every core gathering at once it's memory-**bandwidth**-bound (not
+compute), and the Apple GPU sustains far more in-flight memory requests against the same unified-memory
+pool. Measured on 20M random sparse-dot pairs (173 B/pair, f32):
+
+| backend | throughput | effective gather bandwidth | % of ~150 GB/s peak |
+|---|---|---|---|
+| GPU (Metal, incl. pair upload) | 309 M pairs/s | **53 GB/s** | ~36% |
+| CPU rayon (12 threads) | 126 M pairs/s | 22 GB/s | ~14% |
+| CPU serial | 17 M pairs/s | 2.9 GB/s | ~2% |
+
+GPU f32 matches CPU f32 to **6e-8** (the kernel is correct). On a random-gather pattern, reaching 36%
+of peak reflects the GPU's memory-level-parallelism edge — the CPU stalls on each random gather where
+the GPU keeps hundreds in flight.
+
+**Honest read.** The hardware (Metal) is **f32-only** — no `double` — so a GPU dot can't be
+bit-identical to the CPU `f64`. `gpu+cpu` works around it: the GPU f32 dot is a *conservative filter*
+(it rejects only what's clearly below `t`), and the CPU recomputes the exact `f64` score on the ~3% of
+candidates that survive it — so the pair set + scores are byte-identical to `cpu`. The `gpu` backend skips the
+re-verify and emits f32 (≤1 differing pair per millions, immaterial at a similarity threshold). The CPU
+backend itself is already L2AP-tuned to the bandwidth wall — branchless sorted-merge dot, cache-packed
+prune state, full-index-then-parallel-probe — so the GPU's ~2× sits on top of an already-fast baseline
+(itself ~23× over a naive inverted-index join on synthetic Zipfian data). Methodology + Metal kernel:
+[`examples/simjoin_gpu_bench.rs`](examples/simjoin_gpu_bench.rs), `src/simjoin_gpu.rs`.
diff --git a/examples/simjoin_bench.rs b/examples/simjoin_bench.rs
@@ -0,0 +1,115 @@
+//! Bench harness for `simjoin::cosine_join` on a synthetic Zipfian-skewed sparse corpus (few common
+//! dims, many rare — like IDF-weighted lines/tokens). Reproducible, no I/O, scalable.
+//!
+//! `cargo run --release --example simjoin_bench -- [n] [nnz] [ndims] [threshold] [reps]`
+//! defaults: n=100000 nnz=14 ndims=20000 t=0.7 reps=3
+
+#![allow(clippy::cast_possible_truncation, clippy::cast_precision_loss, clippy::cast_sign_loss)]
+
+use std::time::Instant;
+
+use difflib_fast::simjoin::{cosine_join, Corpus};
+#[cfg(feature = "profiling")]
+use difflib_fast::simjoin::cosine_join_counts;
+
+/// Parses the `i`-th command-line argument as `T`; a missing or unparsable argument yields `def`.
+fn arg<T: std::str::FromStr>(i: usize, def: T) -> T {
+    match std::env::args().nth(i).map(|raw| raw.parse::<T>()) {
+        Some(Ok(value)) => value,
+        _ => def,
+    }
+}
+
+/// Deterministic xorshift → `n` IDF-weighted sparse rows (a zero `seed` would make the stream constant
+/// 0). Fresh rows draw `nnz` dims with a cubic bias toward low ids; rows are then sorted and deduped, so
+/// a row may hold fewer than `nnz` dims. Each weight is the dim's IDF `ln(n / df)` over this corpus.
+fn gen(n: usize, nnz: usize, ndims: usize, seed: u64) -> Vec<Vec<(u32, f64)>> {
+    let mut s = seed;
+    let mut next = move || {
+        s ^= s << 13; // 64-bit xorshift step, shift triple (13, 7, 17)
+        s ^= s >> 7;
+        s ^= s << 17;
+        s
+    };
+    // 1. distinct dims per vector (cubic skew → Zipfian-ish frequencies). ~5% of vectors are planted
+    //    near-duplicates of an earlier one (so real cosine clusters exist → the verify path runs).
+    let mut sets: Vec<Vec<u32>> = Vec::with_capacity(n);
+    for i in 0..n {
+        let dup = i > 0 && (next() % 100) < 5; // row 0 has no earlier row to copy; no draw is consumed
+        let mut v: Vec<u32> = if dup {
+            let src = (next() as usize) % i; // index of an earlier row
+            let mut c = sets[src].clone();
+            if !c.is_empty() {
+                let k = (next() as usize) % c.len();
+                c[k] = (next() % ndims as u64) as u32; // mutate one dim
+            }
+            c
+        } else {
+            (0..nnz)
+                .map(|_| {
+                    let u = (next() >> 11) as f64 / (1u64 << 53) as f64; // [0,1)
+                    ((u * u * u) * ndims as f64) as u32 % ndims as u32 // u³ favours low ids; needs ndims > 0
+                })
+                .collect()
+        };
+        v.sort_unstable();
+        v.dedup(); // repeated draws collapse here
+        sets.push(v);
+    }
+    let mut df = vec![0u32; ndims];
+    for v in &sets {
+        for &d in v {
+            df[d as usize] += 1; // rows are deduped, so this counts rows containing `d`
+        }
+    }
+    // 2. weight each present dim by its IDF (df ≥ 1 for every present dim; `max(1.0)` is only a guard).
+    sets.into_iter()
+        .map(|v| {
+            v.into_iter()
+                .map(|d| {
+                    let idf = (n as f64 / f64::from(df[d as usize]).max(1.0)).ln();
+                    (d, idf)
+                })
+                .collect()
+        })
+        .collect()
+}
+
+/// Bench entry point: builds the synthetic corpus once, then times `reps` runs of `cosine_join`.
+///
+/// Positional CLI arguments, each falling back to its default when missing or unparsable:
+/// `[n] [nnz] [ndims] [threshold] [reps]`. `reps` is clamped to at least 1 so the min/median report
+/// always has a sample to index (a literal `0` previously panicked on `ms[0]`).
+fn main() {
+    let n: usize = arg(1, 100_000);
+    let nnz: usize = arg(2, 14);
+    let ndims: usize = arg(3, 20_000);
+    let t: f64 = arg(4, 0.7);
+    let reps: usize = arg::<usize>(5, 3).max(1);
+
+    // Fixed seed → the same corpus on every run; only the corpus build is timed here, not `gen`.
+    let rows = gen(n, nnz, ndims, 0x1234_5678_9abc_def1);
+    let build0 = Instant::now();
+    let corpus = Corpus::from_rows(&rows);
+    let build_ms = build0.elapsed().as_secs_f64() * 1000.0;
+
+    // Strategy diagnostic (profiling builds only): posting touches / candidates / pairs. The
+    // candidates-per-pair ratio decides whether to prune harder or speed the dot up.
+    #[cfg(feature = "profiling")]
+    if std::env::var("STATS").is_ok() {
+        let (ncand, survivors, pairs) = cosine_join_counts(&corpus, t);
+        eprintln!(
+            "STATS n={n} t={t} | candidates={ncand} survivors(cos_full)={survivors} pairs={pairs} \
+             | prune_pass={:.4} cos_full_saved={:.4} survivor_precision={:.3}",
+            survivors as f64 / ncand.max(1) as f64,
+            1.0 - survivors as f64 / ncand.max(1) as f64,
+            pairs as f64 / survivors.max(1) as f64,
+        );
+    }
+
+    let mut ms: Vec<f64> = Vec::with_capacity(reps);
+    let mut npairs = 0usize;
+    for _ in 0..reps {
+        let t0 = Instant::now();
+        let pairs = cosine_join(&corpus, t);
+        ms.push(t0.elapsed().as_secs_f64() * 1000.0);
+        npairs = pairs.len();
+        // Keep the result observable so the optimizer cannot discard the join.
+        std::hint::black_box(&pairs);
+    }
+    ms.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
+    // `ms` holds `reps >= 1` samples, so both indices are in bounds; for an even count the
+    // "median" is the upper of the two middle samples.
+    eprintln!(
+        "n={n} nnz={nnz} ndims={ndims} t={t} | build={build_ms:.0}ms | join: min={:.1}ms median={:.1}ms | pairs={npairs}",
+        ms[0],
+        ms[ms.len() / 2],
+    );
+}