From 69cbd0a0c2857380006ac6734e93aaf1540232ef Mon Sep 17 00:00:00 2001 From: Joao Henrique Machado Silva Date: Mon, 25 May 2026 09:47:09 +0200 Subject: [PATCH] fix hnsw delete reinsert rebuild --- docs/sql-engine.md | 2 +- examples/nodejs-notes/README.md | 13 +-- examples/nodejs-notes/src/ingest.mjs | 16 +-- src/sql/db/table.rs | 51 +++++++++- src/sql/executor.rs | 17 ++-- src/sql/hnsw.rs | 141 ++++++++++++++++++--------- src/sql/mod.rs | 55 +++++++++++ src/sql/pager/mod.rs | 52 +--------- 8 files changed, 216 insertions(+), 131 deletions(-) diff --git a/docs/sql-engine.md b/docs/sql-engine.md index 195679d..d534893 100644 --- a/docs/sql-engine.md +++ b/docs/sql-engine.md @@ -133,7 +133,7 @@ Two specialized shortcuts in [`src/sql/executor.rs`](../src/sql/executor.rs) rec ORDER BY vec_distance_l2(, ) ASC LIMIT k ``` -Returns top-k from the HNSW graph in `O(log N)` per probe. Mirrored shapes for `vec_distance_cosine` and `vec_distance_dot`. +Returns top-k from the HNSW graph in `O(log N)` per probe. Mirrored shapes for `vec_distance_cosine` and `vec_distance_dot`. INSERT maintains HNSW incrementally. DELETE / UPDATE mark the graph dirty; the next INSERT on the indexed vector column rebuilds the in-memory graph from surviving rows before adding the new node, and save/COMMIT still rebuilds dirty graphs before serializing. `try_fts_probe` (Phase 8b) — BM25 keyword: diff --git a/examples/nodejs-notes/README.md b/examples/nodejs-notes/README.md index cd814a6..6d31e37 100644 --- a/examples/nodejs-notes/README.md +++ b/examples/nodejs-notes/README.md @@ -247,17 +247,6 @@ Default is 0.5. time and won't notice newly-ingested files until you reload it anyway, so the gain from coexisting isn't worth the surprise. -- **HNSW after delete + re-insert (engine bug, SQLR-8).** The - engine's HNSW chunk index panics when rows are deleted and re- - inserted within the same connection lifetime — `index out of - bounds` inside `DistanceMetric::compute`. The ingest pipeline - works around it by splitting refresh into a delete-phase → - close/reopen → insert-phase, which forces a clean index rebuild - on next open. We pay the close/reopen hop only when there are - actual deletions (so first-time `init` skips it). Once SQLR-8 - lands in the engine, `src/ingest.mjs` can drop the `db.reopen()` - call. - - **Aggregates limited.** The engine supports `COUNT(*)`, `SUM`/`AVG`/`MIN`/`MAX` but not arbitrary expressions in `SELECT` projection beyond aggregates (see [`docs/supported-sql.md`](../../docs/supported-sql.md)). @@ -319,7 +308,7 @@ examples/nodejs-notes/ │ ├── db.mjs # schema, migrations, every SQL string │ ├── chunker.mjs # frontmatter + heading-aware chunking │ ├── embeddings.mjs # hash + OpenAI embedders -│ ├── ingest.mjs # plan → delete → reopen → insert +│ ├── ingest.mjs # plan → delete → insert │ ├── search.mjs # hybrid retrieval driver │ ├── serve.mjs # spawn sqlrite-mcp --read-only │ └── claude-config.mjs # Claude Desktop snippet renderer diff --git a/examples/nodejs-notes/src/ingest.mjs b/examples/nodejs-notes/src/ingest.mjs index 2c4cfcf..be49a7b 100644 --- a/examples/nodejs-notes/src/ingest.mjs +++ b/examples/nodejs-notes/src/ingest.mjs @@ -10,11 +10,7 @@ // // Both flow through `ingestImpl`, which splits the work into three // phases: PLAN (read-only diff against the current DB) → DELETE (drop -// stale documents/chunks; close + reopen the DB) → INSERT (write new -// rows). The close/reopen between DELETE and INSERT is a workaround -// for an engine bug where the HNSW chunk index panics when rows are -// deleted and re-inserted in the same connection lifetime — see the -// "Known limitations" section of this example's README. +// stale documents/chunks) → INSERT (write new rows). import { readFile, readdir, stat } from 'node:fs/promises'; import { createHash } from 'node:crypto'; @@ -192,15 +188,6 @@ async function ingestImpl({ db, root, embedder, logger, chunkOpts, mode }) { // ---------------------------------------------------------------- // PHASE 2 — deletes (and replacing-deletes). - // - // The engine's HNSW index has a bug where rows deleted and re- - // inserted within the same connection lifetime can corrupt the - // index's stored vectors (see ../README.md "Known limitations"). - // Closing + reopening the connection between the delete-pass and - // the insert-pass forces a full index rebuild on next open, - // sidestepping the issue. We only pay this cost when there's - // actually something to delete; pure-INSERT runs (first `init`) - // skip this hop entirely. if (hasMutations) { db.transaction(() => { for (const id of planDeletes) db.deleteDocument(id); @@ -208,7 +195,6 @@ async function ingestImpl({ db, root, embedder, logger, chunkOpts, mode }) { if (e.plan.priorId !== null) db.deleteDocument(e.plan.priorId); } }); - db.reopen(); } // ---------------------------------------------------------------- diff --git a/src/sql/db/table.rs b/src/sql/db/table.rs index 392cc80..6a5952a 100644 --- a/src/sql/db/table.rs +++ b/src/sql/db/table.rs @@ -1237,7 +1237,7 @@ impl Table { // pulls from the table's row storage (which we *just* updated // with the new value). if let Some(Value::Vector(new_vec)) = &typed_value { - self.maintain_hnsw_on_insert(key, next_rowid, new_vec); + self.maintain_hnsw_on_insert(key, next_rowid, new_vec)?; } // Step 4 (Phase 8b): maintain any FTS indexes on this column. @@ -1257,7 +1257,9 @@ impl Table { /// the borrowing dance — we need both `&self.rows` (read other /// vectors) and `&mut self.hnsw_indexes` (insert into the graph) — /// stays localized. - fn maintain_hnsw_on_insert(&mut self, column: &str, rowid: i64, new_vec: &[f32]) { + fn maintain_hnsw_on_insert(&mut self, column: &str, rowid: i64, new_vec: &[f32]) -> Result<()> { + self.rebuild_dirty_hnsw_indexes()?; + // Snapshot the current vector storage so the get_vec closure // doesn't fight with `&mut self.hnsw_indexes`. For a typical // HNSW insert we touch ef_construction × log(N) other vectors, @@ -1279,9 +1281,52 @@ impl Table { if entry.column_name == column { entry.index.insert(rowid, new_vec, |id| { vec_snapshot.get(&id).cloned().unwrap_or_default() - }); + })?; + } + } + Ok(()) + } + + /// Rebuilds any dirty HNSW index on this table from the current + /// vector column storage. DELETE / UPDATE mark indexes dirty because + /// stale graph edges may still point at removed rowids; this makes + /// the next in-memory operation see a clean graph without requiring + /// a close/reopen or save round-trip. + pub fn rebuild_dirty_hnsw_indexes(&mut self) -> Result<()> { + let dirty: Vec<(String, String, DistanceMetric)> = self + .hnsw_indexes + .iter() + .filter(|e| e.needs_rebuild) + .map(|e| (e.name.clone(), e.column_name.clone(), e.metric)) + .collect(); + if dirty.is_empty() { + return Ok(()); + } + + for (idx_name, col_name, metric) in dirty { + let mut vectors: Vec<(i64, Vec)> = Vec::new(); + { + let row_data = self.rows.lock().expect("rows mutex poisoned"); + if let Some(Row::Vector(map)) = row_data.get(&col_name) { + for (id, v) in map.iter() { + vectors.push((*id, v.clone())); + } + } + } + + let snapshot: HashMap> = vectors.iter().cloned().collect(); + let mut new_idx = HnswIndex::new(metric, 0xC0FFEE); + vectors.sort_by_key(|(id, _)| *id); + for (id, v) in &vectors { + new_idx.insert(*id, v, |q| snapshot.get(&q).cloned().unwrap_or_default())?; + } + + if let Some(entry) = self.hnsw_indexes.iter_mut().find(|e| e.name == idx_name) { + entry.index = new_idx; + entry.needs_rebuild = false; } } + Ok(()) } /// After a row insert, push the new (rowid, text) into every FTS diff --git a/src/sql/executor.rs b/src/sql/executor.rs index 8f57f3c..181cd90 100644 --- a/src/sql/executor.rs +++ b/src/sql/executor.rs @@ -1468,7 +1468,7 @@ fn create_hnsw_index( let v_clone = v.clone(); idx.insert(*rowid, &v_clone, |id| { vec_map.get(&id).cloned().unwrap_or_default() - }); + })?; } } @@ -1863,12 +1863,15 @@ fn try_hnsw_probe(table: &Table, order_expr: &Expr, k: usize) -> Option // module stays decoupled from the SQL types. let column_for_closure = col_name.clone(); let table_ref = table; - let result = entry.index.search(&query_vec, k, |id| { - match table_ref.get_value(&column_for_closure, id) { - Some(Value::Vector(v)) => v, - _ => Vec::new(), - } - }); + let result = entry + .index + .search(&query_vec, k, |id| { + match table_ref.get_value(&column_for_closure, id) { + Some(Value::Vector(v)) => v, + _ => Vec::new(), + } + }) + .ok()?; Some(result) } diff --git a/src/sql/hnsw.rs b/src/sql/hnsw.rs index 5665d35..bbca1d7 100644 --- a/src/sql/hnsw.rs +++ b/src/sql/hnsw.rs @@ -53,6 +53,8 @@ use std::cmp::Ordering; use std::collections::{BinaryHeap, HashMap, HashSet}; +use crate::error::{Result, SQLRiteError}; + /// Distance metric used by the HNSW index. Must match what the /// surrounding `vec_distance_*` SQL function would compute on the same /// pair of vectors — otherwise the index probe and the brute-force @@ -107,8 +109,14 @@ impl DistanceMetric { /// will prefer any finite alternative, which matches the SQL-level /// behaviour where `vec_distance_cosine` errors but the optimizer's /// fallback path simply skips the offending row. - pub fn compute(self, a: &[f32], b: &[f32]) -> f32 { - debug_assert_eq!(a.len(), b.len(), "vector dim mismatch in HNSW distance"); + pub fn compute(self, a: &[f32], b: &[f32]) -> Result { + if a.is_empty() || b.is_empty() || a.len() != b.len() { + return Err(SQLRiteError::General(format!( + "HNSW vector dimension mismatch: left has {}, right has {}", + a.len(), + b.len() + ))); + } match self { DistanceMetric::L2 => { let mut sum = 0.0f32; @@ -116,7 +124,7 @@ impl DistanceMetric { let d = a[i] - b[i]; sum += d * d; } - sum.sqrt() + Ok(sum.sqrt()) } DistanceMetric::Cosine => { let mut dot = 0.0f32; @@ -129,9 +137,9 @@ impl DistanceMetric { } let denom = (na * nb).sqrt(); if denom == 0.0 { - f32::INFINITY + Ok(f32::INFINITY) } else { - 1.0 - dot / denom + Ok(1.0 - dot / denom) } } DistanceMetric::Dot => { @@ -139,7 +147,7 @@ impl DistanceMetric { for i in 0..a.len() { dot += a[i] * b[i]; } - -dot + Ok(-dot) } } } @@ -284,12 +292,12 @@ impl HnswIndex { /// re-inserting an existing id is a no-op (returns without error). /// `vec` is the new node's vector; `get_vec` looks up the vector /// for any other node id the algorithm touches. - pub fn insert(&mut self, node_id: i64, vec: &[f32], get_vec: F) + pub fn insert(&mut self, node_id: i64, vec: &[f32], get_vec: F) -> Result<()> where F: Fn(i64) -> Vec, { if self.nodes.contains_key(&node_id) { - return; + return Ok(()); } // First node: trivial case. Becomes entry point at layer 0. @@ -302,7 +310,7 @@ impl HnswIndex { ); self.entry_point = Some(node_id); self.top_layer = 0; - return; + return Ok(()); } // Pick a layer for this new node. @@ -321,7 +329,7 @@ impl HnswIndex { // because the new node doesn't live there. let mut entry = self.entry_point.expect("non-empty index has entry point"); for layer in (target_layer + 1..=self.top_layer).rev() { - let nearest = self.search_layer(vec, &[entry], 1, layer, &get_vec); + let nearest = self.search_layer(vec, &[entry], 1, layer, &get_vec)?; if let Some((_, id)) = nearest.into_iter().next() { entry = id; } @@ -333,7 +341,7 @@ impl HnswIndex { let mut entries = vec![entry]; for layer in (0..=target_layer).rev() { let candidates = - self.search_layer(vec, &entries, self.params.ef_construction, layer, &get_vec); + self.search_layer(vec, &entries, self.params.ef_construction, layer, &get_vec)?; // Pick up to M neighbors from candidates (M_max0 at layer 0 // since we allow more connections at the dense layer). @@ -368,8 +376,12 @@ impl HnswIndex { let nb_vec = get_vec(nb); let mut by_dist: Vec<(f32, i64)> = nb_layers[layer] .iter() - .map(|id| (self.distance.compute(&nb_vec, &get_vec(*id)), *id)) - .collect(); + .map(|id| { + self.distance + .compute(&nb_vec, &get_vec(*id)) + .map(|d| (d, *id)) + }) + .collect::>>()?; by_dist .sort_by(|(da, _), (db, _)| da.partial_cmp(db).unwrap_or(Ordering::Equal)); by_dist.truncate(m_max); @@ -387,22 +399,23 @@ impl HnswIndex { self.top_layer = target_layer; self.entry_point = Some(node_id); } + Ok(()) } /// Returns the k nearest node ids to `query`, in distance-ascending /// order (closest first). Empty index returns an empty Vec. - pub fn search(&self, query: &[f32], k: usize, get_vec: F) -> Vec + pub fn search(&self, query: &[f32], k: usize, get_vec: F) -> Result> where F: Fn(i64) -> Vec, { if self.is_empty() || k == 0 { - return Vec::new(); + return Ok(Vec::new()); } // Greedy descent from the top down to layer 1. let mut entry = self.entry_point.expect("non-empty index has entry point"); for layer in (1..=self.top_layer).rev() { - let nearest = self.search_layer(query, &[entry], 1, layer, &get_vec); + let nearest = self.search_layer(query, &[entry], 1, layer, &get_vec)?; if let Some((_, id)) = nearest.into_iter().next() { entry = id; } @@ -410,9 +423,9 @@ impl HnswIndex { // Beam search at layer 0 with width = max(ef_search, k). let ef = self.params.ef_search.max(k); - let candidates = self.search_layer(query, &[entry], ef, 0, &get_vec); + let candidates = self.search_layer(query, &[entry], ef, 0, &get_vec)?; - candidates.into_iter().take(k).map(|(_, id)| id).collect() + Ok(candidates.into_iter().take(k).map(|(_, id)| id).collect()) } /// Runs a beam search at one layer starting from `entries`, returning @@ -430,7 +443,7 @@ impl HnswIndex { ef: usize, layer: usize, get_vec: &F, - ) -> Vec<(f32, i64)> + ) -> Result> where F: Fn(i64) -> Vec, { @@ -444,7 +457,7 @@ impl HnswIndex { if !visited.insert(id) { continue; } - let d = self.distance.compute(query, &get_vec(id)); + let d = self.distance.compute(query, &get_vec(id))?; candidates.push(MinHeapItem { dist: d, id }); results.push(MaxHeapItem { dist: d, id }); } @@ -474,7 +487,7 @@ impl HnswIndex { if !visited.insert(nb) { continue; } - let d = self.distance.compute(query, &get_vec(nb)); + let d = self.distance.compute(query, &get_vec(nb))?; let admit = if results.len() < ef { true } else { @@ -497,7 +510,7 @@ impl HnswIndex { out.push((item.dist, item.id)); } out.reverse(); - out + Ok(out) } /// Picks a layer for a new node using the standard HNSW geometric @@ -506,7 +519,7 @@ impl HnswIndex { /// - P(L=0) ≈ 1 - 1/M = 15/16 /// - P(L=1) ≈ 1/16 - 1/256 /// - P(L=2) ≈ 1/256 - … - /// i.e., most new nodes live only at layer 0; a few percolate up. + /// i.e., most new nodes live only at layer 0; a few percolate up. fn pick_layer(&mut self) -> usize { let u = self.next_uniform().max(1e-6); // guard log(0) let layer = (-u.ln() * self.params.m_l).floor() as usize; @@ -629,7 +642,7 @@ mod tests { let mut by_dist: Vec<(f32, i64)> = vectors .iter() .enumerate() - .map(|(i, v)| (metric.compute(query, v), i as i64)) + .map(|(i, v)| (metric.compute(query, v).expect("matching dims"), i as i64)) .collect(); by_dist.sort_by(|(a, _), (b, _)| a.partial_cmp(b).unwrap_or(Ordering::Equal)); by_dist.into_iter().take(k).map(|(_, id)| id).collect() @@ -650,7 +663,9 @@ mod tests { fn empty_index_returns_empty_search() { let idx = HnswIndex::new(DistanceMetric::L2, 42); let vectors: Vec> = vec![]; - let result = idx.search(&[0.0; 4], 5, |id| vectors[id as usize].clone()); + let result = idx + .search(&[0.0; 4], 5, |id| vectors[id as usize].clone()) + .unwrap(); assert!(result.is_empty()); } @@ -658,9 +673,12 @@ mod tests { fn single_node_returns_only_itself() { let mut idx = HnswIndex::new(DistanceMetric::L2, 42); let v0 = vec![1.0, 2.0, 3.0]; - let vectors = vec![v0.clone()]; - idx.insert(0, &v0, |id| vectors[id as usize].clone()); - let result = idx.search(&[0.0; 3], 5, |id| vectors[id as usize].clone()); + let vectors = [v0.clone()]; + idx.insert(0, &v0, |id| vectors[id as usize].clone()) + .unwrap(); + let result = idx + .search(&[0.0; 3], 5, |id| vectors[id as usize].clone()) + .unwrap(); assert_eq!(result, vec![0]); } @@ -668,20 +686,36 @@ mod tests { fn duplicate_insert_is_noop() { let mut idx = HnswIndex::new(DistanceMetric::L2, 42); let v0 = vec![1.0, 2.0]; - let vectors = vec![v0.clone()]; - idx.insert(0, &v0, |id| vectors[id as usize].clone()); - idx.insert(0, &v0, |id| vectors[id as usize].clone()); + let vectors = [v0.clone()]; + idx.insert(0, &v0, |id| vectors[id as usize].clone()) + .unwrap(); + idx.insert(0, &v0, |id| vectors[id as usize].clone()) + .unwrap(); assert_eq!(idx.len(), 1); } + #[test] + fn distance_rejects_empty_or_mismatched_vectors() { + let empty_err = DistanceMetric::L2.compute(&[1.0, 2.0], &[]).unwrap_err(); + assert!(format!("{empty_err}").contains("dimension mismatch")); + + let mismatch_err = DistanceMetric::Cosine + .compute(&[1.0, 2.0], &[1.0]) + .unwrap_err(); + assert!(format!("{mismatch_err}").contains("left has 2, right has 1")); + } + #[test] fn k_zero_returns_empty() { let mut idx = HnswIndex::new(DistanceMetric::L2, 42); - let vectors = vec![vec![1.0, 0.0], vec![0.0, 1.0]]; + let vectors = [vec![1.0, 0.0], vec![0.0, 1.0]]; for (i, v) in vectors.iter().enumerate() { - idx.insert(i as i64, v, |id| vectors[id as usize].clone()); + idx.insert(i as i64, v, |id| vectors[id as usize].clone()) + .unwrap(); } - let result = idx.search(&[0.5, 0.5], 0, |id| vectors[id as usize].clone()); + let result = idx + .search(&[0.5, 0.5], 0, |id| vectors[id as usize].clone()) + .unwrap(); assert!(result.is_empty()); } @@ -698,16 +732,21 @@ mod tests { ]; let mut idx = HnswIndex::new(DistanceMetric::L2, 42); for (i, v) in vectors.iter().enumerate() { - idx.insert(i as i64, v, |id| vectors[id as usize].clone()); + idx.insert(i as i64, v, |id| vectors[id as usize].clone()) + .unwrap(); } // Query at (1, 1): nearest is (0, 0). - let result = idx.search(&[1.0, 1.0], 1, |id| vectors[id as usize].clone()); + let result = idx + .search(&[1.0, 1.0], 1, |id| vectors[id as usize].clone()) + .unwrap(); assert_eq!(result, vec![0]); // Query at (5.5, 5.5): top-3 should be id=4 (5,5), then any // two of the corners at distance ~7.78. - let result = idx.search(&[5.5, 5.5], 3, |id| vectors[id as usize].clone()); + let result = idx + .search(&[5.5, 5.5], 3, |id| vectors[id as usize].clone()) + .unwrap(); assert_eq!(result.len(), 3); assert_eq!(result[0], 4, "closest to (5.5,5.5) should be id=4"); } @@ -728,13 +767,16 @@ mod tests { let mut idx = HnswIndex::new(DistanceMetric::L2, 42); for (i, v) in vectors.iter().enumerate() { - idx.insert(i as i64, v, |id| vectors[id as usize].clone()); + idx.insert(i as i64, v, |id| vectors[id as usize].clone()) + .unwrap(); } let mut total_recall = 0.0f32; for _ in 0..queries { let q = random_vec(&mut state, dim); - let hnsw_top = idx.search(&q, k, |id| vectors[id as usize].clone()); + let hnsw_top = idx + .search(&q, k, |id| vectors[id as usize].clone()) + .unwrap(); let baseline = brute_force_topk(&vectors, &q, k, DistanceMetric::L2); total_recall += recall_at_k(&hnsw_top, &baseline); } @@ -759,13 +801,16 @@ mod tests { let mut idx = HnswIndex::new(DistanceMetric::Cosine, 42); for (i, v) in vectors.iter().enumerate() { - idx.insert(i as i64, v, |id| vectors[id as usize].clone()); + idx.insert(i as i64, v, |id| vectors[id as usize].clone()) + .unwrap(); } let mut total_recall = 0.0f32; for _ in 0..queries { let q = random_vec(&mut state, dim); - let hnsw_top = idx.search(&q, k, |id| vectors[id as usize].clone()); + let hnsw_top = idx + .search(&q, k, |id| vectors[id as usize].clone()) + .unwrap(); let baseline = brute_force_topk(&vectors, &q, k, DistanceMetric::Cosine); total_recall += recall_at_k(&hnsw_top, &baseline); } @@ -791,7 +836,8 @@ mod tests { for i in 0..50 { vectors.push(random_vec(&mut state, dim)); let v = vectors[i].clone(); - idx.insert(i as i64, &v, |id| vectors[id as usize].clone()); + idx.insert(i as i64, &v, |id| vectors[id as usize].clone()) + .unwrap(); // Check invariant. let entry = idx.entry_point.expect("non-empty"); @@ -816,7 +862,8 @@ mod tests { for i in 0..200 { vectors.push(random_vec(&mut state, dim)); let v = vectors[i].clone(); - idx.insert(i as i64, &v, |id| vectors[id as usize].clone()); + idx.insert(i as i64, &v, |id| vectors[id as usize].clone()) + .unwrap(); } for (id, node) in &idx.nodes { @@ -848,8 +895,12 @@ mod tests { let mut idx_a = HnswIndex::new(DistanceMetric::L2, 42); let mut idx_b = HnswIndex::new(DistanceMetric::L2, 42); for (i, v) in vectors.iter().enumerate() { - idx_a.insert(i as i64, v, |id| vectors[id as usize].clone()); - idx_b.insert(i as i64, v, |id| vectors[id as usize].clone()); + idx_a + .insert(i as i64, v, |id| vectors[id as usize].clone()) + .unwrap(); + idx_b + .insert(i as i64, v, |id| vectors[id as usize].clone()) + .unwrap(); } // Same top layer. diff --git a/src/sql/mod.rs b/src/sql/mod.rs index ef4dc11..5b05783 100644 --- a/src/sql/mod.rs +++ b/src/sql/mod.rs @@ -1808,6 +1808,61 @@ mod tests { ); } + #[test] + fn hnsw_delete_then_insert_rebuilds_in_same_connection() { + let mut db = Database::new(String::from("test_db")); + process_command( + "CREATE TABLE chunks (id INTEGER PRIMARY KEY, document_id INTEGER, embedding VECTOR(4));", + &mut db, + ) + .unwrap(); + process_command( + "CREATE INDEX idx_emb ON chunks USING hnsw (embedding);", + &mut db, + ) + .unwrap(); + process_command( + "INSERT INTO chunks (document_id, embedding) VALUES (1, [1, 0, 0, 0]);", + &mut db, + ) + .unwrap(); + process_command( + "INSERT INTO chunks (document_id, embedding) VALUES (1, [0, 1, 0, 0]);", + &mut db, + ) + .unwrap(); + process_command("DELETE FROM chunks WHERE document_id = 1;", &mut db).unwrap(); + + process_command( + "INSERT INTO chunks (document_id, embedding) VALUES (2, [0, 0, 1, 0]);", + &mut db, + ) + .unwrap(); + let chunks = db.get_table("chunks".to_string()).unwrap(); + let entry = chunks + .hnsw_indexes + .iter() + .find(|e| e.name == "idx_emb") + .unwrap(); + assert!( + !entry.needs_rebuild, + "INSERT should rebuild the dirty index" + ); + assert_eq!(entry.index.len(), 1); + + let out = process_command_with_render( + "SELECT document_id FROM chunks ORDER BY vec_distance_l2(embedding, [0, 0, 1, 0]) ASC LIMIT 1;", + &mut db, + ) + .unwrap(); + assert!(out.status.contains("1 row returned"), "got: {}", out.status); + let rendered = out.rendered.expect("SELECT should render rows"); + assert!( + rendered.contains("| 2 |"), + "expected the post-delete inserted row: {rendered}" + ); + } + #[test] fn update_on_hnsw_indexed_vector_col_succeeds_and_marks_dirty() { let mut db = seed_hnsw_table(); diff --git a/src/sql/pager/mod.rs b/src/sql/pager/mod.rs index e9cd3d0..6701918 100644 --- a/src/sql/pager/mod.rs +++ b/src/sql/pager/mod.rs @@ -280,7 +280,7 @@ fn save_database_with_mode(db: &mut Database, path: &Path, compact: bool) -> Res // marked dirty. Done up front under the &mut Database borrow we // already hold, before the immutable iteration loops below need // their own borrow. - rebuild_dirty_hnsw_indexes(db); + rebuild_dirty_hnsw_indexes(db)?; // Phase 8b — same drill for FTS indexes flagged by DELETE / UPDATE. rebuild_dirty_fts_indexes(db); @@ -1251,55 +1251,11 @@ fn parse_hnsw_create_index_sql(sql: &str) -> Result<(String, String, DistanceMet /// trade-off SQLite makes for FTS5: dirtying-and-rebuilding is the /// MVP, more sophisticated incremental delete strategies (soft-delete /// + tombstones, neighbor reconnection) are future polish. -fn rebuild_dirty_hnsw_indexes(db: &mut Database) { - use crate::sql::hnsw::HnswIndex; - +fn rebuild_dirty_hnsw_indexes(db: &mut Database) -> Result<()> { for table in db.tables.values_mut() { - // Snapshot which (index_name, column, metric) triples need - // rebuilding, before we go grabbing column data — keeps the - // borrow structure simple. The per-entry metric matters here: - // rebuilding a cosine-built graph as L2 (or vice versa) would - // silently corrupt the topology and break the SQLR-28 probe. - let dirty: Vec<(String, String, DistanceMetric)> = table - .hnsw_indexes - .iter() - .filter(|e| e.needs_rebuild) - .map(|e| (e.name.clone(), e.column_name.clone(), e.metric)) - .collect(); - if dirty.is_empty() { - continue; - } - - for (idx_name, col_name, metric) in dirty { - // Snapshot every (rowid, vec) for this column. - let mut vectors: Vec<(i64, Vec)> = Vec::new(); - { - let row_data = table.rows.lock().expect("rows mutex poisoned"); - if let Some(Row::Vector(map)) = row_data.get(&col_name) { - for (id, v) in map.iter() { - vectors.push((*id, v.clone())); - } - } - } - // Pre-build a HashMap for the get_vec closure so we don't - // pay O(N) lookup per insert call. - let snapshot: std::collections::HashMap> = - vectors.iter().cloned().collect(); - - let mut new_idx = HnswIndex::new(metric, 0xC0FFEE); - // Sort by id so the rebuild is deterministic across runs. - vectors.sort_by_key(|(id, _)| *id); - for (id, v) in &vectors { - new_idx.insert(*id, v, |q| snapshot.get(&q).cloned().unwrap_or_default()); - } - - // Replace the entry's index + clear the dirty flag. - if let Some(entry) = table.hnsw_indexes.iter_mut().find(|e| e.name == idx_name) { - entry.index = new_idx; - entry.needs_rebuild = false; - } - } + table.rebuild_dirty_hnsw_indexes()?; } + Ok(()) } /// Synthesises the CREATE INDEX SQL stored back into `sqlrite_master`