diff --git a/docs/sql-engine.md b/docs/sql-engine.md
index 195679d..d534893 100644
--- a/docs/sql-engine.md
+++ b/docs/sql-engine.md
@@ -133,7 +133,7 @@ Two specialized shortcuts in [`src/sql/executor.rs`](../src/sql/executor.rs) rec
ORDER BY vec_distance_l2(
, ) ASC LIMIT k
```
-Returns top-k from the HNSW graph in `O(log N)` per probe. Mirrored shapes for `vec_distance_cosine` and `vec_distance_dot`.
+Returns top-k from the HNSW graph in `O(log N)` per probe. Mirrored shapes for `vec_distance_cosine` and `vec_distance_dot`. INSERT maintains HNSW incrementally. DELETE / UPDATE mark the graph dirty; the next INSERT on the indexed vector column rebuilds the in-memory graph from surviving rows before adding the new node, and save/COMMIT still rebuilds dirty graphs before serializing.
`try_fts_probe` (Phase 8b) — BM25 keyword:
diff --git a/examples/nodejs-notes/README.md b/examples/nodejs-notes/README.md
index cd814a6..6d31e37 100644
--- a/examples/nodejs-notes/README.md
+++ b/examples/nodejs-notes/README.md
@@ -247,17 +247,6 @@ Default is 0.5.
time and won't notice newly-ingested files until you reload it
anyway, so the gain from coexisting isn't worth the surprise.
-- **HNSW after delete + re-insert (engine bug, SQLR-8).** The
- engine's HNSW chunk index panics when rows are deleted and re-
- inserted within the same connection lifetime — `index out of
- bounds` inside `DistanceMetric::compute`. The ingest pipeline
- works around it by splitting refresh into a delete-phase →
- close/reopen → insert-phase, which forces a clean index rebuild
- on next open. We pay the close/reopen hop only when there are
- actual deletions (so first-time `init` skips it). Once SQLR-8
- lands in the engine, `src/ingest.mjs` can drop the `db.reopen()`
- call.
-
- **Aggregates limited.** The engine supports `COUNT(*)`,
`SUM`/`AVG`/`MIN`/`MAX` but not arbitrary expressions in `SELECT`
projection beyond aggregates (see [`docs/supported-sql.md`](../../docs/supported-sql.md)).
@@ -319,7 +308,7 @@ examples/nodejs-notes/
│ ├── db.mjs # schema, migrations, every SQL string
│ ├── chunker.mjs # frontmatter + heading-aware chunking
│ ├── embeddings.mjs # hash + OpenAI embedders
-│ ├── ingest.mjs # plan → delete → reopen → insert
+│ ├── ingest.mjs # plan → delete → insert
│ ├── search.mjs # hybrid retrieval driver
│ ├── serve.mjs # spawn sqlrite-mcp --read-only
│ └── claude-config.mjs # Claude Desktop snippet renderer
diff --git a/examples/nodejs-notes/src/ingest.mjs b/examples/nodejs-notes/src/ingest.mjs
index 2c4cfcf..be49a7b 100644
--- a/examples/nodejs-notes/src/ingest.mjs
+++ b/examples/nodejs-notes/src/ingest.mjs
@@ -10,11 +10,7 @@
//
// Both flow through `ingestImpl`, which splits the work into three
// phases: PLAN (read-only diff against the current DB) → DELETE (drop
-// stale documents/chunks; close + reopen the DB) → INSERT (write new
-// rows). The close/reopen between DELETE and INSERT is a workaround
-// for an engine bug where the HNSW chunk index panics when rows are
-// deleted and re-inserted in the same connection lifetime — see the
-// "Known limitations" section of this example's README.
+// stale documents/chunks) → INSERT (write new rows).
import { readFile, readdir, stat } from 'node:fs/promises';
import { createHash } from 'node:crypto';
@@ -192,15 +188,6 @@ async function ingestImpl({ db, root, embedder, logger, chunkOpts, mode }) {
// ----------------------------------------------------------------
// PHASE 2 — deletes (and replacing-deletes).
- //
- // The engine's HNSW index has a bug where rows deleted and re-
- // inserted within the same connection lifetime can corrupt the
- // index's stored vectors (see ../README.md "Known limitations").
- // Closing + reopening the connection between the delete-pass and
- // the insert-pass forces a full index rebuild on next open,
- // sidestepping the issue. We only pay this cost when there's
- // actually something to delete; pure-INSERT runs (first `init`)
- // skip this hop entirely.
if (hasMutations) {
db.transaction(() => {
for (const id of planDeletes) db.deleteDocument(id);
@@ -208,7 +195,6 @@ async function ingestImpl({ db, root, embedder, logger, chunkOpts, mode }) {
if (e.plan.priorId !== null) db.deleteDocument(e.plan.priorId);
}
});
- db.reopen();
}
// ----------------------------------------------------------------
diff --git a/src/sql/db/table.rs b/src/sql/db/table.rs
index 392cc80..6a5952a 100644
--- a/src/sql/db/table.rs
+++ b/src/sql/db/table.rs
@@ -1237,7 +1237,7 @@ impl Table {
// pulls from the table's row storage (which we *just* updated
// with the new value).
if let Some(Value::Vector(new_vec)) = &typed_value {
- self.maintain_hnsw_on_insert(key, next_rowid, new_vec);
+ self.maintain_hnsw_on_insert(key, next_rowid, new_vec)?;
}
// Step 4 (Phase 8b): maintain any FTS indexes on this column.
@@ -1257,7 +1257,9 @@ impl Table {
/// the borrowing dance — we need both `&self.rows` (read other
/// vectors) and `&mut self.hnsw_indexes` (insert into the graph) —
/// stays localized.
- fn maintain_hnsw_on_insert(&mut self, column: &str, rowid: i64, new_vec: &[f32]) {
+ fn maintain_hnsw_on_insert(&mut self, column: &str, rowid: i64, new_vec: &[f32]) -> Result<()> {
+ self.rebuild_dirty_hnsw_indexes()?;
+
// Snapshot the current vector storage so the get_vec closure
// doesn't fight with `&mut self.hnsw_indexes`. For a typical
// HNSW insert we touch ef_construction × log(N) other vectors,
@@ -1279,9 +1281,52 @@ impl Table {
if entry.column_name == column {
entry.index.insert(rowid, new_vec, |id| {
vec_snapshot.get(&id).cloned().unwrap_or_default()
- });
+ })?;
+ }
+ }
+ Ok(())
+ }
+
+ /// Rebuilds any dirty HNSW index on this table from the current
+ /// vector column storage. DELETE / UPDATE mark indexes dirty because
+ /// stale graph edges may still point at removed rowids; this makes
+ /// the next in-memory operation see a clean graph without requiring
+ /// a close/reopen or save round-trip.
+ pub fn rebuild_dirty_hnsw_indexes(&mut self) -> Result<()> {
+ let dirty: Vec<(String, String, DistanceMetric)> = self
+ .hnsw_indexes
+ .iter()
+ .filter(|e| e.needs_rebuild)
+ .map(|e| (e.name.clone(), e.column_name.clone(), e.metric))
+ .collect();
+ if dirty.is_empty() {
+ return Ok(());
+ }
+
+ for (idx_name, col_name, metric) in dirty {
+ let mut vectors: Vec<(i64, Vec)> = Vec::new();
+ {
+ let row_data = self.rows.lock().expect("rows mutex poisoned");
+ if let Some(Row::Vector(map)) = row_data.get(&col_name) {
+ for (id, v) in map.iter() {
+ vectors.push((*id, v.clone()));
+ }
+ }
+ }
+
+ let snapshot: HashMap> = vectors.iter().cloned().collect();
+ let mut new_idx = HnswIndex::new(metric, 0xC0FFEE);
+ vectors.sort_by_key(|(id, _)| *id);
+ for (id, v) in &vectors {
+ new_idx.insert(*id, v, |q| snapshot.get(&q).cloned().unwrap_or_default())?;
+ }
+
+ if let Some(entry) = self.hnsw_indexes.iter_mut().find(|e| e.name == idx_name) {
+ entry.index = new_idx;
+ entry.needs_rebuild = false;
}
}
+ Ok(())
}
/// After a row insert, push the new (rowid, text) into every FTS
diff --git a/src/sql/executor.rs b/src/sql/executor.rs
index 8f57f3c..181cd90 100644
--- a/src/sql/executor.rs
+++ b/src/sql/executor.rs
@@ -1468,7 +1468,7 @@ fn create_hnsw_index(
let v_clone = v.clone();
idx.insert(*rowid, &v_clone, |id| {
vec_map.get(&id).cloned().unwrap_or_default()
- });
+ })?;
}
}
@@ -1863,12 +1863,15 @@ fn try_hnsw_probe(table: &Table, order_expr: &Expr, k: usize) -> Option
// module stays decoupled from the SQL types.
let column_for_closure = col_name.clone();
let table_ref = table;
- let result = entry.index.search(&query_vec, k, |id| {
- match table_ref.get_value(&column_for_closure, id) {
- Some(Value::Vector(v)) => v,
- _ => Vec::new(),
- }
- });
+ let result = entry
+ .index
+ .search(&query_vec, k, |id| {
+ match table_ref.get_value(&column_for_closure, id) {
+ Some(Value::Vector(v)) => v,
+ _ => Vec::new(),
+ }
+ })
+ .ok()?;
Some(result)
}
diff --git a/src/sql/hnsw.rs b/src/sql/hnsw.rs
index 5665d35..bbca1d7 100644
--- a/src/sql/hnsw.rs
+++ b/src/sql/hnsw.rs
@@ -53,6 +53,8 @@
use std::cmp::Ordering;
use std::collections::{BinaryHeap, HashMap, HashSet};
+use crate::error::{Result, SQLRiteError};
+
/// Distance metric used by the HNSW index. Must match what the
/// surrounding `vec_distance_*` SQL function would compute on the same
/// pair of vectors — otherwise the index probe and the brute-force
@@ -107,8 +109,14 @@ impl DistanceMetric {
/// will prefer any finite alternative, which matches the SQL-level
/// behaviour where `vec_distance_cosine` errors but the optimizer's
/// fallback path simply skips the offending row.
- pub fn compute(self, a: &[f32], b: &[f32]) -> f32 {
- debug_assert_eq!(a.len(), b.len(), "vector dim mismatch in HNSW distance");
+ pub fn compute(self, a: &[f32], b: &[f32]) -> Result {
+ if a.is_empty() || b.is_empty() || a.len() != b.len() {
+ return Err(SQLRiteError::General(format!(
+ "HNSW vector dimension mismatch: left has {}, right has {}",
+ a.len(),
+ b.len()
+ )));
+ }
match self {
DistanceMetric::L2 => {
let mut sum = 0.0f32;
@@ -116,7 +124,7 @@ impl DistanceMetric {
let d = a[i] - b[i];
sum += d * d;
}
- sum.sqrt()
+ Ok(sum.sqrt())
}
DistanceMetric::Cosine => {
let mut dot = 0.0f32;
@@ -129,9 +137,9 @@ impl DistanceMetric {
}
let denom = (na * nb).sqrt();
if denom == 0.0 {
- f32::INFINITY
+ Ok(f32::INFINITY)
} else {
- 1.0 - dot / denom
+ Ok(1.0 - dot / denom)
}
}
DistanceMetric::Dot => {
@@ -139,7 +147,7 @@ impl DistanceMetric {
for i in 0..a.len() {
dot += a[i] * b[i];
}
- -dot
+ Ok(-dot)
}
}
}
@@ -284,12 +292,12 @@ impl HnswIndex {
/// re-inserting an existing id is a no-op (returns without error).
/// `vec` is the new node's vector; `get_vec` looks up the vector
/// for any other node id the algorithm touches.
- pub fn insert(&mut self, node_id: i64, vec: &[f32], get_vec: F)
+ pub fn insert(&mut self, node_id: i64, vec: &[f32], get_vec: F) -> Result<()>
where
F: Fn(i64) -> Vec,
{
if self.nodes.contains_key(&node_id) {
- return;
+ return Ok(());
}
// First node: trivial case. Becomes entry point at layer 0.
@@ -302,7 +310,7 @@ impl HnswIndex {
);
self.entry_point = Some(node_id);
self.top_layer = 0;
- return;
+ return Ok(());
}
// Pick a layer for this new node.
@@ -321,7 +329,7 @@ impl HnswIndex {
// because the new node doesn't live there.
let mut entry = self.entry_point.expect("non-empty index has entry point");
for layer in (target_layer + 1..=self.top_layer).rev() {
- let nearest = self.search_layer(vec, &[entry], 1, layer, &get_vec);
+ let nearest = self.search_layer(vec, &[entry], 1, layer, &get_vec)?;
if let Some((_, id)) = nearest.into_iter().next() {
entry = id;
}
@@ -333,7 +341,7 @@ impl HnswIndex {
let mut entries = vec![entry];
for layer in (0..=target_layer).rev() {
let candidates =
- self.search_layer(vec, &entries, self.params.ef_construction, layer, &get_vec);
+ self.search_layer(vec, &entries, self.params.ef_construction, layer, &get_vec)?;
// Pick up to M neighbors from candidates (M_max0 at layer 0
// since we allow more connections at the dense layer).
@@ -368,8 +376,12 @@ impl HnswIndex {
let nb_vec = get_vec(nb);
let mut by_dist: Vec<(f32, i64)> = nb_layers[layer]
.iter()
- .map(|id| (self.distance.compute(&nb_vec, &get_vec(*id)), *id))
- .collect();
+ .map(|id| {
+ self.distance
+ .compute(&nb_vec, &get_vec(*id))
+ .map(|d| (d, *id))
+ })
+ .collect::>>()?;
by_dist
.sort_by(|(da, _), (db, _)| da.partial_cmp(db).unwrap_or(Ordering::Equal));
by_dist.truncate(m_max);
@@ -387,22 +399,23 @@ impl HnswIndex {
self.top_layer = target_layer;
self.entry_point = Some(node_id);
}
+ Ok(())
}
/// Returns the k nearest node ids to `query`, in distance-ascending
/// order (closest first). Empty index returns an empty Vec.
- pub fn search(&self, query: &[f32], k: usize, get_vec: F) -> Vec
+ pub fn search(&self, query: &[f32], k: usize, get_vec: F) -> Result>
where
F: Fn(i64) -> Vec,
{
if self.is_empty() || k == 0 {
- return Vec::new();
+ return Ok(Vec::new());
}
// Greedy descent from the top down to layer 1.
let mut entry = self.entry_point.expect("non-empty index has entry point");
for layer in (1..=self.top_layer).rev() {
- let nearest = self.search_layer(query, &[entry], 1, layer, &get_vec);
+ let nearest = self.search_layer(query, &[entry], 1, layer, &get_vec)?;
if let Some((_, id)) = nearest.into_iter().next() {
entry = id;
}
@@ -410,9 +423,9 @@ impl HnswIndex {
// Beam search at layer 0 with width = max(ef_search, k).
let ef = self.params.ef_search.max(k);
- let candidates = self.search_layer(query, &[entry], ef, 0, &get_vec);
+ let candidates = self.search_layer(query, &[entry], ef, 0, &get_vec)?;
- candidates.into_iter().take(k).map(|(_, id)| id).collect()
+ Ok(candidates.into_iter().take(k).map(|(_, id)| id).collect())
}
/// Runs a beam search at one layer starting from `entries`, returning
@@ -430,7 +443,7 @@ impl HnswIndex {
ef: usize,
layer: usize,
get_vec: &F,
- ) -> Vec<(f32, i64)>
+ ) -> Result>
where
F: Fn(i64) -> Vec,
{
@@ -444,7 +457,7 @@ impl HnswIndex {
if !visited.insert(id) {
continue;
}
- let d = self.distance.compute(query, &get_vec(id));
+ let d = self.distance.compute(query, &get_vec(id))?;
candidates.push(MinHeapItem { dist: d, id });
results.push(MaxHeapItem { dist: d, id });
}
@@ -474,7 +487,7 @@ impl HnswIndex {
if !visited.insert(nb) {
continue;
}
- let d = self.distance.compute(query, &get_vec(nb));
+ let d = self.distance.compute(query, &get_vec(nb))?;
let admit = if results.len() < ef {
true
} else {
@@ -497,7 +510,7 @@ impl HnswIndex {
out.push((item.dist, item.id));
}
out.reverse();
- out
+ Ok(out)
}
/// Picks a layer for a new node using the standard HNSW geometric
@@ -506,7 +519,7 @@ impl HnswIndex {
/// - P(L=0) ≈ 1 - 1/M = 15/16
/// - P(L=1) ≈ 1/16 - 1/256
/// - P(L=2) ≈ 1/256 - …
- /// i.e., most new nodes live only at layer 0; a few percolate up.
+ /// i.e., most new nodes live only at layer 0; a few percolate up.
fn pick_layer(&mut self) -> usize {
let u = self.next_uniform().max(1e-6); // guard log(0)
let layer = (-u.ln() * self.params.m_l).floor() as usize;
@@ -629,7 +642,7 @@ mod tests {
let mut by_dist: Vec<(f32, i64)> = vectors
.iter()
.enumerate()
- .map(|(i, v)| (metric.compute(query, v), i as i64))
+ .map(|(i, v)| (metric.compute(query, v).expect("matching dims"), i as i64))
.collect();
by_dist.sort_by(|(a, _), (b, _)| a.partial_cmp(b).unwrap_or(Ordering::Equal));
by_dist.into_iter().take(k).map(|(_, id)| id).collect()
@@ -650,7 +663,9 @@ mod tests {
fn empty_index_returns_empty_search() {
let idx = HnswIndex::new(DistanceMetric::L2, 42);
let vectors: Vec> = vec![];
- let result = idx.search(&[0.0; 4], 5, |id| vectors[id as usize].clone());
+ let result = idx
+ .search(&[0.0; 4], 5, |id| vectors[id as usize].clone())
+ .unwrap();
assert!(result.is_empty());
}
@@ -658,9 +673,12 @@ mod tests {
fn single_node_returns_only_itself() {
let mut idx = HnswIndex::new(DistanceMetric::L2, 42);
let v0 = vec![1.0, 2.0, 3.0];
- let vectors = vec![v0.clone()];
- idx.insert(0, &v0, |id| vectors[id as usize].clone());
- let result = idx.search(&[0.0; 3], 5, |id| vectors[id as usize].clone());
+ let vectors = [v0.clone()];
+ idx.insert(0, &v0, |id| vectors[id as usize].clone())
+ .unwrap();
+ let result = idx
+ .search(&[0.0; 3], 5, |id| vectors[id as usize].clone())
+ .unwrap();
assert_eq!(result, vec![0]);
}
@@ -668,20 +686,36 @@ mod tests {
fn duplicate_insert_is_noop() {
let mut idx = HnswIndex::new(DistanceMetric::L2, 42);
let v0 = vec![1.0, 2.0];
- let vectors = vec![v0.clone()];
- idx.insert(0, &v0, |id| vectors[id as usize].clone());
- idx.insert(0, &v0, |id| vectors[id as usize].clone());
+ let vectors = [v0.clone()];
+ idx.insert(0, &v0, |id| vectors[id as usize].clone())
+ .unwrap();
+ idx.insert(0, &v0, |id| vectors[id as usize].clone())
+ .unwrap();
assert_eq!(idx.len(), 1);
}
+ #[test]
+ fn distance_rejects_empty_or_mismatched_vectors() {
+ let empty_err = DistanceMetric::L2.compute(&[1.0, 2.0], &[]).unwrap_err();
+ assert!(format!("{empty_err}").contains("dimension mismatch"));
+
+ let mismatch_err = DistanceMetric::Cosine
+ .compute(&[1.0, 2.0], &[1.0])
+ .unwrap_err();
+ assert!(format!("{mismatch_err}").contains("left has 2, right has 1"));
+ }
+
#[test]
fn k_zero_returns_empty() {
let mut idx = HnswIndex::new(DistanceMetric::L2, 42);
- let vectors = vec![vec![1.0, 0.0], vec![0.0, 1.0]];
+ let vectors = [vec![1.0, 0.0], vec![0.0, 1.0]];
for (i, v) in vectors.iter().enumerate() {
- idx.insert(i as i64, v, |id| vectors[id as usize].clone());
+ idx.insert(i as i64, v, |id| vectors[id as usize].clone())
+ .unwrap();
}
- let result = idx.search(&[0.5, 0.5], 0, |id| vectors[id as usize].clone());
+ let result = idx
+ .search(&[0.5, 0.5], 0, |id| vectors[id as usize].clone())
+ .unwrap();
assert!(result.is_empty());
}
@@ -698,16 +732,21 @@ mod tests {
];
let mut idx = HnswIndex::new(DistanceMetric::L2, 42);
for (i, v) in vectors.iter().enumerate() {
- idx.insert(i as i64, v, |id| vectors[id as usize].clone());
+ idx.insert(i as i64, v, |id| vectors[id as usize].clone())
+ .unwrap();
}
// Query at (1, 1): nearest is (0, 0).
- let result = idx.search(&[1.0, 1.0], 1, |id| vectors[id as usize].clone());
+ let result = idx
+ .search(&[1.0, 1.0], 1, |id| vectors[id as usize].clone())
+ .unwrap();
assert_eq!(result, vec![0]);
// Query at (5.5, 5.5): top-3 should be id=4 (5,5), then any
// two of the corners at distance ~7.78.
- let result = idx.search(&[5.5, 5.5], 3, |id| vectors[id as usize].clone());
+ let result = idx
+ .search(&[5.5, 5.5], 3, |id| vectors[id as usize].clone())
+ .unwrap();
assert_eq!(result.len(), 3);
assert_eq!(result[0], 4, "closest to (5.5,5.5) should be id=4");
}
@@ -728,13 +767,16 @@ mod tests {
let mut idx = HnswIndex::new(DistanceMetric::L2, 42);
for (i, v) in vectors.iter().enumerate() {
- idx.insert(i as i64, v, |id| vectors[id as usize].clone());
+ idx.insert(i as i64, v, |id| vectors[id as usize].clone())
+ .unwrap();
}
let mut total_recall = 0.0f32;
for _ in 0..queries {
let q = random_vec(&mut state, dim);
- let hnsw_top = idx.search(&q, k, |id| vectors[id as usize].clone());
+ let hnsw_top = idx
+ .search(&q, k, |id| vectors[id as usize].clone())
+ .unwrap();
let baseline = brute_force_topk(&vectors, &q, k, DistanceMetric::L2);
total_recall += recall_at_k(&hnsw_top, &baseline);
}
@@ -759,13 +801,16 @@ mod tests {
let mut idx = HnswIndex::new(DistanceMetric::Cosine, 42);
for (i, v) in vectors.iter().enumerate() {
- idx.insert(i as i64, v, |id| vectors[id as usize].clone());
+ idx.insert(i as i64, v, |id| vectors[id as usize].clone())
+ .unwrap();
}
let mut total_recall = 0.0f32;
for _ in 0..queries {
let q = random_vec(&mut state, dim);
- let hnsw_top = idx.search(&q, k, |id| vectors[id as usize].clone());
+ let hnsw_top = idx
+ .search(&q, k, |id| vectors[id as usize].clone())
+ .unwrap();
let baseline = brute_force_topk(&vectors, &q, k, DistanceMetric::Cosine);
total_recall += recall_at_k(&hnsw_top, &baseline);
}
@@ -791,7 +836,8 @@ mod tests {
for i in 0..50 {
vectors.push(random_vec(&mut state, dim));
let v = vectors[i].clone();
- idx.insert(i as i64, &v, |id| vectors[id as usize].clone());
+ idx.insert(i as i64, &v, |id| vectors[id as usize].clone())
+ .unwrap();
// Check invariant.
let entry = idx.entry_point.expect("non-empty");
@@ -816,7 +862,8 @@ mod tests {
for i in 0..200 {
vectors.push(random_vec(&mut state, dim));
let v = vectors[i].clone();
- idx.insert(i as i64, &v, |id| vectors[id as usize].clone());
+ idx.insert(i as i64, &v, |id| vectors[id as usize].clone())
+ .unwrap();
}
for (id, node) in &idx.nodes {
@@ -848,8 +895,12 @@ mod tests {
let mut idx_a = HnswIndex::new(DistanceMetric::L2, 42);
let mut idx_b = HnswIndex::new(DistanceMetric::L2, 42);
for (i, v) in vectors.iter().enumerate() {
- idx_a.insert(i as i64, v, |id| vectors[id as usize].clone());
- idx_b.insert(i as i64, v, |id| vectors[id as usize].clone());
+ idx_a
+ .insert(i as i64, v, |id| vectors[id as usize].clone())
+ .unwrap();
+ idx_b
+ .insert(i as i64, v, |id| vectors[id as usize].clone())
+ .unwrap();
}
// Same top layer.
diff --git a/src/sql/mod.rs b/src/sql/mod.rs
index ef4dc11..5b05783 100644
--- a/src/sql/mod.rs
+++ b/src/sql/mod.rs
@@ -1808,6 +1808,61 @@ mod tests {
);
}
+ #[test]
+ fn hnsw_delete_then_insert_rebuilds_in_same_connection() {
+ let mut db = Database::new(String::from("test_db"));
+ process_command(
+ "CREATE TABLE chunks (id INTEGER PRIMARY KEY, document_id INTEGER, embedding VECTOR(4));",
+ &mut db,
+ )
+ .unwrap();
+ process_command(
+ "CREATE INDEX idx_emb ON chunks USING hnsw (embedding);",
+ &mut db,
+ )
+ .unwrap();
+ process_command(
+ "INSERT INTO chunks (document_id, embedding) VALUES (1, [1, 0, 0, 0]);",
+ &mut db,
+ )
+ .unwrap();
+ process_command(
+ "INSERT INTO chunks (document_id, embedding) VALUES (1, [0, 1, 0, 0]);",
+ &mut db,
+ )
+ .unwrap();
+ process_command("DELETE FROM chunks WHERE document_id = 1;", &mut db).unwrap();
+
+ process_command(
+ "INSERT INTO chunks (document_id, embedding) VALUES (2, [0, 0, 1, 0]);",
+ &mut db,
+ )
+ .unwrap();
+ let chunks = db.get_table("chunks".to_string()).unwrap();
+ let entry = chunks
+ .hnsw_indexes
+ .iter()
+ .find(|e| e.name == "idx_emb")
+ .unwrap();
+ assert!(
+ !entry.needs_rebuild,
+ "INSERT should rebuild the dirty index"
+ );
+ assert_eq!(entry.index.len(), 1);
+
+ let out = process_command_with_render(
+ "SELECT document_id FROM chunks ORDER BY vec_distance_l2(embedding, [0, 0, 1, 0]) ASC LIMIT 1;",
+ &mut db,
+ )
+ .unwrap();
+ assert!(out.status.contains("1 row returned"), "got: {}", out.status);
+ let rendered = out.rendered.expect("SELECT should render rows");
+ assert!(
+ rendered.contains("| 2 |"),
+ "expected the post-delete inserted row: {rendered}"
+ );
+ }
+
#[test]
fn update_on_hnsw_indexed_vector_col_succeeds_and_marks_dirty() {
let mut db = seed_hnsw_table();
diff --git a/src/sql/pager/mod.rs b/src/sql/pager/mod.rs
index e9cd3d0..6701918 100644
--- a/src/sql/pager/mod.rs
+++ b/src/sql/pager/mod.rs
@@ -280,7 +280,7 @@ fn save_database_with_mode(db: &mut Database, path: &Path, compact: bool) -> Res
// marked dirty. Done up front under the &mut Database borrow we
// already hold, before the immutable iteration loops below need
// their own borrow.
- rebuild_dirty_hnsw_indexes(db);
+ rebuild_dirty_hnsw_indexes(db)?;
// Phase 8b — same drill for FTS indexes flagged by DELETE / UPDATE.
rebuild_dirty_fts_indexes(db);
@@ -1251,55 +1251,11 @@ fn parse_hnsw_create_index_sql(sql: &str) -> Result<(String, String, DistanceMet
/// trade-off SQLite makes for FTS5: dirtying-and-rebuilding is the
/// MVP, more sophisticated incremental delete strategies (soft-delete
/// + tombstones, neighbor reconnection) are future polish.
-fn rebuild_dirty_hnsw_indexes(db: &mut Database) {
- use crate::sql::hnsw::HnswIndex;
-
+fn rebuild_dirty_hnsw_indexes(db: &mut Database) -> Result<()> {
for table in db.tables.values_mut() {
- // Snapshot which (index_name, column, metric) triples need
- // rebuilding, before we go grabbing column data — keeps the
- // borrow structure simple. The per-entry metric matters here:
- // rebuilding a cosine-built graph as L2 (or vice versa) would
- // silently corrupt the topology and break the SQLR-28 probe.
- let dirty: Vec<(String, String, DistanceMetric)> = table
- .hnsw_indexes
- .iter()
- .filter(|e| e.needs_rebuild)
- .map(|e| (e.name.clone(), e.column_name.clone(), e.metric))
- .collect();
- if dirty.is_empty() {
- continue;
- }
-
- for (idx_name, col_name, metric) in dirty {
- // Snapshot every (rowid, vec) for this column.
- let mut vectors: Vec<(i64, Vec)> = Vec::new();
- {
- let row_data = table.rows.lock().expect("rows mutex poisoned");
- if let Some(Row::Vector(map)) = row_data.get(&col_name) {
- for (id, v) in map.iter() {
- vectors.push((*id, v.clone()));
- }
- }
- }
- // Pre-build a HashMap for the get_vec closure so we don't
- // pay O(N) lookup per insert call.
- let snapshot: std::collections::HashMap> =
- vectors.iter().cloned().collect();
-
- let mut new_idx = HnswIndex::new(metric, 0xC0FFEE);
- // Sort by id so the rebuild is deterministic across runs.
- vectors.sort_by_key(|(id, _)| *id);
- for (id, v) in &vectors {
- new_idx.insert(*id, v, |q| snapshot.get(&q).cloned().unwrap_or_default());
- }
-
- // Replace the entry's index + clear the dirty flag.
- if let Some(entry) = table.hnsw_indexes.iter_mut().find(|e| e.name == idx_name) {
- entry.index = new_idx;
- entry.needs_rebuild = false;
- }
- }
+ table.rebuild_dirty_hnsw_indexes()?;
}
+ Ok(())
}
/// Synthesises the CREATE INDEX SQL stored back into `sqlrite_master`