Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/sql-engine.md
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ Two specialized shortcuts in [`src/sql/executor.rs`](../src/sql/executor.rs) rec
ORDER BY vec_distance_l2(<col>, <bracket-array literal>) ASC LIMIT k
```

Returns top-k from the HNSW graph in `O(log N)` per probe. Mirrored shapes for `vec_distance_cosine` and `vec_distance_dot`.
Returns top-k from the HNSW graph in `O(log N)` per probe. Mirrored shapes for `vec_distance_cosine` and `vec_distance_dot`. INSERT maintains HNSW incrementally. DELETE / UPDATE mark the graph dirty; the next INSERT on the indexed vector column rebuilds the in-memory graph from surviving rows before adding the new node, and save/COMMIT still rebuilds dirty graphs before serializing.

`try_fts_probe` (Phase 8b) — BM25 keyword:

Expand Down
13 changes: 1 addition & 12 deletions examples/nodejs-notes/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -247,17 +247,6 @@ Default is 0.5.
time and won't notice newly-ingested files until you reload it
anyway, so the gain from coexisting isn't worth the surprise.

- **HNSW after delete + re-insert (engine bug, SQLR-8).** The
engine's HNSW chunk index panics when rows are deleted and re-
inserted within the same connection lifetime — `index out of
bounds` inside `DistanceMetric::compute`. The ingest pipeline
works around it by splitting refresh into a delete-phase →
close/reopen → insert-phase, which forces a clean index rebuild
on next open. We pay the close/reopen hop only when there are
actual deletions (so first-time `init` skips it). Once SQLR-8
lands in the engine, `src/ingest.mjs` can drop the `db.reopen()`
call.

- **Aggregates limited.** The engine supports `COUNT(*)`,
`SUM`/`AVG`/`MIN`/`MAX` but not arbitrary expressions in `SELECT`
projection beyond aggregates (see [`docs/supported-sql.md`](../../docs/supported-sql.md)).
Expand Down Expand Up @@ -319,7 +308,7 @@ examples/nodejs-notes/
│ ├── db.mjs # schema, migrations, every SQL string
│ ├── chunker.mjs # frontmatter + heading-aware chunking
│ ├── embeddings.mjs # hash + OpenAI embedders
│ ├── ingest.mjs # plan → delete → reopen → insert
│ ├── ingest.mjs # plan → delete → insert
│ ├── search.mjs # hybrid retrieval driver
│ ├── serve.mjs # spawn sqlrite-mcp --read-only
│ └── claude-config.mjs # Claude Desktop snippet renderer
Expand Down
16 changes: 1 addition & 15 deletions examples/nodejs-notes/src/ingest.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,7 @@
//
// Both flow through `ingestImpl`, which splits the work into three
// phases: PLAN (read-only diff against the current DB) → DELETE (drop
// stale documents/chunks; close + reopen the DB) → INSERT (write new
// rows). The close/reopen between DELETE and INSERT is a workaround
// for an engine bug where the HNSW chunk index panics when rows are
// deleted and re-inserted in the same connection lifetime — see the
// "Known limitations" section of this example's README.
// stale documents/chunks) → INSERT (write new rows).

import { readFile, readdir, stat } from 'node:fs/promises';
import { createHash } from 'node:crypto';
Expand Down Expand Up @@ -192,23 +188,13 @@ async function ingestImpl({ db, root, embedder, logger, chunkOpts, mode }) {

// ----------------------------------------------------------------
// PHASE 2 — deletes (and replacing-deletes).
//
// The engine's HNSW index has a bug where rows deleted and re-
// inserted within the same connection lifetime can corrupt the
// index's stored vectors (see ../README.md "Known limitations").
// Closing + reopening the connection between the delete-pass and
// the insert-pass forces a full index rebuild on next open,
// sidestepping the issue. We only pay this cost when there's
// actually something to delete; pure-INSERT runs (first `init`)
// skip this hop entirely.
if (hasMutations) {
db.transaction(() => {
for (const id of planDeletes) db.deleteDocument(id);
for (const e of embedded) {
if (e.plan.priorId !== null) db.deleteDocument(e.plan.priorId);
}
});
db.reopen();
}

// ----------------------------------------------------------------
Expand Down
51 changes: 48 additions & 3 deletions src/sql/db/table.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1237,7 +1237,7 @@ impl Table {
// pulls from the table's row storage (which we *just* updated
// with the new value).
if let Some(Value::Vector(new_vec)) = &typed_value {
self.maintain_hnsw_on_insert(key, next_rowid, new_vec);
self.maintain_hnsw_on_insert(key, next_rowid, new_vec)?;
}

// Step 4 (Phase 8b): maintain any FTS indexes on this column.
Expand All @@ -1257,7 +1257,9 @@ impl Table {
/// the borrowing dance — we need both `&self.rows` (read other
/// vectors) and `&mut self.hnsw_indexes` (insert into the graph) —
/// stays localized.
fn maintain_hnsw_on_insert(&mut self, column: &str, rowid: i64, new_vec: &[f32]) {
fn maintain_hnsw_on_insert(&mut self, column: &str, rowid: i64, new_vec: &[f32]) -> Result<()> {
self.rebuild_dirty_hnsw_indexes()?;

// Snapshot the current vector storage so the get_vec closure
// doesn't fight with `&mut self.hnsw_indexes`. For a typical
// HNSW insert we touch ef_construction × log(N) other vectors,
Expand All @@ -1279,9 +1281,52 @@ impl Table {
if entry.column_name == column {
entry.index.insert(rowid, new_vec, |id| {
vec_snapshot.get(&id).cloned().unwrap_or_default()
});
})?;
}
}
Ok(())
}

/// Rebuilds any dirty HNSW index on this table from the current
/// vector column storage. DELETE / UPDATE mark indexes dirty because
/// stale graph edges may still point at removed rowids; this makes
/// the next in-memory operation see a clean graph without requiring
/// a close/reopen or save round-trip.
///
/// Indexes whose `needs_rebuild` flag is clear are left untouched.
///
/// # Errors
///
/// Propagates the first error returned by `HnswIndex::insert`. The
/// entry being rebuilt at that point keeps its previous graph and
/// stays flagged as needing a rebuild.
///
/// # Panics
///
/// Panics if the `rows` mutex is poisoned.
pub fn rebuild_dirty_hnsw_indexes(&mut self) -> Result<()> {
    // Copy out the identifying bits of every dirty entry first so the
    // immutable borrow of `self.hnsw_indexes` ends before we lock
    // `self.rows` and later re-borrow the entries mutably.
    let dirty: Vec<(String, String, DistanceMetric)> = self
        .hnsw_indexes
        .iter()
        .filter(|e| e.needs_rebuild)
        .map(|e| (e.name.clone(), e.column_name.clone(), e.metric))
        .collect();
    if dirty.is_empty() {
        return Ok(());
    }

    for (idx_name, col_name, metric) in dirty {
        // Snapshot the column's vectors exactly once. The map serves
        // both as the source of rows to insert and as the lookup the
        // insert closure uses to fetch neighbour vectors, so no second
        // copy of the vector data is needed.
        let mut snapshot: HashMap<i64, Vec<f32>> = HashMap::new();
        {
            // Keep the lock scope tight: released before graph work.
            let row_data = self.rows.lock().expect("rows mutex poisoned");
            if let Some(Row::Vector(map)) = row_data.get(&col_name) {
                snapshot.extend(map.iter().map(|(id, v)| (*id, v.clone())));
            }
        }

        // Insert in ascending rowid order so the rebuilt graph does not
        // depend on hash-map iteration order.
        let mut rowids: Vec<i64> = snapshot.keys().copied().collect();
        rowids.sort_unstable();

        // NOTE(review): 0xC0FFEE is presumably a fixed RNG seed for the
        // index — confirm against `HnswIndex::new`.
        let mut new_idx = HnswIndex::new(metric, 0xC0FFEE);
        for id in rowids {
            if let Some(v) = snapshot.get(&id) {
                new_idx.insert(id, v, |q| snapshot.get(&q).cloned().unwrap_or_default())?;
            }
        }

        // Swap the fresh graph in only after every insert succeeded.
        if let Some(entry) = self.hnsw_indexes.iter_mut().find(|e| e.name == idx_name) {
            entry.index = new_idx;
            entry.needs_rebuild = false;
        }
    }
    Ok(())
}

/// After a row insert, push the new (rowid, text) into every FTS
Expand Down
17 changes: 10 additions & 7 deletions src/sql/executor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1468,7 +1468,7 @@ fn create_hnsw_index(
let v_clone = v.clone();
idx.insert(*rowid, &v_clone, |id| {
vec_map.get(&id).cloned().unwrap_or_default()
});
})?;
}
}

Expand Down Expand Up @@ -1863,12 +1863,15 @@ fn try_hnsw_probe(table: &Table, order_expr: &Expr, k: usize) -> Option<Vec<i64>
// module stays decoupled from the SQL types.
let column_for_closure = col_name.clone();
let table_ref = table;
let result = entry.index.search(&query_vec, k, |id| {
match table_ref.get_value(&column_for_closure, id) {
Some(Value::Vector(v)) => v,
_ => Vec::new(),
}
});
let result = entry
.index
.search(&query_vec, k, |id| {
match table_ref.get_value(&column_for_closure, id) {
Some(Value::Vector(v)) => v,
_ => Vec::new(),
}
})
.ok()?;
Some(result)
}

Expand Down
Loading
Loading