Skip to content

Commit 01b4b2a

Browse files
asg017 and claude committed
Scrub stale reverse edges on DiskANN delete (data leak fix)
After deleting a node, its rowid and quantized vector remained in other nodes' neighbor blobs via unidirectional reverse edges. This is a data leak — the deleted vector's compressed representation was still readable in shadow tables. Fix: after deleting the node and repairing forward edges, scan all remaining nodes and clear any neighbor slot that references the deleted rowid. Uses a lightweight two-pass approach: first scan reads only validity + neighbor_ids to find affected nodes, then does full read/clear/write only for those nodes. Tradeoff: O(N) scan per delete adds ~1ms/row at 10k vectors, ~10ms at 100k. Recall and query latency are unaffected. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent c36a995 commit 01b4b2a

2 files changed

Lines changed: 132 additions & 0 deletions

File tree

sqlite-vec-diskann.c

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1638,6 +1638,95 @@ static int diskann_repair_reverse_edges(
16381638
* Delete a vector from the DiskANN graph (Algorithm 3: LM-Delete).
16391639
* If the vector is in the buffer (not yet flushed), just remove from buffer.
16401640
*/
1641+
/**
1642+
* Scan all nodes and clear any neighbor slot referencing deleted_rowid.
1643+
* This removes stale reverse edges that the forward-edge repair misses,
1644+
* preventing data leaks (deleted rowid + quantized vector lingering in
1645+
* other nodes' blobs).
1646+
*/
1647+
static int diskann_scrub_deleted_rowid(
1648+
vec0_vtab *p, int vec_col_idx, i64 deleted_rowid) {
1649+
1650+
struct VectorColumnDefinition *col = &p->vector_columns[vec_col_idx];
1651+
struct Vec0DiskannConfig *cfg = &col->diskann;
1652+
int rc;
1653+
sqlite3_stmt *stmt = NULL;
1654+
1655+
// Lightweight scan: only read validity + neighbor_ids to find matches
1656+
char *zSql = sqlite3_mprintf(
1657+
"SELECT rowid, neighbors_validity, neighbor_ids "
1658+
"FROM " VEC0_SHADOW_DISKANN_NODES_N_NAME,
1659+
p->schemaName, p->tableName, vec_col_idx);
1660+
if (!zSql) return SQLITE_NOMEM;
1661+
rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL);
1662+
sqlite3_free(zSql);
1663+
if (rc != SQLITE_OK) return rc;
1664+
1665+
// Collect rowids that need updating (avoid modifying while iterating)
1666+
i64 *dirty = NULL;
1667+
int nDirty = 0, capDirty = 0;
1668+
1669+
while (sqlite3_step(stmt) == SQLITE_ROW) {
1670+
const u8 *validity = (const u8 *)sqlite3_column_blob(stmt, 1);
1671+
const u8 *ids = (const u8 *)sqlite3_column_blob(stmt, 2);
1672+
int idsBytes = sqlite3_column_bytes(stmt, 2);
1673+
if (!validity || !ids) continue;
1674+
1675+
int nSlots = idsBytes / (int)sizeof(i64);
1676+
if (nSlots > cfg->n_neighbors) nSlots = cfg->n_neighbors;
1677+
1678+
for (int i = 0; i < nSlots; i++) {
1679+
if (!diskann_validity_get(validity, i)) continue;
1680+
i64 nid = diskann_neighbor_id_get(ids, i);
1681+
if (nid == deleted_rowid) {
1682+
i64 nodeRowid = sqlite3_column_int64(stmt, 0);
1683+
// Add to dirty list
1684+
if (nDirty >= capDirty) {
1685+
capDirty = capDirty ? capDirty * 2 : 16;
1686+
i64 *tmp = sqlite3_realloc64(dirty, capDirty * sizeof(i64));
1687+
if (!tmp) { sqlite3_free(dirty); sqlite3_finalize(stmt); return SQLITE_NOMEM; }
1688+
dirty = tmp;
1689+
}
1690+
dirty[nDirty++] = nodeRowid;
1691+
break; // one match per node is enough
1692+
}
1693+
}
1694+
}
1695+
sqlite3_finalize(stmt);
1696+
1697+
// Now do full read/clear/write for each dirty node
1698+
for (int d = 0; d < nDirty; d++) {
1699+
u8 *val = NULL, *nids = NULL, *qvecs = NULL;
1700+
int vs, nis, qs;
1701+
rc = diskann_node_read(p, vec_col_idx, dirty[d],
1702+
&val, &vs, &nids, &nis, &qvecs, &qs);
1703+
if (rc != SQLITE_OK) continue;
1704+
1705+
int modified = 0;
1706+
for (int i = 0; i < cfg->n_neighbors; i++) {
1707+
if (diskann_validity_get(val, i) &&
1708+
diskann_neighbor_id_get(nids, i) == deleted_rowid) {
1709+
diskann_node_clear_neighbor(val, nids, qvecs, i,
1710+
cfg->quantizer_type, col->dimensions);
1711+
modified = 1;
1712+
}
1713+
}
1714+
1715+
if (modified) {
1716+
rc = diskann_node_write(p, vec_col_idx, dirty[d],
1717+
val, vs, nids, nis, qvecs, qs);
1718+
}
1719+
1720+
sqlite3_free(val);
1721+
sqlite3_free(nids);
1722+
sqlite3_free(qvecs);
1723+
if (rc != SQLITE_OK) break;
1724+
}
1725+
1726+
sqlite3_free(dirty);
1727+
return rc;
1728+
}
1729+
16411730
static int diskann_delete(vec0_vtab *p, int vec_col_idx, i64 rowid) {
16421731
struct VectorColumnDefinition *col = &p->vector_columns[vec_col_idx];
16431732
struct Vec0DiskannConfig *cfg = &col->diskann;
@@ -1706,6 +1795,12 @@ static int diskann_delete(vec0_vtab *p, int vec_col_idx, i64 rowid) {
17061795
rc = diskann_medoid_handle_delete(p, vec_col_idx, rowid);
17071796
}
17081797

1798+
// 5. Scrub stale reverse edges — removes deleted rowid + quantized vector
1799+
// from any node that still references it (data leak prevention)
1800+
if (rc == SQLITE_OK) {
1801+
rc = diskann_scrub_deleted_rowid(p, vec_col_idx, rowid);
1802+
}
1803+
17091804
return rc;
17101805
}
17111806

tests/test-diskann.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1289,3 +1289,40 @@ def test_diskann_text_pk_insert_knn_delete(db):
12891289
).fetchall()
12901290
ids = [r["id"] for r in rows]
12911291
assert "alpha" not in ids
1292+
1293+
1294+
def test_diskann_delete_scrubs_all_references(db):
    """Deleting a row must leave no trace of its rowid in the DiskANN tables.

    Builds a small binary-quantized DiskANN index of 20 vectors, deletes one
    row, then checks that its node row and stored vector are gone and that no
    surviving node lists the deleted rowid in its neighbor_ids blob.
    """
    import struct

    db.execute("""
        CREATE VIRTUAL TABLE t USING vec0(
            emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8)
        )
    """)
    for i in range(20):
        components = [float(i + d) for d in range(8)]
        db.execute(
            "INSERT INTO t(rowid, emb) VALUES (?, ?)",
            [i, struct.pack("8f", *components)],
        )

    target = 5
    db.execute("DELETE FROM t WHERE rowid = ?", [target])

    # The node row first, then the stored vector: both must be gone.
    for count_sql in (
        "SELECT count(*) FROM t_diskann_nodes00 WHERE rowid=?",
        "SELECT count(*) FROM t_vectors00 WHERE rowid=?",
    ):
        remaining = db.execute(count_sql, [target]).fetchone()[0]
        assert remaining == 0

    # neighbor_ids is walked as consecutive 8-byte little-endian integers;
    # none of the slots in any surviving node may equal the deleted rowid.
    surviving = db.execute("SELECT rowid, neighbor_ids FROM t_diskann_nodes00")
    for node_rowid, ids_blob in surviving:
        for slot, offset in enumerate(range(0, len(ids_blob), 8)):
            (nid,) = struct.unpack("<q", ids_blob[offset : offset + 8])
            assert nid != target, (
                f"Node {node_rowid} slot {slot} still references "
                f"deleted rowid {target}"
            )

0 commit comments

Comments
 (0)