Skip to content

Commit db3d286

Browse files
tcconnallytcconnallyclaude
authored
perf: sign-signature Hamming prefilter for dense search at scale (#347)
`dense_search` read EVERY embedded row's full f32 blob per query (~1.5KB per row at 384 dims) up to the 50k scan ceiling — the last full-table scan on the recall hot path.

New `entities.emb_sig` column (schema v6): the embedding's sign bits packed into ceil(dim/8) bytes (48 bytes at 384 dims). At 2048 or more embedded rows, `dense_search` now scans only id + emb_sig, ranks candidates by Hamming distance to the query signature (id tie-break — deterministic), keeps a generously oversampled pool (16x limit, clamped to 512..4096), fetches full embeddings for the pool alone (chunked IN), and exact-cosine re-ranks as before. ~30x less I/O on the scan phase at scale.

Guarantees kept:
- Below the cutover the exact full scan runs unchanged — small vaults (and every existing test) stay byte-identical.
- Zero new dependencies; results remain deterministic run-to-run.
- Rows with a NULL signature (written by a pre-v6 binary after migration) are always included in the pool, never silently dropped.
- Dimension-mismatched signatures score maximally distant.

`store_embedding` writes both blob + sig; the v6 migration backfills signatures for existing embeddings in one bounded pass.

Tests: embedding_signature_packs_sign_bits_and_hamming_tracks_distance, dense_search_signature_prefilter_finds_the_same_top_hits_at_scale (2200-row vault, engineered near-neighbors, archived exclusion), migration_backfills_embedding_signatures.

Suite: 167 passed / 0 failed; --no-default-features builds clean.

Co-authored-by: tcconnally <hermes@perseus.observer>
Co-authored-by: Claude Fable 5 <noreply@anthropic.com>
1 parent 77fd4d9 commit db3d286

2 files changed

Lines changed: 286 additions & 22 deletions

File tree

src/db.rs

Lines changed: 225 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -586,7 +586,8 @@ impl Database {
586586
Ok(result)
587587
}
588588

589-
/// Store a dense vector embedding for an entity.
589+
/// Store a dense vector embedding for an entity (and its sign-bit
590+
/// signature — see `embedding_signature` / the dense_search prefilter).
590591
#[allow(dead_code)]
591592
pub fn store_embedding(
592593
&self,
@@ -595,9 +596,10 @@ impl Database {
595596
) -> Result<(), Box<dyn std::error::Error>> {
596597
let conn = self.conn()?;
597598
let blob: Vec<u8> = embedding.iter().flat_map(|f| f.to_le_bytes()).collect();
599+
let sig = embedding_signature(embedding);
598600
conn.execute(
599-
"UPDATE entities SET embedding = ?1 WHERE id = ?2",
600-
params![blob, id],
601+
"UPDATE entities SET embedding = ?1, emb_sig = ?2 WHERE id = ?3",
602+
params![blob, sig, id],
601603
)?;
602604
Ok(())
603605
}
@@ -976,28 +978,114 @@ impl Database {
976978
let max_scan = 50_000; // safety ceiling — databases beyond this should use HNSW
977979
let dim = query_vec.len();
978980

981+
// Signature prefilter cutover point. Below this many embedded rows the
982+
// exact full scan is already cheap AND stays byte-identical to the
983+
// historical behavior; above it, reading every f32 blob dominates
984+
// query time, so we Hamming-prefilter on the dim/8-byte sign
985+
// signatures and only read full embeddings for an oversampled pool.
986+
const DENSE_SIG_PREFILTER_MIN_ROWS: i64 = 2048;
987+
// Exact-rerank pool size: generous oversampling keeps the top-k from
988+
// the prefiltered path effectively identical to the exact scan for
989+
// normalized text embeddings.
990+
let pool_target = |limit: usize| (limit.saturating_mul(16)).clamp(512, 4096);
991+
992+
let embedded_rows: i64 = conn.query_row(
993+
"SELECT COUNT(*) FROM entities WHERE archived = 0 AND embedding IS NOT NULL",
994+
[],
995+
|r| r.get(0),
996+
)?;
997+
979998
// Phase 1 (#209): lightweight scan — read only id + embedding for scoring.
980999
// The old query hydrated EVERY candidate (decrypt body, parse tags/links)
9811000
// up to max_scan just to score and then keep top-k. Defer full hydration
9821001
// to the surviving top-k in phase 3.
983-
let mut stmt = conn.prepare(&format!(
984-
"SELECT id, embedding FROM entities \
985-
WHERE archived = 0 AND embedding IS NOT NULL LIMIT {}",
986-
max_scan
987-
))?;
988-
let rows = stmt.query_map([], |row| {
989-
let id: String = row.get(0)?;
990-
let emb_blob: Vec<u8> = row.get(1)?;
991-
let emb: Vec<f32> = emb_blob
992-
.chunks_exact(4)
993-
.map(|b| f32::from_le_bytes([b[0], b[1], b[2], b[3]]))
994-
.collect();
995-
Ok((id, emb))
996-
})?;
997-
let candidates: Vec<(String, Vec<f32>)> = rows
998-
.filter_map(|r| r.ok())
999-
.filter(|(_, emb)| emb.len() == dim)
1000-
.collect();
1002+
let candidates: Vec<(String, Vec<f32>)> = if embedded_rows
1003+
< DENSE_SIG_PREFILTER_MIN_ROWS
1004+
{
1005+
let mut stmt = conn.prepare(&format!(
1006+
"SELECT id, embedding FROM entities \
1007+
WHERE archived = 0 AND embedding IS NOT NULL LIMIT {}",
1008+
max_scan
1009+
))?;
1010+
let rows = stmt.query_map([], |row| {
1011+
let id: String = row.get(0)?;
1012+
let emb_blob: Vec<u8> = row.get(1)?;
1013+
let emb: Vec<f32> = emb_blob
1014+
.chunks_exact(4)
1015+
.map(|b| f32::from_le_bytes([b[0], b[1], b[2], b[3]]))
1016+
.collect();
1017+
Ok((id, emb))
1018+
})?;
1019+
rows.filter_map(|r| r.ok())
1020+
.filter(|(_, emb)| emb.len() == dim)
1021+
.collect()
1022+
} else {
1023+
// Phase 0: signature prefilter. Scan only id + emb_sig (~48 bytes
1024+
// vs ~1.5KB per row for 384-dim), rank by Hamming distance to the
1025+
// query signature with an id tie-break (deterministic), keep an
1026+
// oversampled pool, then read full embeddings for the pool only.
1027+
// Rows with a NULL signature (written by a pre-v6 binary after
1028+
// migration) are always included so they can't be silently lost.
1029+
let query_sig = embedding_signature(query_vec);
1030+
let mut stmt = conn.prepare(&format!(
1031+
"SELECT id, emb_sig FROM entities \
1032+
WHERE archived = 0 AND embedding IS NOT NULL LIMIT {}",
1033+
max_scan
1034+
))?;
1035+
let rows = stmt.query_map([], |row| {
1036+
Ok((
1037+
row.get::<_, String>(0)?,
1038+
row.get::<_, Option<Vec<u8>>>(1)?,
1039+
))
1040+
})?;
1041+
let mut ranked: Vec<(u32, String)> = Vec::new();
1042+
let mut unsigned_ids: Vec<String> = Vec::new();
1043+
for row in rows {
1044+
let (id, sig) = row?;
1045+
match sig {
1046+
Some(s) => ranked.push((signature_hamming(&query_sig, &s), id)),
1047+
None => unsigned_ids.push(id),
1048+
}
1049+
}
1050+
let pool = pool_target(limit);
1051+
ranked.sort_by(|a, b| a.0.cmp(&b.0).then_with(|| a.1.cmp(&b.1)));
1052+
ranked.truncate(pool);
1053+
let mut pool_ids: Vec<String> =
1054+
ranked.into_iter().map(|(_, id)| id).collect();
1055+
pool_ids.append(&mut unsigned_ids);
1056+
1057+
// Fetch full embeddings for the pool only (chunked IN to bound
1058+
// SQL variable count).
1059+
let mut fetched: Vec<(String, Vec<f32>)> = Vec::with_capacity(pool_ids.len());
1060+
for chunk in pool_ids.chunks(500) {
1061+
let placeholders = vec!["?"; chunk.len()].join(",");
1062+
let sql = format!(
1063+
"SELECT id, embedding FROM entities WHERE id IN ({})",
1064+
placeholders
1065+
);
1066+
let mut estmt = conn.prepare(&sql)?;
1067+
let refs: Vec<&dyn rusqlite::types::ToSql> = chunk
1068+
.iter()
1069+
.map(|s| s as &dyn rusqlite::types::ToSql)
1070+
.collect();
1071+
let erows = estmt.query_map(refs.as_slice(), |row| {
1072+
let id: String = row.get(0)?;
1073+
let emb_blob: Vec<u8> = row.get(1)?;
1074+
let emb: Vec<f32> = emb_blob
1075+
.chunks_exact(4)
1076+
.map(|b| f32::from_le_bytes([b[0], b[1], b[2], b[3]]))
1077+
.collect();
1078+
Ok((id, emb))
1079+
})?;
1080+
for r in erows {
1081+
let (id, emb) = r?;
1082+
if emb.len() == dim {
1083+
fetched.push((id, emb));
1084+
}
1085+
}
1086+
}
1087+
fetched
1088+
};
10011089

10021090
// Phase 2: score by cosine similarity, keep the top `limit` ids.
10031091
let mut scored_ids: Vec<(String, f64)>;
@@ -5632,6 +5720,29 @@ fn is_stopword(word: &str) -> bool {
56325720
STOPWORDS.contains(&lower.as_str())
56335721
}
56345722

5723+
/// Sign-bit signature of an embedding.
///
/// Bit `i` of the signature is set iff `v[i] > 0.0`. Bits are packed
/// least-significant-bit first into `ceil(dim / 8)` bytes (dim 384 → 48 bytes
/// vs 1536 for the f32 blob). Zero, negative and NaN components all leave
/// their bit clear, and the unused high bits of a trailing partial byte are
/// always zero, so equal-length signatures compare bit-for-bit.
///
/// For normalized text embeddings, Hamming distance between signatures tracks
/// cosine distance closely enough to prefilter candidates before an exact
/// re-rank.
pub(crate) fn embedding_signature(v: &[f32]) -> Vec<u8> {
    // One output byte per group of up to eight components; working on chunks
    // avoids per-element indexing into the output buffer.
    v.chunks(8)
        .map(|lanes| {
            lanes
                .iter()
                .enumerate()
                .filter(|&(_, &x)| x > 0.0)
                .fold(0u8, |byte, (bit, _)| byte | (1u8 << bit))
        })
        .collect()
}
5736+
5737+
/// Number of differing bits between two packed sign signatures.
///
/// Signatures of unequal byte length (built from embeddings of different
/// dimensionality) are not comparable and report `u32::MAX`, so they sort
/// behind every comparable candidate.
fn signature_hamming(a: &[u8], b: &[u8]) -> u32 {
    if a.len() == b.len() {
        let mut differing = 0u32;
        for (lhs, rhs) in a.iter().zip(b.iter()) {
            // XOR leaves a 1 exactly where the two sign bits disagree.
            differing += (lhs ^ rhs).count_ones();
        }
        differing
    } else {
        u32::MAX
    }
}
5745+
56355746
fn truncate_str(s: &str, max_len: usize) -> String {
56365747
if s.chars().count() <= max_len {
56375748
s.to_string()
@@ -7692,6 +7803,99 @@ mod tests {
76927803
let _ = fs::remove_file(&path);
76937804
}
76947805

7806+
#[test]
fn embedding_signature_packs_sign_bits_and_hamming_tracks_distance() {
    // Packing: only strictly positive components set their bit; zeros and
    // negatives leave it clear. Nine dims spill over into a second byte.
    let nine_dims: [f32; 9] = [1.0, -1.0, 0.0, 0.5, -0.2, 2.0, 0.0, -3.0, 0.1];
    let packed = embedding_signature(&nine_dims);
    assert_eq!(packed.len(), 2, "9 dims pack into 2 bytes");
    assert_eq!(packed[0], 0b0010_1001);
    assert_eq!(packed[1], 0b0000_0001);

    // Distance: a signature is 0 bits from itself; flipping two signs costs 2.
    let base = embedding_signature(&[1.0, 1.0, -1.0, -1.0]);
    let two_flipped = embedding_signature(&[1.0, -1.0, -1.0, 1.0]);
    assert_eq!(signature_hamming(&base, &base), 0);
    assert_eq!(signature_hamming(&base, &two_flipped), 2);

    // A signature of a different byte length is maximally distant.
    assert_eq!(signature_hamming(&base, &packed), u32::MAX);
}
7821+
7822+
#[test]
fn dense_search_signature_prefilter_finds_the_same_top_hits_at_scale() {
    // With enough embedded rows dense_search switches from the exact full
    // scan to the Hamming-prefilter + exact-rerank path. Seed 2200 embedded
    // rows with a deterministic spread plus engineered near-neighbors of the
    // query. The prefiltered path must:
    //   * surface the two signed neighbors, exactly ranked by true cosine;
    //   * still surface a neighbor whose emb_sig is NULL (a row written
    //     without a signature must not be dropped by the prefilter);
    //   * never return an archived row.
    let (db, path) = temp_db();
    let conn = db.conn().unwrap();
    let blob = |v: &[f32]| -> Vec<u8> { v.iter().flat_map(|f| f.to_le_bytes()).collect() };
    let dim = 16usize;
    let query: Vec<f32> = vec![1.0; dim];

    let tx = conn.unchecked_transaction().unwrap();
    {
        let mut stmt = tx
            .prepare(
                "INSERT INTO entities (id, category, key, body_json, type, status,
                 retrieval_count, last_accessed_unix_ms, created_at_unix_ms,
                 decay_score, layer, embedding, emb_sig, archived)
                 VALUES (?1, 'insight', ?2, '{}', 'insight', 'active', 0, 0, 0,
                 1.0, 'working', ?3, ?4, ?5)",
            )
            .unwrap();
        // Single insertion point so every seeded row goes through the same
        // statement; `sig = None` stores a NULL emb_sig.
        let mut insert = |id: &str, key: &str, v: &[f32], sig: Option<Vec<u8>>, archived: i64| {
            stmt.execute(params![id, key, blob(v), sig, archived])
                .unwrap();
        };

        // Deterministic filler vectors: sign pattern derived from the row
        // index bits, pointing all over the hypercube and away from the
        // all-positive query region. Starts at 1: i=0 hashes to the
        // all-positive pattern, which would tie the engineered exact hit
        // (2654435761 is odd, so no other i < 65536 hashes to it).
        for i in 1..=2200u32 {
            let hashed = i.wrapping_mul(2654435761);
            let v: Vec<f32> = (0..dim)
                .map(|d| {
                    // Mixed signs, never all-positive.
                    if (hashed >> (d as u32 % 31)) & 1 == 1 {
                        -1.0
                    } else {
                        0.3
                    }
                })
                .collect();
            insert(
                &format!("filler-{:05}", i),
                &format!("filler-key-{:05}", i),
                &v,
                Some(embedding_signature(&v)),
                0,
            );
        }

        // Engineered hits: the query itself and a slightly-rotated cousin.
        let mut near = query.clone();
        near[0] = 0.6;
        insert(
            "hit-exact",
            "hit-exact-key",
            &query,
            Some(embedding_signature(&query)),
            0,
        );
        insert(
            "hit-near",
            "hit-near-key",
            &near,
            Some(embedding_signature(&near)),
            0,
        );

        // A close neighbor stored WITHOUT a signature. It is a little further
        // from the query than `hit-near`, so it cannot disturb the top two.
        let mut unsigned = query.clone();
        unsigned[0] = 0.6;
        unsigned[1] = 0.6;
        insert("hit-unsigned", "hit-unsigned-key", &unsigned, None, 0);

        // An archived twin of the query must never surface.
        insert(
            "hit-archived",
            "hit-archived-key",
            &query,
            Some(embedding_signature(&query)),
            1,
        );
    }
    tx.commit().unwrap();

    let results = db.dense_search(&query, 5).unwrap();
    let ids: Vec<&str> = results.iter().map(|(e, _)| e.id.as_str()).collect();
    assert_eq!(ids[0], "hit-exact", "true nearest must rank first: {ids:?}");
    assert_eq!(ids[1], "hit-near", "second-nearest must rank second: {ids:?}");
    assert!(
        ids.contains(&"hit-unsigned"),
        "NULL-signature row was dropped by the prefilter: {ids:?}"
    );
    assert!(!ids.contains(&"hit-archived"), "archived row leaked: {ids:?}");
    assert!(
        results[0].1 > results[1].1,
        "scores must be exact-cosine ordered"
    );

    let _ = fs::remove_file(&path);
}
7898+
76957899
// #226: dense/hybrid recall must embed the query, not silently fall back to
76967900
// FTS5. With no embedding backend configured, a dense recall over a
76977901
// non-empty query surfaces the backend error instead of returning keyword

src/schema.rs

Lines changed: 61 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,11 @@ CREATE TABLE IF NOT EXISTS entities (
2626
created_at_unix_ms INTEGER NOT NULL,
2727
last_accessed_unix_ms INTEGER NOT NULL,
2828
embedding BLOB,
29+
-- Sign-bit signature of `embedding` (v2.13.0, dim/8 bytes): bit i set iff
30+
-- embedding[i] > 0. dense_search Hamming-prefilters on this instead of
31+
-- reading every full embedding blob once the vault is large enough.
32+
-- Written by store_embedding; backfilled by the v6 migration.
33+
emb_sig BLOB,
2934
always_on INTEGER DEFAULT 0,
3035
certainty REAL DEFAULT 0.5,
3136
-- Persistent importance floor (v2.13.0). Set by mimir_score; decay_tick and
@@ -145,7 +150,7 @@ CREATE INDEX IF NOT EXISTS idx_entity_history_catkey ON entity_history(category,
145150
/// the column-add migrations below have been applied. Bump this whenever you add
146151
/// a new ALTER-probe migration in `initialize_schema`, or existing databases
147152
/// (already at the previous level) will skip it.
148-
const SCHEMA_VERSION: i64 = 5;
153+
const SCHEMA_VERSION: i64 = 6;
149154

150155
/// Initialize the v0.2.0 schema on a fresh database.
151156
pub fn initialize_schema(conn: &Connection) -> Result<(), Box<dyn std::error::Error>> {
@@ -275,6 +280,35 @@ pub fn initialize_schema(conn: &Connection) -> Result<(), Box<dyn std::error::Er
275280
conn.execute_batch("ALTER TABLE entities ADD COLUMN importance REAL DEFAULT 0.0;")?;
276281
}
277282

283+
// v6: sign-bit embedding signatures for the dense-search prefilter, plus a
284+
// backfill for embeddings stored before the column existed. Bounded work:
285+
// one pass over embedded rows, ~50 bytes written per row.
286+
if conn.prepare("SELECT emb_sig FROM entities LIMIT 1").is_err() {
287+
conn.execute_batch("ALTER TABLE entities ADD COLUMN emb_sig BLOB;")?;
288+
}
289+
{
290+
let mut stmt = conn.prepare(
291+
"SELECT id, embedding FROM entities \
292+
WHERE embedding IS NOT NULL AND emb_sig IS NULL",
293+
)?;
294+
let rows = stmt.query_map([], |r| {
295+
Ok((r.get::<_, String>(0)?, r.get::<_, Vec<u8>>(1)?))
296+
})?;
297+
let pending: Vec<(String, Vec<u8>)> = rows.filter_map(|r| r.ok()).collect();
298+
drop(stmt);
299+
for (id, blob) in pending {
300+
let emb: Vec<f32> = blob
301+
.chunks_exact(4)
302+
.map(|b| f32::from_le_bytes([b[0], b[1], b[2], b[3]]))
303+
.collect();
304+
let sig = crate::db::embedding_signature(&emb);
305+
conn.execute(
306+
"UPDATE entities SET emb_sig = ?1 WHERE id = ?2",
307+
params![sig, id],
308+
)?;
309+
}
310+
}
311+
278312
// v4 (#339): identity becomes (category, key, workspace_hash). A plain
279313
// (category, key) uniqueness made cross-workspace key collisions
280314
// unstorable, which is what forced mimir_share's "copy into workspace" to
@@ -690,6 +724,32 @@ mod tests {
690724
);
691725
}
692726

727+
#[test]
fn migration_backfills_embedding_signatures() {
    // v6: an embedding stored before emb_sig existed must receive a signature
    // when the schema is re-initialized, and that signature must equal what
    // `embedding_signature` produces for the same vector.
    let (conn, _path) = temp_db();
    initialize_schema(&conn).expect("fresh init");

    // Simulate a pre-v6 row: embedding present, emb_sig NULL, and the stored
    // schema version rewound to 5.
    let legacy_embedding: Vec<f32> = vec![1.0, -2.0, 0.5, -0.1];
    let mut legacy_blob: Vec<u8> = Vec::with_capacity(legacy_embedding.len() * 4);
    for component in &legacy_embedding {
        legacy_blob.extend_from_slice(&component.to_le_bytes());
    }
    conn.execute(
        "INSERT INTO entities (id, category, key, body_json, embedding, emb_sig,
         created_at_unix_ms, last_accessed_unix_ms)
         VALUES ('sig-1', 'note', 'k', '{}', ?1, NULL, 0, 0)",
        params![legacy_blob],
    )
    .unwrap();
    conn.pragma_update(None, "user_version", 5).unwrap();

    initialize_schema(&conn).expect("v5 -> v6 migration");

    let expected = crate::db::embedding_signature(&legacy_embedding);
    let stored: Vec<u8> = conn
        .query_row(
            "SELECT emb_sig FROM entities WHERE id = 'sig-1'",
            [],
            |row| row.get(0),
        )
        .expect("emb_sig must be backfilled");
    assert_eq!(stored, expected);
}
752+
693753
#[test]
694754
fn adds_bitemporal_columns_and_backfills_recorded_at() {
695755
// A legacy DB (no bi-temporal columns) with one row predating the migration.

0 commit comments

Comments
 (0)