@@ -586,7 +586,8 @@ impl Database {
586586 Ok ( result)
587587 }
588588
589- /// Store a dense vector embedding for an entity.
589+ /// Store a dense vector embedding for an entity (and its sign-bit
590+ /// signature — see `embedding_signature` / the dense_search prefilter).
590591 #[ allow( dead_code) ]
591592 pub fn store_embedding (
592593 & self ,
@@ -595,9 +596,10 @@ impl Database {
595596 ) -> Result < ( ) , Box < dyn std:: error:: Error > > {
596597 let conn = self . conn ( ) ?;
597598 let blob: Vec < u8 > = embedding. iter ( ) . flat_map ( |f| f. to_le_bytes ( ) ) . collect ( ) ;
599+ let sig = embedding_signature ( embedding) ;
598600 conn. execute (
599- "UPDATE entities SET embedding = ?1 WHERE id = ?2 " ,
600- params ! [ blob, id] ,
601+ "UPDATE entities SET embedding = ?1, emb_sig = ?2 WHERE id = ?3 " ,
602+ params ! [ blob, sig , id] ,
601603 ) ?;
602604 Ok ( ( ) )
603605 }
@@ -976,28 +978,114 @@ impl Database {
976978 let max_scan = 50_000 ; // safety ceiling — databases beyond this should use HNSW
977979 let dim = query_vec. len ( ) ;
978980
981+ // Signature prefilter cutover point. Below this many embedded rows the
982+ // exact full scan is already cheap AND stays byte-identical to the
983+ // historical behavior; above it, reading every f32 blob dominates
984+ // query time, so we Hamming-prefilter on the dim/8-byte sign
985+ // signatures and only read full embeddings for an oversampled pool.
986+ const DENSE_SIG_PREFILTER_MIN_ROWS : i64 = 2048 ;
987+ // Exact-rerank pool size: generous oversampling keeps the top-k from
988+ // the prefiltered path effectively identical to the exact scan for
989+ // normalized text embeddings.
990+ let pool_target = |limit : usize | ( limit. saturating_mul ( 16 ) ) . clamp ( 512 , 4096 ) ;
991+
992+ let embedded_rows: i64 = conn. query_row (
993+ "SELECT COUNT(*) FROM entities WHERE archived = 0 AND embedding IS NOT NULL" ,
994+ [ ] ,
995+ |r| r. get ( 0 ) ,
996+ ) ?;
997+
979998 // Phase 1 (#209): lightweight scan — read only id + embedding for scoring.
980999 // The old query hydrated EVERY candidate (decrypt body, parse tags/links)
9811000 // up to max_scan just to score and then keep top-k. Defer full hydration
9821001 // to the surviving top-k in phase 3.
983- let mut stmt = conn. prepare ( & format ! (
984- "SELECT id, embedding FROM entities \
985- WHERE archived = 0 AND embedding IS NOT NULL LIMIT {}",
986- max_scan
987- ) ) ?;
988- let rows = stmt. query_map ( [ ] , |row| {
989- let id: String = row. get ( 0 ) ?;
990- let emb_blob: Vec < u8 > = row. get ( 1 ) ?;
991- let emb: Vec < f32 > = emb_blob
992- . chunks_exact ( 4 )
993- . map ( |b| f32:: from_le_bytes ( [ b[ 0 ] , b[ 1 ] , b[ 2 ] , b[ 3 ] ] ) )
994- . collect ( ) ;
995- Ok ( ( id, emb) )
996- } ) ?;
997- let candidates: Vec < ( String , Vec < f32 > ) > = rows
998- . filter_map ( |r| r. ok ( ) )
999- . filter ( |( _, emb) | emb. len ( ) == dim)
1000- . collect ( ) ;
1002+ let candidates: Vec < ( String , Vec < f32 > ) > = if embedded_rows
1003+ < DENSE_SIG_PREFILTER_MIN_ROWS
1004+ {
1005+ let mut stmt = conn. prepare ( & format ! (
1006+ "SELECT id, embedding FROM entities \
1007+ WHERE archived = 0 AND embedding IS NOT NULL LIMIT {}",
1008+ max_scan
1009+ ) ) ?;
1010+ let rows = stmt. query_map ( [ ] , |row| {
1011+ let id: String = row. get ( 0 ) ?;
1012+ let emb_blob: Vec < u8 > = row. get ( 1 ) ?;
1013+ let emb: Vec < f32 > = emb_blob
1014+ . chunks_exact ( 4 )
1015+ . map ( |b| f32:: from_le_bytes ( [ b[ 0 ] , b[ 1 ] , b[ 2 ] , b[ 3 ] ] ) )
1016+ . collect ( ) ;
1017+ Ok ( ( id, emb) )
1018+ } ) ?;
1019+ rows. filter_map ( |r| r. ok ( ) )
1020+ . filter ( |( _, emb) | emb. len ( ) == dim)
1021+ . collect ( )
1022+ } else {
1023+ // Phase 0: signature prefilter. Scan only id + emb_sig (~48 bytes
1024+ // vs ~1.5KB per row for 384-dim), rank by Hamming distance to the
1025+ // query signature with an id tie-break (deterministic), keep an
1026+ // oversampled pool, then read full embeddings for the pool only.
1027+ // Rows with a NULL signature (written by a pre-v6 binary after
1028+ // migration) are always included so they can't be silently lost.
1029+ let query_sig = embedding_signature ( query_vec) ;
1030+ let mut stmt = conn. prepare ( & format ! (
1031+ "SELECT id, emb_sig FROM entities \
1032+ WHERE archived = 0 AND embedding IS NOT NULL LIMIT {}",
1033+ max_scan
1034+ ) ) ?;
1035+ let rows = stmt. query_map ( [ ] , |row| {
1036+ Ok ( (
1037+ row. get :: < _ , String > ( 0 ) ?,
1038+ row. get :: < _ , Option < Vec < u8 > > > ( 1 ) ?,
1039+ ) )
1040+ } ) ?;
1041+ let mut ranked: Vec < ( u32 , String ) > = Vec :: new ( ) ;
1042+ let mut unsigned_ids: Vec < String > = Vec :: new ( ) ;
1043+ for row in rows {
1044+ let ( id, sig) = row?;
1045+ match sig {
1046+ Some ( s) => ranked. push ( ( signature_hamming ( & query_sig, & s) , id) ) ,
1047+ None => unsigned_ids. push ( id) ,
1048+ }
1049+ }
1050+ let pool = pool_target ( limit) ;
1051+ ranked. sort_by ( |a, b| a. 0 . cmp ( & b. 0 ) . then_with ( || a. 1 . cmp ( & b. 1 ) ) ) ;
1052+ ranked. truncate ( pool) ;
1053+ let mut pool_ids: Vec < String > =
1054+ ranked. into_iter ( ) . map ( |( _, id) | id) . collect ( ) ;
1055+ pool_ids. append ( & mut unsigned_ids) ;
1056+
1057+ // Fetch full embeddings for the pool only (chunked IN to bound
1058+ // SQL variable count).
1059+ let mut fetched: Vec < ( String , Vec < f32 > ) > = Vec :: with_capacity ( pool_ids. len ( ) ) ;
1060+ for chunk in pool_ids. chunks ( 500 ) {
1061+ let placeholders = vec ! [ "?" ; chunk. len( ) ] . join ( "," ) ;
1062+ let sql = format ! (
1063+ "SELECT id, embedding FROM entities WHERE id IN ({})" ,
1064+ placeholders
1065+ ) ;
1066+ let mut estmt = conn. prepare ( & sql) ?;
1067+ let refs: Vec < & dyn rusqlite:: types:: ToSql > = chunk
1068+ . iter ( )
1069+ . map ( |s| s as & dyn rusqlite:: types:: ToSql )
1070+ . collect ( ) ;
1071+ let erows = estmt. query_map ( refs. as_slice ( ) , |row| {
1072+ let id: String = row. get ( 0 ) ?;
1073+ let emb_blob: Vec < u8 > = row. get ( 1 ) ?;
1074+ let emb: Vec < f32 > = emb_blob
1075+ . chunks_exact ( 4 )
1076+ . map ( |b| f32:: from_le_bytes ( [ b[ 0 ] , b[ 1 ] , b[ 2 ] , b[ 3 ] ] ) )
1077+ . collect ( ) ;
1078+ Ok ( ( id, emb) )
1079+ } ) ?;
1080+ for r in erows {
1081+ let ( id, emb) = r?;
1082+ if emb. len ( ) == dim {
1083+ fetched. push ( ( id, emb) ) ;
1084+ }
1085+ }
1086+ }
1087+ fetched
1088+ } ;
10011089
10021090 // Phase 2: score by cosine similarity, keep the top `limit` ids.
10031091 let mut scored_ids: Vec < ( String , f64 ) > ;
@@ -5632,6 +5720,29 @@ fn is_stopword(word: &str) -> bool {
56325720 STOPWORDS . contains ( & lower. as_str ( ) )
56335721}
56345722
/// Packs the sign pattern of an embedding into a compact bit signature.
///
/// Lane `i` of `v` maps to bit `i % 8` (least-significant bit first) of byte
/// `i / 8`. The bit is set only when the lane is strictly positive; zero,
/// negative and NaN lanes leave it clear. The result holds `ceil(v.len() / 8)`
/// bytes (a 384-dim vector yields 48 bytes versus 1536 for its f32 blob), and
/// an empty slice yields an empty signature.
///
/// The dense-search prefilter ranks rows by Hamming distance between these
/// signatures before reading full embeddings for an exact re-rank.
pub(crate) fn embedding_signature(v: &[f32]) -> Vec<u8> {
    v.chunks(8)
        .map(|lanes| {
            // Fold up to eight lanes into one byte, one bit per positive lane.
            lanes
                .iter()
                .enumerate()
                .filter(|&(_, value)| *value > 0.0)
                .fold(0u8, |byte, (bit, _)| byte | (1u8 << bit))
        })
        .collect()
}
5736+
/// Counts the bits that differ between two sign-bit signatures.
///
/// Signatures of unequal length (embeddings of different dimensionality) are
/// not comparable, so they are reported as `u32::MAX`, the worst possible
/// distance. Two empty signatures are at distance 0.
fn signature_hamming(a: &[u8], b: &[u8]) -> u32 {
    if a.len() == b.len() {
        let mut differing_bits = 0u32;
        for (lhs, rhs) in a.iter().zip(b.iter()) {
            // XOR leaves a 1 exactly where the two bytes disagree.
            differing_bits += (lhs ^ rhs).count_ones();
        }
        differing_bits
    } else {
        u32::MAX
    }
}
5745+
56355746fn truncate_str ( s : & str , max_len : usize ) -> String {
56365747 if s. chars ( ) . count ( ) <= max_len {
56375748 s. to_string ( )
@@ -7692,6 +7803,99 @@ mod tests {
76927803 let _ = fs:: remove_file ( & path) ;
76937804 }
76947805
#[test]
fn embedding_signature_packs_sign_bits_and_hamming_tracks_distance() {
    // Nine lanes need a second byte. Only strictly positive lanes (indices
    // 0, 3, 5 and 8 here) set their bit; zeros and negatives stay clear.
    let nine_lanes: [f32; 9] = [1.0, -1.0, 0.0, 0.5, -0.2, 2.0, 0.0, -3.0, 0.1];
    let packed = embedding_signature(&nine_lanes);
    assert_eq!(packed.len(), 2, "9 dims pack into 2 bytes");
    assert_eq!(packed[0], 0b0010_1001);
    assert_eq!(packed[1], 0b0000_0001);

    // Two 4-lane vectors whose signs disagree in lanes 1 and 3.
    let first = embedding_signature(&[1.0, 1.0, -1.0, -1.0]);
    let second = embedding_signature(&[1.0, -1.0, -1.0, 1.0]);
    assert_eq!(signature_hamming(&first, &first), 0);
    assert_eq!(signature_hamming(&first, &second), 2);

    // A 1-byte signature against a 2-byte one: mismatched lengths always
    // score as the maximum distance.
    assert_eq!(signature_hamming(&first, &packed), u32::MAX);
}
7821+
#[test]
fn dense_search_signature_prefilter_finds_the_same_top_hits_at_scale() {
    // Seeds 2200 filler rows plus two live engineered neighbours of the query,
    // which puts the embedded, non-archived row count past
    // DENSE_SIG_PREFILTER_MIN_ROWS so dense_search takes the signature
    // prefilter + exact re-rank branch. Both neighbours must come back in
    // true-cosine order, and the archived twin must never appear.
    let (db, path) = temp_db();
    let conn = db.conn().unwrap();
    let dim = 16usize;

    // Same little-endian f32 serialization the stored embedding blobs use.
    let blob = |v: &[f32]| -> Vec<u8> { v.iter().flat_map(|f| f.to_le_bytes()).collect() };

    // Deterministic filler: each lane's sign comes from one bit of a
    // multiplicative hash of the row index. Row 0 would hash to the
    // all-positive pattern and tie the exact hit, so the caller starts at 1;
    // the multiplier is odd, so no other index below 65536 has all sixteen
    // low bits clear.
    let filler_vector = |row: u32| -> Vec<f32> {
        let hash = row.wrapping_mul(2654435761);
        (0..dim)
            .map(|lane| {
                if (hash >> (lane as u32 % 31)) & 1 == 1 {
                    -1.0
                } else {
                    0.3
                }
            })
            .collect()
    };

    // The query itself and a slightly rotated cousin.
    let query: Vec<f32> = vec![1.0; dim];
    let mut near = query.clone();
    near[0] = 0.6;

    let tx = conn.unchecked_transaction().unwrap();
    {
        let mut insert = tx
            .prepare(
                "INSERT INTO entities (id, category, key, body_json, type, status,
                    retrieval_count, last_accessed_unix_ms, created_at_unix_ms,
                    decay_score, layer, embedding, emb_sig, archived)
                 VALUES (?1, 'insight', ?2, '{}', 'insight', 'active', 0, 0, 0,
                    1.0, 'working', ?3, ?4, ?5)",
            )
            .unwrap();

        for i in 1..=2200u32 {
            let filler = filler_vector(i);
            insert
                .execute(params![
                    format!("filler-{:05}", i),
                    format!("filler-key-{:05}", i),
                    blob(&filler),
                    embedding_signature(&filler),
                    0i64
                ])
                .unwrap();
        }

        // (id, key, vector, archived) — inserted after the fillers, in this
        // order. The last row is an archived twin of the query.
        let engineered: [(&str, &str, &[f32], i64); 3] = [
            ("hit-exact", "hit-exact-key", query.as_slice(), 0),
            ("hit-near", "hit-near-key", near.as_slice(), 0),
            ("hit-archived", "hit-archived-key", query.as_slice(), 1),
        ];
        for (id, key, vector, archived) in engineered {
            insert
                .execute(params![
                    id,
                    key,
                    blob(vector),
                    embedding_signature(vector),
                    archived
                ])
                .unwrap();
        }
    }
    tx.commit().unwrap();

    let results = db.dense_search(&query, 5).unwrap();
    let ids: Vec<&str> = results.iter().map(|(e, _)| e.id.as_str()).collect();
    assert_eq!(ids[0], "hit-exact", "true nearest must rank first: {ids:?}");
    assert_eq!(ids[1], "hit-near", "second-nearest must rank second: {ids:?}");
    assert!(!ids.contains(&"hit-archived"), "archived row leaked: {ids:?}");
    assert!(
        results[0].1 > results[1].1,
        "scores must be exact-cosine ordered"
    );

    let _ = fs::remove_file(&path);
}
7898+
76957899 // #226: dense/hybrid recall must embed the query, not silently fall back to
76967900 // FTS5. With no embedding backend configured, a dense recall over a
76977901 // non-empty query surfaces the backend error instead of returning keyword
0 commit comments