@@ -3648,9 +3648,21 @@ impl Database {
36483648 }
36493649 }
36503650
3651+ // Precompute each entity's trigram set once (the #209 pattern from
3652+ // find_near_duplicate): trigram_similarity rebuilds BOTH sets on every
3653+ // call, so the pairwise scan was doing O(n²) set constructions on top
3654+ // of the O(n²) comparisons. The equal-body check preserves
3655+ // trigram_similarity's exact-match semantics for bodies shorter than
3656+ // one trigram (their sets are empty, which would otherwise score 0.0).
3657+ let trigram_sets: Vec < std:: collections:: HashSet < [ char ; 3 ] > > =
3658+ entities. iter ( ) . map ( |e| Self :: trigrams ( & e. 2 ) ) . collect ( ) ;
36513659 for i in 0 ..n {
36523660 for j in ( i + 1 ) ..n {
3653- let sim = Self :: trigram_similarity ( & entities[ i] . 2 , & entities[ j] . 2 ) ;
3661+ let sim = if entities[ i] . 2 == entities[ j] . 2 && !entities[ i] . 2 . is_empty ( ) {
3662+ 1.0
3663+ } else {
3664+ Self :: trigram_overlap ( & trigram_sets[ i] , & trigram_sets[ j] )
3665+ } ;
36543666 if sim >= params. similarity_threshold {
36553667 union ( & mut parent, i, j) ;
36563668 }
@@ -5361,8 +5373,10 @@ impl Database {
53615373 let seed_ids: std:: collections:: HashSet < & str > =
53625374 seeds. iter ( ) . map ( |e| e. id . as_str ( ) ) . collect ( ) ;
53635375 let mut seen: std:: collections:: HashSet < String > = std:: collections:: HashSet :: new ( ) ;
5364- let mut out = Vec :: new ( ) ;
53655376
5377+ // Phase 1: discover candidate neighbor ids in deterministic seed/link
5378+ // order (which is what makes the max_neighbors cut reproducible).
5379+ let mut ordered_ids: Vec < String > = Vec :: new ( ) ;
53665380 for seed in seeds {
53675381 for link in & seed. links {
53685382 if seed_ids. contains ( link. target_id . as_str ( ) ) {
@@ -5371,14 +5385,56 @@ impl Database {
53715385 if !seen. insert ( link. target_id . clone ( ) ) {
53725386 continue ; // already discovered via another seed
53735387 }
5374- if let Some ( neighbor) = self . get_entity_by_id_public ( & link. target_id ) ? {
5375- if !neighbor. archived {
5376- out. push ( ( neighbor, 1.0 ) ) ;
5388+ ordered_ids. push ( link. target_id . clone ( ) ) ;
5389+ }
5390+ }
5391+ if ordered_ids. is_empty ( ) {
5392+ return Ok ( Vec :: new ( ) ) ;
5393+ }
5394+
5395+ // Phase 2: hydrate neighbors with one IN(...) query per chunk instead
5396+ // of a point-query per link (this was an N+1 on the hybrid-recall hot
5397+ // path). Chunked to keep the SQL variable count bounded; iteration
5398+ // order over chunks preserves phase-1 order, and archived/missing
5399+ // neighbors don't count toward the cap — both as before.
5400+ let conn = self . conn ( ) ?;
5401+ let enc = self . encryption . as_ref ( ) ;
5402+ let mut out = Vec :: new ( ) ;
5403+ ' chunks: for chunk in ordered_ids. chunks ( 500 ) {
5404+ let placeholders = ( 1 ..=chunk. len ( ) )
5405+ . map ( |i| format ! ( "?{}" , i) )
5406+ . collect :: < Vec < _ > > ( )
5407+ . join ( ", " ) ;
5408+ let sql = format ! (
5409+ "SELECT id, category, key, body_json, status, type, tags,
5410+ decay_score, retrieval_count, layer, topic_path,
5411+ archived, archive_reason, links, verified, source,
5412+ created_at_unix_ms, last_accessed_unix_ms, NULL as embedding,
5413+ always_on, certainty, workspace_hash, agent_id, visibility,
5414+ follow_count, miss_count, follow_rate, efficacy_status
5415+ FROM entities WHERE archived = 0 AND id IN ({})" ,
5416+ placeholders
5417+ ) ;
5418+ let mut stmt = conn. prepare ( & sql) ?;
5419+ let param_refs: Vec < & dyn rusqlite:: types:: ToSql > = chunk
5420+ . iter ( )
5421+ . map ( |s| s as & dyn rusqlite:: types:: ToSql )
5422+ . collect ( ) ;
5423+ let rows =
5424+ stmt. query_map ( param_refs. as_slice ( ) , |row| entity_from_row ( row, enc) ) ?;
5425+ let mut by_id: std:: collections:: HashMap < String , crate :: models:: Entity > =
5426+ std:: collections:: HashMap :: new ( ) ;
5427+ for row in rows {
5428+ let e = row?;
5429+ by_id. insert ( e. id . clone ( ) , e) ;
5430+ }
5431+ for id in chunk {
5432+ if let Some ( e) = by_id. remove ( id) {
5433+ out. push ( ( e, 1.0 ) ) ;
5434+ if out. len ( ) >= max_neighbors {
5435+ break ' chunks;
53775436 }
53785437 }
5379- if out. len ( ) >= max_neighbors {
5380- return Ok ( out) ;
5381- }
53825438 }
53835439 }
53845440 Ok ( out)
@@ -8158,6 +8214,57 @@ mod tests {
81588214 let _ = fs:: remove_file ( & path) ;
81598215 }
81608216
#[test]
fn graph_expand_caps_at_max_neighbors_in_link_order() {
    // Regression guard for the batched neighbor hydration in graph_expand:
    // the max_neighbors cut has to follow seed/link order, and a neighbor
    // that is archived must not use up one of the cap slots.
    let (db, path) = temp_db();

    // The notes are deliberately unrelated to each other so that the
    // near-duplicate check in remember() (>30% trigram distance required)
    // keeps them as four separate entities.
    let notes = [
        r#"{"note":"postgres primary with streaming replication"}"#,
        r#"{"note":"redis cache fronting the session store"}"#,
        r#"{"note":"kafka event bus for order workflows"}"#,
        r#"{"note":"nginx ingress terminating tls"}"#,
    ];
    // Ids and keys are numbered from 1: cap-n1..cap-n4 / cap-svc-1..cap-svc-4.
    for (ordinal, note) in (1..).zip(notes.iter()) {
        let neighbor_id = format!("cap-n{}", ordinal);
        let neighbor_key = format!("cap-svc-{}", ordinal);
        db.remember(&make_entity(
            &neighbor_id,
            "architecture",
            &neighbor_key,
            note,
        ))
        .unwrap();
    }

    // The hub is the single seed whose outgoing links get expanded.
    let hub = make_entity(
        "cap-hub",
        "architecture",
        "cap-hub-key",
        r#"{"note":"hub for cap test"}"#,
    );
    db.remember(&hub).unwrap();

    // Link order matters: cap-n2 is linked FIRST and then archived below,
    // so it leads the link list yet must not occupy a cap slot.
    for target in ["cap-n2", "cap-n1", "cap-n3", "cap-n4"] {
        db.link("architecture", "cap-hub-key", target, "depends_on")
            .unwrap();
    }
    db.forget("architecture", "cap-svc-2", "retired").unwrap();

    let seed = db
        .get_entity("architecture", "cap-hub-key")
        .unwrap()
        .unwrap();
    let expanded = db.graph_expand(&[seed], 2).unwrap();
    let ids: Vec<&str> = expanded
        .iter()
        .map(|(neighbor, _score)| neighbor.id.as_str())
        .collect();
    assert_eq!(
        ids,
        vec!["cap-n1", "cap-n3"],
        "cap follows link order, skipping the archived first link"
    );

    // Best-effort cleanup of the temporary database file.
    let _ = fs::remove_file(&path);
}
8267+
81618268 #[ cfg( feature = "bundled-embeddings" ) ]
81628269 #[ test]
81638270 fn hybrid_recall_surfaces_linked_neighbor_via_graph_arm ( ) {
0 commit comments