Skip to content

Commit cf4adb0

Browse files
tcconnally and claude
authored
perf: batch graph_expand hydration + cache consolidate trigram sets (#340)
Two hot-path fixes from the 2026-06-29 perf audit:

- graph_expand (hybrid-recall graph arm) issued one point-query per discovered link (N+1). It now discovers candidate ids in deterministic seed/link order, then hydrates them with one IN(...) query per 500-id chunk. Archived/missing neighbors still don't consume max_neighbors slots, and the cap still follows link order — covered by a new regression test.
- consolidate rebuilt BOTH entities' trigram sets inside trigram_similarity for every pair in the O(n^2) union-find scan. Sets are now precomputed once per entity (the same #209 pattern find_near_duplicate already uses); the equal-body fast path is kept so sub-trigram-length identical bodies still score 1.0.

Test: graph_expand_caps_at_max_neighbors_in_link_order.
Suite: 160 passed / 0 failed.

Co-authored-by: tcconnally <hermes@perseus.observer>
Co-authored-by: Claude Fable 5 <noreply@anthropic.com>
1 parent f132a13 commit cf4adb0

1 file changed

Lines changed: 115 additions & 8 deletions

File tree

src/db.rs

Lines changed: 115 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3648,9 +3648,21 @@ impl Database {
36483648
}
36493649
}
36503650

3651+
// Precompute each entity's trigram set once (the #209 pattern from
3652+
// find_near_duplicate): trigram_similarity rebuilds BOTH sets on every
3653+
// call, so the pairwise scan was doing O(n²) set constructions on top
3654+
// of the O(n²) comparisons. The equal-body check preserves
3655+
// trigram_similarity's exact-match semantics for bodies shorter than
3656+
// one trigram (their sets are empty, which would otherwise score 0.0).
3657+
let trigram_sets: Vec<std::collections::HashSet<[char; 3]>> =
3658+
entities.iter().map(|e| Self::trigrams(&e.2)).collect();
36513659
for i in 0..n {
36523660
for j in (i + 1)..n {
3653-
let sim = Self::trigram_similarity(&entities[i].2, &entities[j].2);
3661+
let sim = if entities[i].2 == entities[j].2 && !entities[i].2.is_empty() {
3662+
1.0
3663+
} else {
3664+
Self::trigram_overlap(&trigram_sets[i], &trigram_sets[j])
3665+
};
36543666
if sim >= params.similarity_threshold {
36553667
union(&mut parent, i, j);
36563668
}
@@ -5361,8 +5373,10 @@ impl Database {
53615373
let seed_ids: std::collections::HashSet<&str> =
53625374
seeds.iter().map(|e| e.id.as_str()).collect();
53635375
let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
5364-
let mut out = Vec::new();
53655376

5377+
// Phase 1: discover candidate neighbor ids in deterministic seed/link
5378+
// order (which is what makes the max_neighbors cut reproducible).
5379+
let mut ordered_ids: Vec<String> = Vec::new();
53665380
for seed in seeds {
53675381
for link in &seed.links {
53685382
if seed_ids.contains(link.target_id.as_str()) {
@@ -5371,14 +5385,56 @@ impl Database {
53715385
if !seen.insert(link.target_id.clone()) {
53725386
continue; // already discovered via another seed
53735387
}
5374-
if let Some(neighbor) = self.get_entity_by_id_public(&link.target_id)? {
5375-
if !neighbor.archived {
5376-
out.push((neighbor, 1.0));
5388+
ordered_ids.push(link.target_id.clone());
5389+
}
5390+
}
5391+
if ordered_ids.is_empty() {
5392+
return Ok(Vec::new());
5393+
}
5394+
5395+
// Phase 2: hydrate neighbors with one IN(...) query per chunk instead
5396+
// of a point-query per link (this was an N+1 on the hybrid-recall hot
5397+
// path). Chunked to keep the SQL variable count bounded; iteration
5398+
// order over chunks preserves phase-1 order, and archived/missing
5399+
// neighbors don't count toward the cap — both as before.
5400+
let conn = self.conn()?;
5401+
let enc = self.encryption.as_ref();
5402+
let mut out = Vec::new();
5403+
'chunks: for chunk in ordered_ids.chunks(500) {
5404+
let placeholders = (1..=chunk.len())
5405+
.map(|i| format!("?{}", i))
5406+
.collect::<Vec<_>>()
5407+
.join(", ");
5408+
let sql = format!(
5409+
"SELECT id, category, key, body_json, status, type, tags,
5410+
decay_score, retrieval_count, layer, topic_path,
5411+
archived, archive_reason, links, verified, source,
5412+
created_at_unix_ms, last_accessed_unix_ms, NULL as embedding,
5413+
always_on, certainty, workspace_hash, agent_id, visibility,
5414+
follow_count, miss_count, follow_rate, efficacy_status
5415+
FROM entities WHERE archived = 0 AND id IN ({})",
5416+
placeholders
5417+
);
5418+
let mut stmt = conn.prepare(&sql)?;
5419+
let param_refs: Vec<&dyn rusqlite::types::ToSql> = chunk
5420+
.iter()
5421+
.map(|s| s as &dyn rusqlite::types::ToSql)
5422+
.collect();
5423+
let rows =
5424+
stmt.query_map(param_refs.as_slice(), |row| entity_from_row(row, enc))?;
5425+
let mut by_id: std::collections::HashMap<String, crate::models::Entity> =
5426+
std::collections::HashMap::new();
5427+
for row in rows {
5428+
let e = row?;
5429+
by_id.insert(e.id.clone(), e);
5430+
}
5431+
for id in chunk {
5432+
if let Some(e) = by_id.remove(id) {
5433+
out.push((e, 1.0));
5434+
if out.len() >= max_neighbors {
5435+
break 'chunks;
53775436
}
53785437
}
5379-
if out.len() >= max_neighbors {
5380-
return Ok(out);
5381-
}
53825438
}
53835439
}
53845440
Ok(out)
@@ -8158,6 +8214,57 @@ mod tests {
81588214
let _ = fs::remove_file(&path);
81598215
}
81608216

8217+
#[test]
8218+
fn graph_expand_caps_at_max_neighbors_in_link_order() {
8219+
// The batched-hydration rewrite must keep the old point-query
8220+
// semantics: the max_neighbors cut follows deterministic seed/link
8221+
// order, and archived/missing neighbors don't count toward the cap.
8222+
let (db, path) = temp_db();
8223+
8224+
// Bodies must be dissimilar enough (>30% trigram distance) that
8225+
// remember()'s near-duplicate check doesn't merge them.
8226+
let bodies = [
8227+
r#"{"note":"postgres primary with streaming replication"}"#,
8228+
r#"{"note":"redis cache fronting the session store"}"#,
8229+
r#"{"note":"kafka event bus for order workflows"}"#,
8230+
r#"{"note":"nginx ingress terminating tls"}"#,
8231+
];
8232+
for (i, body) in bodies.iter().enumerate() {
8233+
db.remember(&make_entity(
8234+
&format!("cap-n{}", i + 1),
8235+
"architecture",
8236+
&format!("cap-svc-{}", i + 1),
8237+
body,
8238+
))
8239+
.unwrap();
8240+
}
8241+
db.remember(&make_entity(
8242+
"cap-hub",
8243+
"architecture",
8244+
"cap-hub-key",
8245+
r#"{"note":"hub for cap test"}"#,
8246+
))
8247+
.unwrap();
8248+
// n2 gets archived: it sits FIRST in link order but must not consume
8249+
// a cap slot.
8250+
db.link("architecture", "cap-hub-key", "cap-n2", "depends_on").unwrap();
8251+
db.link("architecture", "cap-hub-key", "cap-n1", "depends_on").unwrap();
8252+
db.link("architecture", "cap-hub-key", "cap-n3", "depends_on").unwrap();
8253+
db.link("architecture", "cap-hub-key", "cap-n4", "depends_on").unwrap();
8254+
db.forget("architecture", "cap-svc-2", "retired").unwrap();
8255+
8256+
let seed = db.get_entity("architecture", "cap-hub-key").unwrap().unwrap();
8257+
let expanded = db.graph_expand(&[seed], 2).unwrap();
8258+
let ids: Vec<&str> = expanded.iter().map(|(e, _)| e.id.as_str()).collect();
8259+
assert_eq!(
8260+
ids,
8261+
vec!["cap-n1", "cap-n3"],
8262+
"cap follows link order, skipping the archived first link"
8263+
);
8264+
8265+
let _ = fs::remove_file(&path);
8266+
}
8267+
81618268
#[cfg(feature = "bundled-embeddings")]
81628269
#[test]
81638270
fn hybrid_recall_surfaces_linked_neighbor_via_graph_arm() {

0 commit comments

Comments
 (0)