Skip to content

Commit 19f9f38

Browse files
tcconnallytcconnallyclaude
authored
fix(cohere): gate auto-links on content similarity (#300) (#316)
cohere's auto-link step linked *every* same-category pair (with non-empty tags) and stamped a blanket "auto-related" weight of 0.5 with no relatedness check, so mimir_traverse surfaced meaningless edges. Gate the link on trigram similarity (the same dependency-free measure used for dedup/conflict detection): over-fetch a bounded candidate pool, link only pairs at or above AUTO_LINK_SIM_THRESHOLD (0.3), weight the edge by the actual similarity, and stop at max_links. Empty-tag free-form entities (e.g. conversation) stay unlinked by design, matching their exclusion from the recall surface (#298/#302). Adds test cohere_links_only_similar_pairs (a similar pair links; an unrelated entity does not); the existing cohere_auto_links_batched_same_source test still passes. Note: the issue's "graph is always empty" evidence was a mimir_cohere dry-run artifact (dry_run returns all zeros without computing anything). Unifying the three promote/decay regimes (recall vs cohere vs decay_tick) is left as a separate, larger refactor. Closes #300. Co-authored-by: tcconnally <hermes@perseus.observer> Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
1 parent 624ecf2 commit 19f9f38

1 file changed

Lines changed: 78 additions & 6 deletions

File tree

src/db.rs

Lines changed: 78 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -917,6 +917,12 @@ impl Database {
917917
/// forgetting curve. Well above the 0.05 archive threshold.
918918
const VERIFIED_DECAY_FLOOR: f64 = 0.2;
919919

920+
/// Minimum trigram similarity for `cohere` to auto-link two same-category
921+
/// entities (#300). Below this the pair is not meaningfully related, so
922+
/// linking it would just add graph noise. Same dependency-free measure used
923+
/// for dedup / conflict detection.
924+
const AUTO_LINK_SIM_THRESHOLD: f64 = 0.3;
925+
920926
/// Compute Ebbinghaus decay score based on time since last access.
921927
/// decay = e^(-elapsed_ms / half_life_ms)
922928
/// Returns value in [0.0, 1.0] where 1.0 = just accessed.
@@ -4084,38 +4090,58 @@ last_accessed: {}
40844090
// instead of calling link() (≈4 queries each) per pair inside this write
40854091
// transaction (#209). Accumulating per e1 also keeps multiple links to
40864092
// the same source correct (the old code re-read links fresh each call).
4093+
// #300: auto-link same-category pairs, but gate on real content
4094+
// similarity so we create *meaningful* edges instead of stamping a
4095+
// blanket "auto-related" on every same-category pair (which made
4096+
// `mimir_traverse` noisy). Over-fetch a bounded candidate pool, score
4097+
// each pair by trigram similarity (the dependency-free measure already
4098+
// used for dedup/conflict detection), and link only pairs at or above
4099+
// AUTO_LINK_SIM_THRESHOLD — weighting the edge by the actual similarity —
4100+
// until max_links is reached. Empty-tag free-form entities (e.g.
4101+
// conversation) stay unlinked by design, matching their exclusion from
4102+
// the recall surface (#298/#302).
40874103
let max_links = params.max_links.clamp(0, 100) as i64;
4104+
let candidate_budget = max_links.saturating_mul(50).clamp(0, 5000);
40884105
let mut pending: std::collections::HashMap<String, Vec<MemoryLink>> =
40894106
std::collections::HashMap::new();
40904107
{
40914108
let mut stmt = conn.prepare(
4092-
"SELECT e1.id, e1.links, e2.id as e2_id
4109+
"SELECT e1.id, e1.links, e2.id as e2_id, e1.body_json, e2.body_json
40934110
FROM entities e1
40944111
JOIN entities e2 ON e1.category = e2.category AND e1.id < e2.id
40954112
WHERE e1.archived = 0 AND e2.archived = 0
40964113
AND e1.tags != '[]' AND e2.tags != '[]'
40974114
LIMIT ?1",
40984115
)?;
4099-
let rows = stmt.query_map(params![max_links], |row| {
4116+
let rows = stmt.query_map(params![candidate_budget], |row| {
41004117
Ok((
41014118
row.get::<_, String>(0)?,
41024119
row.get::<_, String>(1)?,
41034120
row.get::<_, String>(2)?,
4121+
row.get::<_, String>(3)?,
4122+
row.get::<_, String>(4)?,
41044123
))
41054124
})?;
4106-
for row in rows {
4107-
let (e1_id, e1_links_json, e2_id) = row?;
4125+
'link: for row in rows {
4126+
let (e1_id, e1_links_json, e2_id, body1, body2) = row?;
4127+
let sim = Self::trigram_similarity(&body1, &body2);
4128+
if sim < Self::AUTO_LINK_SIM_THRESHOLD {
4129+
continue;
4130+
}
41084131
let entry = pending
41094132
.entry(e1_id)
41104133
.or_insert_with(|| serde_json::from_str(&e1_links_json).unwrap_or_default());
41114134
if !entry.iter().any(|l| l.target_id == e2_id) {
41124135
entry.push(MemoryLink {
41134136
target_id: e2_id,
41144137
relationship: "auto-related".to_string(),
4115-
weight: 0.5,
4138+
weight: sim,
41164139
});
4140+
linked += 1;
4141+
if linked >= max_links {
4142+
break 'link;
4143+
}
41174144
}
4118-
linked += 1;
41194145
}
41204146
}
41214147

@@ -5975,6 +6001,52 @@ mod tests {
59756001
let _ = fs::remove_file(&path);
59766002
}
59776003

6004+
#[test]
6005+
fn cohere_links_only_similar_pairs() {
6006+
// #300: auto-link must gate on content similarity, not link every
6007+
// same-category pair. A similar pair links; an unrelated same-category
6008+
// entity must not. Insert directly (bypassing remember()'s 0.7 dedup) so
6009+
// a clearly-similar pair can coexist alongside an unrelated entity.
6010+
let (db, path) = temp_db();
6011+
let ins = |id: &str, key: &str, body: &str| {
6012+
db.conn().unwrap().execute(
6013+
"INSERT INTO entities (id, category, key, body_json, status, type, tags, \
6014+
decay_score, retrieval_count, layer, topic_path, archived, archive_reason, \
6015+
links, verified, source, created_at_unix_ms, last_accessed_unix_ms) \
6016+
VALUES (?1, 'project', ?2, ?3, 'active', 'insight', '[\"x\"]', 1.0, 0, \
6017+
'working', '', 0, '', '[]', 0, 'agent', 0, 0)",
6018+
params![id, key, body],
6019+
).unwrap();
6020+
};
6021+
ins("la", "alpha", r#"{"note":"the payment service database migration plan for the Q3 rollout"}"#);
6022+
ins("lb", "beta", r#"{"note":"the payment service database migration plan for the Q4 rollout"}"#);
6023+
ins("lc", "gamma", r#"{"note":"quarterly all-hands meeting notes and the cafeteria lunch menu"}"#);
6024+
6025+
let params = crate::models::CohereParams {
6026+
dry_run: false,
6027+
max_links: 100,
6028+
promote_threshold: 0,
6029+
archive_threshold: 0.0,
6030+
};
6031+
db.cohere(&params).unwrap();
6032+
6033+
let la = db.get_entity("project", "alpha").unwrap().unwrap();
6034+
let la_targets: Vec<String> = la.links.iter().map(|l| l.target_id.clone()).collect();
6035+
assert!(
6036+
la_targets.contains(&"lb".to_string()),
6037+
"similar pair alpha->beta must link, got {la_targets:?}"
6038+
);
6039+
assert!(
6040+
!la_targets.contains(&"lc".to_string()),
6041+
"dissimilar gamma must NOT be linked to alpha, got {la_targets:?}"
6042+
);
6043+
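// gamma is unrelated to everything → no links at all. NOTE(review): the auto-link query pairs e1.id < e2.id and accumulates links under e1, so "lc" (the highest id here) is presumably never a link source and this check passes regardless of similarity; consider also asserting that "lb" has no link to "lc" — confirm against the write-back code.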
6044+
let lc = db.get_entity("project", "gamma").unwrap().unwrap();
6045+
assert!(lc.links.is_empty(), "unrelated gamma must have no links, got {:?}", lc.links);
6046+
6047+
let _ = fs::remove_file(&path);
6048+
}
6049+
59786050
#[test]
59796051
fn journal_and_timeline() {
59806052
let (db, path) = temp_db();

0 commit comments

Comments
 (0)