@@ -917,6 +917,12 @@ impl Database {
917917 /// forgetting curve. Well above the 0.05 archive threshold.
918918 const VERIFIED_DECAY_FLOOR : f64 = 0.2 ;
919919
920+ /// Minimum trigram similarity at which `cohere` auto-links two same-category
921+ /// entities (#300). The bound is inclusive: only pairs scoring strictly below
922+ /// it are skipped as graph noise; a linked pair's score becomes its edge weight.
923+ /// Same dependency-free measure used for dedup / conflict detection.
924+ const AUTO_LINK_SIM_THRESHOLD : f64 = 0.3 ;
925+
920926 /// Compute Ebbinghaus decay score based on time since last access.
921927 /// decay = e^(-elapsed_ms / half_life_ms)
922928 /// Returns value in [0.0, 1.0] where 1.0 = just accessed.
@@ -4084,38 +4090,58 @@ last_accessed: {}
40844090 // instead of calling link() (≈4 queries each) per pair inside this write
40854091 // transaction (#209). Accumulating per e1 also keeps multiple links to
40864092 // the same source correct (the old code re-read links fresh each call).
4093+ // #300: auto-link same-category pairs, but gate on real content
4094+ // similarity so we create *meaningful* edges instead of stamping a
4095+ // blanket "auto-related" on every same-category pair (which made
4096+ // `mimir_traverse` noise). Over-fetch a bounded candidate pool, score
4097+ // each pair by trigram similarity (the dependency-free measure already
4098+ // used for dedup/conflict detection), and link only pairs at or above
4099+ // AUTO_LINK_SIM_THRESHOLD — weighting the edge by the actual similarity —
4100+ // until max_links is reached. Empty-tag free-form entities (e.g.
4101+ // conversation) stay unlinked by design, matching their exclusion from
4102+ // the recall surface (#298/#302).
40874103 let max_links = params. max_links . clamp ( 0 , 100 ) as i64 ;
4104+ let candidate_budget = max_links. saturating_mul ( 50 ) . clamp ( 0 , 5000 ) ;
40884105 let mut pending: std:: collections:: HashMap < String , Vec < MemoryLink > > =
40894106 std:: collections:: HashMap :: new ( ) ;
40904107 {
40914108 let mut stmt = conn. prepare (
4092- "SELECT e1.id, e1.links, e2.id as e2_id
4109+ "SELECT e1.id, e1.links, e2.id as e2_id, e1.body_json, e2.body_json
40934110 FROM entities e1
40944111 JOIN entities e2 ON e1.category = e2.category AND e1.id < e2.id
40954112 WHERE e1.archived = 0 AND e2.archived = 0
40964113 AND e1.tags != '[]' AND e2.tags != '[]'
40974114 LIMIT ?1" ,
40984115 ) ?;
4099- let rows = stmt. query_map ( params ! [ max_links ] , |row| {
4116+ let rows = stmt. query_map ( params ! [ candidate_budget ] , |row| {
41004117 Ok ( (
41014118 row. get :: < _ , String > ( 0 ) ?,
41024119 row. get :: < _ , String > ( 1 ) ?,
41034120 row. get :: < _ , String > ( 2 ) ?,
4121+ row. get :: < _ , String > ( 3 ) ?,
4122+ row. get :: < _ , String > ( 4 ) ?,
41044123 ) )
41054124 } ) ?;
4106- for row in rows {
4107- let ( e1_id, e1_links_json, e2_id) = row?;
4125+ ' link: for row in rows {
4126+ let ( e1_id, e1_links_json, e2_id, body1, body2) = row?;
4127+ let sim = Self :: trigram_similarity ( & body1, & body2) ;
4128+ if sim < Self :: AUTO_LINK_SIM_THRESHOLD {
4129+ continue ;
4130+ }
41084131 let entry = pending
41094132 . entry ( e1_id)
41104133 . or_insert_with ( || serde_json:: from_str ( & e1_links_json) . unwrap_or_default ( ) ) ;
41114134 if !entry. iter ( ) . any ( |l| l. target_id == e2_id) {
41124135 entry. push ( MemoryLink {
41134136 target_id : e2_id,
41144137 relationship : "auto-related" . to_string ( ) ,
4115- weight : 0.5 ,
4138+ weight : sim ,
41164139 } ) ;
4140+ linked += 1 ;
4141+ if linked >= max_links {
4142+ break ' link;
4143+ }
41174144 }
4118- linked += 1 ;
41194145 }
41204146 }
41214147
@@ -5975,6 +6001,52 @@ mod tests {
59756001 let _ = fs:: remove_file ( & path) ;
59766002 }
59776003
#[test]
fn cohere_links_only_similar_pairs() {
    // #300: auto-link must gate on content similarity, not link every
    // same-category pair. A similar pair links; an unrelated same-category
    // entity must not. Insert directly (bypassing remember()'s 0.7 dedup) so
    // a clearly-similar pair can coexist alongside an unrelated entity.
    let (db, path) = temp_db();
    let ins = |id: &str, key: &str, body: &str| {
        db.conn()
            .unwrap()
            .execute(
                "INSERT INTO entities (id, category, key, body_json, status, type, tags, \
                 decay_score, retrieval_count, layer, topic_path, archived, archive_reason, \
                 links, verified, source, created_at_unix_ms, last_accessed_unix_ms) \
                 VALUES (?1, 'project', ?2, ?3, 'active', 'insight', '[\" x\" ]', 1.0, 0, \
                 'working', '', 0, '', '[]', 0, 'agent', 0, 0)",
                params![id, key, body],
            )
            .unwrap();
    };
    // Ids sort la < lb < lc. The auto-link query pairs rows with e1.id < e2.id
    // and records each link on the e1 side, so every candidate edge pointing
    // at gamma ("lc") would be stored on alpha or beta, never on gamma itself.
    ins("la", "alpha", r#"{"note":"the payment service database migration plan for the Q3 rollout"}"#);
    ins("lb", "beta", r#"{"note":"the payment service database migration plan for the Q4 rollout"}"#);
    ins("lc", "gamma", r#"{"note":"quarterly all-hands meeting notes and the cafeteria lunch menu"}"#);

    let params = crate::models::CohereParams {
        dry_run: false,
        max_links: 100,
        promote_threshold: 0,
        archive_threshold: 0.0,
    };
    db.cohere(&params).unwrap();

    let la = db.get_entity("project", "alpha").unwrap().unwrap();
    let la_targets: Vec<String> = la.links.iter().map(|l| l.target_id.clone()).collect();
    assert!(
        la_targets.contains(&"lb".to_string()),
        "similar pair alpha->beta must link, got {la_targets:?}"
    );
    assert!(
        !la_targets.contains(&"lc".to_string()),
        "dissimilar gamma must NOT be linked to alpha, got {la_targets:?}"
    );

    // The edge must carry the measured similarity rather than a blanket
    // constant. 0.3 mirrors Database::AUTO_LINK_SIM_THRESHOLD, the minimum
    // score at which a pair is linked at all.
    let alpha_to_beta = la
        .links
        .iter()
        .find(|l| l.target_id == "lb")
        .expect("alpha->beta link was asserted present above");
    assert_eq!(alpha_to_beta.relationship, "auto-related");
    assert!(
        alpha_to_beta.weight >= 0.3,
        "auto-link weight must be the pair's similarity (>= threshold), got {:?}",
        alpha_to_beta.weight
    );

    // beta is the other possible source of an edge into gamma; without this
    // check a wrongly-linked beta->gamma pair would go unnoticed.
    let lb = db.get_entity("project", "beta").unwrap().unwrap();
    let lb_targets: Vec<String> = lb.links.iter().map(|l| l.target_id.clone()).collect();
    assert!(
        !lb_targets.contains(&"lc".to_string()),
        "dissimilar gamma must NOT be linked to beta, got {lb_targets:?}"
    );

    // gamma is unrelated to everything → no links at all.
    let lc = db.get_entity("project", "gamma").unwrap().unwrap();
    assert!(lc.links.is_empty(), "unrelated gamma must have no links, got {:?}", lc.links);

    let _ = fs::remove_file(&path);
}
6049+
59786050 #[ test]
59796051 fn journal_and_timeline ( ) {
59806052 let ( db, path) = temp_db ( ) ;
0 commit comments