@@ -355,6 +355,21 @@ pub fn entity_count(conn: &Connection) -> Result<i64, Box<dyn std::error::Error>
355355 Ok ( count)
356356}
357357
358+ /// Truncate `s` to at most `max_bytes` bytes without splitting a UTF-8
359+ /// character. `&s[..n]` panics when `n` is not a char boundary, so walk the
360+ /// cut point back to the nearest boundary instead (stable-Rust equivalent of
361+ /// the nightly `floor_char_boundary`). (#352)
362+ fn truncate_at_char_boundary ( s : & str , max_bytes : usize ) -> & str {
363+ if s. len ( ) <= max_bytes {
364+ return s;
365+ }
366+ let mut end = max_bytes;
367+ while end > 0 && !s. is_char_boundary ( end) {
368+ end -= 1 ;
369+ }
370+ & s[ ..end]
371+ }
372+
358373/// Migrate from v0.1.x schema to v0.2.0.
359374///
360375/// Opens the old DB, reads all memories, writes them as entities into the new schema,
@@ -455,9 +470,12 @@ pub fn migrate_from_v0_1(
455470 }
456471 let tags_json = serde_json:: to_string ( & tags_value) . unwrap_or_else ( |_| "[]" . to_string ( ) ) ;
457472
458- // Category and key: derive from type + truncated id
473+ // Category and key: derive from type + truncated id. Truncation must
474+ // respect char boundaries: legacy v0.1 ids were written by external
475+ // systems and may contain multi-byte UTF-8 — a raw byte slice panics
476+ // if byte 20 falls inside a char, aborting the whole migration (#352).
459477 let category = "general" . to_string ( ) ;
460- let key = format ! ( "migrated-{}" , & id[ ..id . len ( ) . min ( 20 ) ] ) ;
478+ let key = format ! ( "migrated-{}" , truncate_at_char_boundary ( & id, 20 ) ) ;
461479
462480 let verified_int = if verified != 0 { 1 } else { 0 } ;
463481
@@ -1038,6 +1056,78 @@ mod tests {
10381056 let _ = std:: fs:: remove_file ( & old_path) ;
10391057 }
10401058
1059+ #[ test]
1060+ fn truncate_at_char_boundary_never_splits_chars ( ) {
1061+ // Shorter than the limit: unchanged.
1062+ assert_eq ! ( truncate_at_char_boundary( "abc" , 20 ) , "abc" ) ;
1063+ // Exactly at the limit: unchanged.
1064+ assert_eq ! ( truncate_at_char_boundary( "a" . repeat( 20 ) . as_str( ) , 20 ) , "a" . repeat( 20 ) ) ;
1065+ // ASCII over the limit: plain byte cut.
1066+ assert_eq ! ( truncate_at_char_boundary( "abcdef" , 3 ) , "abc" ) ;
1067+ // Multi-byte char straddling the cut point: back up to the boundary.
1068+ // "é" is 2 bytes; cutting "aé" at byte 2 lands mid-char.
1069+ assert_eq ! ( truncate_at_char_boundary( "aéz" , 2 ) , "a" ) ;
1070+ // 4-byte char straddling the cut.
1071+ assert_eq ! ( truncate_at_char_boundary( "ab😀z" , 4 ) , "ab" ) ;
1072+ // Degenerate limit 0.
1073+ assert_eq ! ( truncate_at_char_boundary( "é" , 0 ) , "" ) ;
1074+ }
1075+
1076+ #[ test]
1077+ fn migration_from_v0_1_handles_multibyte_id_without_panic ( ) {
1078+ // #352: a legacy id whose multi-byte UTF-8 char straddles byte offset
1079+ // 20 used to panic in the byte-index slice building `key`, aborting
1080+ // the whole one-time migration. The char at bytes 19..21 ("é") is the
1081+ // exact repro from the issue.
1082+ let ( old_conn, old_path) = temp_db ( ) ;
1083+ old_conn
1084+ . execute_batch (
1085+ "CREATE TABLE memories (
1086+ id TEXT PRIMARY KEY, content TEXT NOT NULL,
1087+ type TEXT DEFAULT 'insight', summary TEXT DEFAULT '',
1088+ relevance REAL DEFAULT 0.0, decay_score REAL DEFAULT 1.0,
1089+ retrieval_count INTEGER DEFAULT 0, layer TEXT DEFAULT 'working',
1090+ topic_path TEXT DEFAULT '', created_at_unix_ms INTEGER NOT NULL,
1091+ last_accessed_unix_ms INTEGER NOT NULL, workspace_hash TEXT DEFAULT '',
1092+ tags TEXT DEFAULT '{}', links TEXT DEFAULT '[]', source TEXT DEFAULT 'mimir',
1093+ verified INTEGER DEFAULT 0
1094+ );" ,
1095+ )
1096+ . expect ( "create v0.1 schema" ) ;
1097+
1098+ // 19 ASCII bytes, then a 2-byte char occupying bytes 19..21.
1099+ let evil_id = format ! ( "{}é-tail" , "x" . repeat( 19 ) ) ;
1100+ assert ! ( !evil_id. is_char_boundary( 20 ) , "precondition: byte 20 is mid-char" ) ;
1101+ let now = now_ms ( ) ;
1102+ old_conn
1103+ . execute (
1104+ "INSERT INTO memories (id, content, type, created_at_unix_ms, last_accessed_unix_ms)
1105+ VALUES (?1, ?2, ?3, ?4, ?5)" ,
1106+ params ! [ evil_id, "Unicode id content" , "insight" , now, now] ,
1107+ )
1108+ . expect ( "insert multibyte-id memory" ) ;
1109+ drop ( old_conn) ;
1110+
1111+ let ( new_conn, _new_path) = temp_db ( ) ;
1112+ let report = migrate_from_v0_1 ( & old_path, & new_conn) . expect ( "migrate must not panic" ) ;
1113+
1114+ assert_eq ! ( report. total_old_memories, 1 ) ;
1115+ assert_eq ! ( report. entities_created, 1 ) ;
1116+ assert ! ( report. errors. is_empty( ) , "errors: {:?}" , report. errors) ;
1117+
1118+ let key: String = new_conn
1119+ . query_row (
1120+ "SELECT key FROM entities WHERE id = ?1" ,
1121+ params ! [ evil_id] ,
1122+ |r| r. get ( 0 ) ,
1123+ )
1124+ . unwrap ( ) ;
1125+ // Boundary walked back from 20 to 19: the é is dropped, not split.
1126+ assert_eq ! ( key, format!( "migrated-{}" , "x" . repeat( 19 ) ) ) ;
1127+
1128+ let _ = std:: fs:: remove_file ( & old_path) ;
1129+ }
1130+
10411131 #[ test]
10421132 fn gather_stats_returns_expected_shape ( ) {
10431133 let ( conn, path) = temp_db ( ) ;
0 commit comments