@@ -357,6 +357,21 @@ pub fn entity_count(conn: &Connection) -> Result<i64, Box<dyn std::error::Error>
357357 Ok ( count)
358358}
359359
/// Returns the longest prefix of `s` that is no more than `max_bytes` bytes
/// long and ends on a UTF-8 character boundary.
///
/// Slicing a `str` at a byte offset that falls inside a multi-byte character
/// panics, so when `max_bytes` lands mid-character the cut point is moved back
/// to the closest boundary below it. This gives the behaviour of
/// `str::floor_char_boundary` without depending on a toolchain that ships it.
/// (#352)
fn truncate_at_char_boundary(s: &str, max_bytes: usize) -> &str {
    // Already short enough: hand the input back untouched.
    if max_bytes >= s.len() {
        return s;
    }
    // Scan downwards from the requested limit for the first valid boundary.
    // Offset 0 is always a boundary, so the search always succeeds; the
    // fallback merely keeps the expression total.
    let cut = (0..=max_bytes)
        .rev()
        .find(|&idx| s.is_char_boundary(idx))
        .unwrap_or(0);
    &s[..cut]
}
374+
360375/// Migrate from v0.1.x schema to v0.2.0.
361376///
362377/// Opens the old DB, reads all memories, writes them as entities into the new schema,
@@ -457,9 +472,12 @@ pub fn migrate_from_v0_1(
457472 }
458473 let tags_json = serde_json:: to_string ( & tags_value) . unwrap_or_else ( |_| "[]" . to_string ( ) ) ;
459474
460- // Category and key: derive from type + truncated id
475+ // Category and key: derive from type + truncated id. Truncation must
476+ // respect char boundaries: legacy v0.1 ids were written by external
477+ // systems and may contain multi-byte UTF-8 — a raw byte slice panics
478+ // if byte 20 falls inside a char, aborting the whole migration (#352).
461479 let category = "general" . to_string ( ) ;
462- let key = format ! ( "migrated-{}" , & id[ ..id . len ( ) . min ( 20 ) ] ) ;
480+ let key = format ! ( "migrated-{}" , truncate_at_char_boundary ( & id, 20 ) ) ;
463481
464482 let verified_int = if verified != 0 { 1 } else { 0 } ;
465483
@@ -928,6 +946,78 @@ mod tests {
928946 let _ = std:: fs:: remove_file ( & old_path) ;
929947 }
930948
#[test]
fn truncate_at_char_boundary_never_splits_chars() {
    let twenty_a = "a".repeat(20);

    // (input, byte limit, expected prefix)
    let cases: [(&str, usize, &str); 6] = [
        // Shorter than the limit: returned as-is.
        ("abc", 20, "abc"),
        // Length equal to the limit: returned as-is.
        (twenty_a.as_str(), 20, twenty_a.as_str()),
        // ASCII longer than the limit: cut at the exact byte.
        ("abcdef", 3, "abc"),
        // "é" occupies bytes 1..3 of "aéz", so a limit of 2 lands inside it
        // and the cut must retreat to byte 1.
        ("aéz", 2, "a"),
        // The 4-byte emoji occupies bytes 2..6; a limit of 4 lands inside it.
        ("ab😀z", 4, "ab"),
        // A zero limit yields the empty prefix.
        ("é", 0, ""),
    ];

    for &(input, limit, expected) in &cases {
        assert_eq!(
            truncate_at_char_boundary(input, limit),
            expected,
            "input {:?} with limit {}",
            input,
            limit
        );
    }
}
965+
#[test]
fn migration_from_v0_1_handles_multibyte_id_without_panic() {
    // Regression test for #352: a legacy id whose multi-byte UTF-8 char
    // straddles byte offset 20 used to panic in the byte-index slice that
    // builds `key`, aborting the whole one-time migration. The "é" at bytes
    // 19..21 is the exact repro from the issue.
    let (old_conn, old_path) = temp_db();
    old_conn
        .execute_batch(
            "CREATE TABLE memories (
                id TEXT PRIMARY KEY, content TEXT NOT NULL,
                type TEXT DEFAULT 'insight', summary TEXT DEFAULT '',
                relevance REAL DEFAULT 0.0, decay_score REAL DEFAULT 1.0,
                retrieval_count INTEGER DEFAULT 0, layer TEXT DEFAULT 'working',
                topic_path TEXT DEFAULT '', created_at_unix_ms INTEGER NOT NULL,
                last_accessed_unix_ms INTEGER NOT NULL, workspace_hash TEXT DEFAULT '',
                tags TEXT DEFAULT '{}', links TEXT DEFAULT '[]', source TEXT DEFAULT 'mimir',
                verified INTEGER DEFAULT 0
            );",
        )
        .expect("create v0.1 schema");

    // 19 ASCII bytes, then a 2-byte char occupying bytes 19..21.
    let evil_id = format!("{}é-tail", "x".repeat(19));
    assert!(!evil_id.is_char_boundary(20), "precondition: byte 20 is mid-char");
    let now = now_ms();
    old_conn
        .execute(
            "INSERT INTO memories (id, content, type, created_at_unix_ms, last_accessed_unix_ms)
             VALUES (?1, ?2, ?3, ?4, ?5)",
            params![evil_id, "Unicode id content", "insight", now, now],
        )
        .expect("insert multibyte-id memory");
    // Close the legacy connection before the migration opens the file itself.
    drop(old_conn);

    let (new_conn, new_path) = temp_db();
    let report = migrate_from_v0_1(&old_path, &new_conn).expect("migrate must not panic");

    assert_eq!(report.total_old_memories, 1);
    assert_eq!(report.entities_created, 1);
    assert!(report.errors.is_empty(), "errors: {:?}", report.errors);

    // The test looks the migrated row up by the legacy id.
    let key: String = new_conn
        .query_row(
            "SELECT key FROM entities WHERE id = ?1",
            params![evil_id],
            |r| r.get(0),
        )
        .expect("migrated entity row should be present");
    // Boundary walked back from 20 to 19: the é is dropped, not split.
    assert_eq!(key, format!("migrated-{}", "x".repeat(19)));

    // Best-effort cleanup of BOTH temp databases. The new database used to be
    // bound to `_new_path` and left behind on disk after every run. Close the
    // connection first so the file is no longer held open when it is removed.
    drop(new_conn);
    let _ = std::fs::remove_file(&old_path);
    let _ = std::fs::remove_file(&new_path);
}
1020+
9311021 #[ test]
9321022 fn gather_stats_returns_expected_shape ( ) {
9331023 let ( conn, path) = temp_db ( ) ;
0 commit comments