Skip to content

Commit 2446210

Browse files
authored
Merge branch 'main' into fix/353-migration-race
2 parents 56baa84 + 6d92689 commit 2446210

1 file changed

Lines changed: 92 additions & 2 deletions

File tree

src/schema.rs

Lines changed: 92 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -355,6 +355,21 @@ pub fn entity_count(conn: &Connection) -> Result<i64, Box<dyn std::error::Error>
355355
Ok(count)
356356
}
357357

358+
/// Truncate `s` to at most `max_bytes` bytes without splitting a UTF-8
359+
/// character. `&s[..n]` panics when `n` is not a char boundary, so walk the
360+
/// cut point back to the nearest boundary instead (stable-Rust equivalent of
361+
/// the nightly `floor_char_boundary`). (#352)
362+
fn truncate_at_char_boundary(s: &str, max_bytes: usize) -> &str {
363+
if s.len() <= max_bytes {
364+
return s;
365+
}
366+
let mut end = max_bytes;
367+
while end > 0 && !s.is_char_boundary(end) {
368+
end -= 1;
369+
}
370+
&s[..end]
371+
}
372+
358373
/// Migrate from v0.1.x schema to v0.2.0.
359374
///
360375
/// Opens the old DB, reads all memories, writes them as entities into the new schema,
@@ -455,9 +470,12 @@ pub fn migrate_from_v0_1(
455470
}
456471
let tags_json = serde_json::to_string(&tags_value).unwrap_or_else(|_| "[]".to_string());
457472

458-
// Category and key: derive from type + truncated id
473+
// Category and key: derive from type + truncated id. Truncation must
474+
// respect char boundaries: legacy v0.1 ids were written by external
475+
// systems and may contain multi-byte UTF-8 — a raw byte slice panics
476+
// if byte 20 falls inside a char, aborting the whole migration (#352).
459477
let category = "general".to_string();
460-
let key = format!("migrated-{}", &id[..id.len().min(20)]);
478+
let key = format!("migrated-{}", truncate_at_char_boundary(&id, 20));
461479

462480
let verified_int = if verified != 0 { 1 } else { 0 };
463481

@@ -1038,6 +1056,78 @@ mod tests {
10381056
let _ = std::fs::remove_file(&old_path);
10391057
}
10401058

1059+
#[test]
1060+
fn truncate_at_char_boundary_never_splits_chars() {
1061+
// Shorter than the limit: unchanged.
1062+
assert_eq!(truncate_at_char_boundary("abc", 20), "abc");
1063+
// Exactly at the limit: unchanged.
1064+
assert_eq!(truncate_at_char_boundary("a".repeat(20).as_str(), 20), "a".repeat(20));
1065+
// ASCII over the limit: plain byte cut.
1066+
assert_eq!(truncate_at_char_boundary("abcdef", 3), "abc");
1067+
// Multi-byte char straddling the cut point: back up to the boundary.
1068+
// "é" is 2 bytes; cutting "aé" at byte 2 lands mid-char.
1069+
assert_eq!(truncate_at_char_boundary("aéz", 2), "a");
1070+
// 4-byte char straddling the cut.
1071+
assert_eq!(truncate_at_char_boundary("ab😀z", 4), "ab");
1072+
// Degenerate limit 0.
1073+
assert_eq!(truncate_at_char_boundary("é", 0), "");
1074+
}
1075+
1076+
#[test]
1077+
fn migration_from_v0_1_handles_multibyte_id_without_panic() {
1078+
// #352: a legacy id whose multi-byte UTF-8 char straddles byte offset
1079+
// 20 used to panic in the byte-index slice building `key`, aborting
1080+
// the whole one-time migration. The char at bytes 19..21 ("é") is the
1081+
// exact repro from the issue.
1082+
let (old_conn, old_path) = temp_db();
1083+
old_conn
1084+
.execute_batch(
1085+
"CREATE TABLE memories (
1086+
id TEXT PRIMARY KEY, content TEXT NOT NULL,
1087+
type TEXT DEFAULT 'insight', summary TEXT DEFAULT '',
1088+
relevance REAL DEFAULT 0.0, decay_score REAL DEFAULT 1.0,
1089+
retrieval_count INTEGER DEFAULT 0, layer TEXT DEFAULT 'working',
1090+
topic_path TEXT DEFAULT '', created_at_unix_ms INTEGER NOT NULL,
1091+
last_accessed_unix_ms INTEGER NOT NULL, workspace_hash TEXT DEFAULT '',
1092+
tags TEXT DEFAULT '{}', links TEXT DEFAULT '[]', source TEXT DEFAULT 'mimir',
1093+
verified INTEGER DEFAULT 0
1094+
);",
1095+
)
1096+
.expect("create v0.1 schema");
1097+
1098+
// 19 ASCII bytes, then a 2-byte char occupying bytes 19..21.
1099+
let evil_id = format!("{}é-tail", "x".repeat(19));
1100+
assert!(!evil_id.is_char_boundary(20), "precondition: byte 20 is mid-char");
1101+
let now = now_ms();
1102+
old_conn
1103+
.execute(
1104+
"INSERT INTO memories (id, content, type, created_at_unix_ms, last_accessed_unix_ms)
1105+
VALUES (?1, ?2, ?3, ?4, ?5)",
1106+
params![evil_id, "Unicode id content", "insight", now, now],
1107+
)
1108+
.expect("insert multibyte-id memory");
1109+
drop(old_conn);
1110+
1111+
let (new_conn, _new_path) = temp_db();
1112+
let report = migrate_from_v0_1(&old_path, &new_conn).expect("migrate must not panic");
1113+
1114+
assert_eq!(report.total_old_memories, 1);
1115+
assert_eq!(report.entities_created, 1);
1116+
assert!(report.errors.is_empty(), "errors: {:?}", report.errors);
1117+
1118+
let key: String = new_conn
1119+
.query_row(
1120+
"SELECT key FROM entities WHERE id = ?1",
1121+
params![evil_id],
1122+
|r| r.get(0),
1123+
)
1124+
.unwrap();
1125+
// Boundary walked back from 20 to 19: the é is dropped, not split.
1126+
assert_eq!(key, format!("migrated-{}", "x".repeat(19)));
1127+
1128+
let _ = std::fs::remove_file(&old_path);
1129+
}
1130+
10411131
#[test]
10421132
fn gather_stats_returns_expected_shape() {
10431133
let (conn, path) = temp_db();

0 commit comments

Comments
 (0)