Skip to content

Commit ee65a39

Browse files
authored
Merge branch 'main' into fix/354-grpc-ci-and-hardening
2 parents 244b7fd + 6d92689 commit ee65a39

1 file changed

Lines changed: 92 additions & 2 deletions

File tree

src/schema.rs

Lines changed: 92 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -357,6 +357,21 @@ pub fn entity_count(conn: &Connection) -> Result<i64, Box<dyn std::error::Error>
357357
Ok(count)
358358
}
359359

360+
/// Returns the longest prefix of `s` that is at most `max_bytes` bytes long
/// and ends on a UTF-8 character boundary.
///
/// Slicing a `str` at a byte offset that is not a char boundary panics, so
/// when `max_bytes` lands inside a multi-byte character the cut point is
/// moved back to the start of that character. This is a hand-rolled
/// counterpart to `str::floor_char_boundary` for toolchains where that method
/// is not available. (#352)
///
/// * If `s` is already `max_bytes` bytes or shorter it is returned unchanged.
/// * `max_bytes == 0` always yields `""`.
fn truncate_at_char_boundary(s: &str, max_bytes: usize) -> &str {
    if s.len() <= max_bytes {
        return s;
    }
    // Search downwards from the requested cut point for the first offset that
    // is a char boundary. Offset 0 is always a boundary, so the search cannot
    // come up empty; `unwrap_or(0)` only keeps the expression panic-free.
    let end = (0..=max_bytes)
        .rev()
        .find(|&idx| s.is_char_boundary(idx))
        .unwrap_or(0);
    &s[..end]
}
374+
360375
/// Migrate from v0.1.x schema to v0.2.0.
361376
///
362377
/// Opens the old DB, reads all memories, writes them as entities into the new schema,
@@ -457,9 +472,12 @@ pub fn migrate_from_v0_1(
457472
}
458473
let tags_json = serde_json::to_string(&tags_value).unwrap_or_else(|_| "[]".to_string());
459474

460-
// Category and key: derive from type + truncated id
475+
// Category and key: derive from type + truncated id. Truncation must
476+
// respect char boundaries: legacy v0.1 ids were written by external
477+
// systems and may contain multi-byte UTF-8 — a raw byte slice panics
478+
// if byte 20 falls inside a char, aborting the whole migration (#352).
461479
let category = "general".to_string();
462-
let key = format!("migrated-{}", &id[..id.len().min(20)]);
480+
let key = format!("migrated-{}", truncate_at_char_boundary(&id, 20));
463481

464482
let verified_int = if verified != 0 { 1 } else { 0 };
465483

@@ -928,6 +946,78 @@ mod tests {
928946
let _ = std::fs::remove_file(&old_path);
929947
}
930948

949+
#[test]
fn truncate_at_char_boundary_never_splits_chars() {
    // Unit test for `truncate_at_char_boundary`: each row is
    // (input, max_bytes, expected prefix).
    let twenty_a = "a".repeat(20);
    let cases: [(&str, usize, &str); 9] = [
        // Shorter than the limit: unchanged.
        ("abc", 20, "abc"),
        // Exactly at the limit: unchanged.
        (twenty_a.as_str(), 20, twenty_a.as_str()),
        // Empty input: nothing to cut.
        ("", 0, ""),
        // ASCII over the limit: plain byte cut.
        ("abcdef", 3, "abc"),
        // Multi-byte char straddling the cut point: back up to the boundary.
        // "é" is 2 bytes (offsets 1..3), so cutting "aéz" at byte 2 lands mid-char.
        ("aéz", 2, "a"),
        // Cut landing exactly on the boundary after a multi-byte char keeps it.
        ("aéz", 3, "aé"),
        // 4-byte char (offsets 2..6) straddling the cut.
        ("ab😀z", 4, "ab"),
        // Cut exactly after the 4-byte char keeps it whole.
        ("ab😀z", 6, "ab😀"),
        // Degenerate limit 0.
        ("é", 0, ""),
    ];

    for &(input, max_bytes, expected) in cases.iter() {
        assert_eq!(
            truncate_at_char_boundary(input, max_bytes),
            expected,
            "input={:?} max_bytes={}",
            input,
            max_bytes
        );
    }
}
965+
966+
#[test]
fn migration_from_v0_1_handles_multibyte_id_without_panic() {
    // Regression test for #352: a legacy id whose multi-byte UTF-8 char
    // straddles byte offset 20 used to panic in the byte-index slice building
    // `key`, aborting the whole one-time migration. The char at bytes 19..21
    // ("é") is the exact repro from the issue.
    let (old_conn, old_path) = temp_db();
    old_conn
        .execute_batch(
            "CREATE TABLE memories (
                id TEXT PRIMARY KEY, content TEXT NOT NULL,
                type TEXT DEFAULT 'insight', summary TEXT DEFAULT '',
                relevance REAL DEFAULT 0.0, decay_score REAL DEFAULT 1.0,
                retrieval_count INTEGER DEFAULT 0, layer TEXT DEFAULT 'working',
                topic_path TEXT DEFAULT '', created_at_unix_ms INTEGER NOT NULL,
                last_accessed_unix_ms INTEGER NOT NULL, workspace_hash TEXT DEFAULT '',
                tags TEXT DEFAULT '{}', links TEXT DEFAULT '[]', source TEXT DEFAULT 'mimir',
                verified INTEGER DEFAULT 0
            );",
        )
        .expect("create v0.1 schema");

    // 19 ASCII bytes, then a 2-byte char occupying bytes 19..21.
    let evil_id = format!("{}é-tail", "x".repeat(19));
    assert!(!evil_id.is_char_boundary(20), "precondition: byte 20 is mid-char");
    let now = now_ms();
    old_conn
        .execute(
            "INSERT INTO memories (id, content, type, created_at_unix_ms, last_accessed_unix_ms)
             VALUES (?1, ?2, ?3, ?4, ?5)",
            params![evil_id, "Unicode id content", "insight", now, now],
        )
        .expect("insert multibyte-id memory");
    // Release our handle before the migration opens the old DB by path.
    drop(old_conn);

    let (new_conn, new_path) = temp_db();
    let report = migrate_from_v0_1(&old_path, &new_conn).expect("migrate must not panic");

    assert_eq!(report.total_old_memories, 1);
    assert_eq!(report.entities_created, 1);
    assert!(report.errors.is_empty(), "errors: {:?}", report.errors);

    let key: String = new_conn
        .query_row(
            "SELECT key FROM entities WHERE id = ?1",
            params![evil_id],
            |r| r.get(0),
        )
        .unwrap();
    // Boundary walked back from 20 to 19: the é is dropped, not split.
    assert_eq!(key, format!("migrated-{}", "x".repeat(19)));

    // Best-effort cleanup of both temp databases; the new one was previously
    // left behind. Close the connection first so the file is not held open.
    drop(new_conn);
    let _ = std::fs::remove_file(&old_path);
    let _ = std::fs::remove_file(&new_path);
}
1020+
9311021
#[test]
9321022
fn gather_stats_returns_expected_shape() {
9331023
let (conn, path) = temp_db();

0 commit comments

Comments
 (0)