|
//! Local, deterministic knowledge extraction (#234).
//!
//! Parses raw memory text into structured items — facts, preferences, temporal
//! events, episodes — using pure, dependency-free heuristics. No cloud LLM, no
//! embedding/API call, no network: this preserves Mimir's air-gapped,
//! zero-dependency path (unlike GoodMem/Synap, which require a Gemini key).
//!
//! [`Extractor`] is the plugin point. [`NoopExtractor`] is the default (pure
//! storage, no extraction); [`RuleBasedExtractor`] is a concrete local
//! implementation. A future model-based extractor can slot in behind the same
//! trait without touching callers — keeping extraction strictly opt-in.
| 12 | +
|
| 13 | +use serde::Serialize; |
| 14 | + |
/// The structured kind of an extracted knowledge item.
///
/// Serialized by serde with `snake_case` variant names (for example
/// `temporal_event`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)]
#[serde(rename_all = "snake_case")]
pub enum ExtractKind {
    /// A declarative statement of fact ("X is Y", "the service uses Z").
    Fact,
    /// A first-person preference ("I prefer X", "my favorite is Y").
    Preference,
    /// A statement anchored to a date/time ("shipped on 2026-06-20", "met tuesday").
    TemporalEvent,
    /// A first-person experiential action ("we deployed the worker tier").
    Episode,
}
| 28 | + |
/// A single structured item extracted from raw memory text.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
pub struct ExtractedItem {
    /// The structured kind assigned to this item.
    pub kind: ExtractKind,
    /// The text of the item.
    pub text: String,
}
| 35 | + |
/// The extraction plugin point. Implementors turn raw text into structured items.
pub trait Extractor {
    /// Extracts zero or more structured items from `text`.
    fn extract(&self, text: &str) -> Vec<ExtractedItem>;
}
| 40 | + |
| 41 | +/// The default: no extraction (pure storage). Keeps the zero-dependency path intact. |
| 42 | +pub struct NoopExtractor; |
| 43 | + |
| 44 | +impl Extractor for NoopExtractor { |
| 45 | + fn extract(&self, _text: &str) -> Vec<ExtractedItem> { |
| 46 | + Vec::new() |
| 47 | + } |
| 48 | +} |
| 49 | + |
/// A concrete, fully-local, deterministic rule-based extractor.
///
/// Splits text into sentences and classifies each with a fixed priority:
/// temporal marker → [`ExtractKind::TemporalEvent`]; first-person preference cue →
/// [`ExtractKind::Preference`]; first-person past action → [`ExtractKind::Episode`];
/// declarative copula → [`ExtractKind::Fact`]; otherwise the sentence is skipped
/// (precision over recall). Identical items are de-duplicated, order preserved.
///
/// Derives the common traits so the type can be logged, copied, compared and
/// default-constructed like any other unit struct.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct RuleBasedExtractor;
| 58 | + |
/// Lowercase phrases that mark a sentence as a stated preference.
const PREFERENCE_CUES: &[&str] = &[
    "i prefer",
    "i like",
    "i love",
    "i hate",
    "i dislike",
    "i want",
    "i'd rather",
    "i would rather",
    "my favorite",
    "my favourite",
    "we prefer",
    "prefer to use",
];
| 63 | + |
/// First-person experiential actions (past or habitual) → episodes.
///
/// The singular ("i …") and plural ("we …") lists cover the same set of verbs,
/// so a sentence is classified the same way whichever pronoun it uses.
const EPISODE_CUES: &[&str] = &[
    // Singular.
    "i did", "i went", "i met", "i built", "i wrote", "i fixed", "i shipped",
    "i deployed", "i decided", "i finished", "i completed", "i added", "i removed",
    "i migrated", "i launched",
    // Plural.
    "we did", "we went", "we met", "we built", "we wrote", "we fixed", "we shipped",
    "we deployed", "we decided", "we finished", "we completed", "we added", "we removed",
    "we migrated", "we launched",
];
| 71 | + |
/// Lowercase month names followed by their abbreviations.
///
/// Each abbreviation carries a trailing space, which keeps a plain substring
/// search from matching it inside a longer word (`"jan "` vs `janitor`).
const MONTHS: &[&str] = &[
    // Full names.
    "january",
    "february",
    "march",
    "april",
    "may",
    "june",
    "july",
    "august",
    "september",
    "october",
    "november",
    "december",
    // Abbreviations.
    "jan ",
    "feb ",
    "mar ",
    "apr ",
    "jun ",
    "jul ",
    "aug ",
    "sep ",
    "sept ",
    "oct ",
    "nov ",
    "dec ",
];
| 78 | + |
/// Lowercase weekday names, Monday through Sunday.
const WEEKDAYS: &[&str] = &[
    "monday",
    "tuesday",
    "wednesday",
    "thursday",
    "friday",
    "saturday",
    "sunday",
];
| 82 | + |
/// Lowercase phrases that anchor a sentence relative to "now".
///
/// The last/this/next families are kept symmetric (week, month, year) and the
/// parts of the day are covered from morning through night.
const RELATIVE_TIME: &[&str] = &[
    "yesterday", "today", "tomorrow",
    "last week", "last month", "last year", "last night",
    "this week", "this month", "this year",
    "next week", "next month", "next year",
    "this morning", "this afternoon", "this evening", "tonight",
];
| 87 | + |
/// Declarative copulas / relations that mark a factual statement.
///
/// Every marker is padded with a space on both sides.
const FACT_MARKERS: &[&str] = &[
    " is ",
    " are ",
    " was ",
    " were ",
    " has ",
    " have ",
    " uses ",
    " runs on ",
    " consists of ",
    " supports ",
    " requires ",
    " depends on ",
    " stores ",
];
| 93 | + |
| 94 | +impl RuleBasedExtractor { |
| 95 | + /// True when the sentence carries an explicit date/time marker. |
| 96 | + fn has_temporal_marker(lower: &str) -> bool { |
| 97 | + if Self::contains_year(lower) { |
| 98 | + return true; |
| 99 | + } |
| 100 | + if Self::contains_clock_time(lower) { |
| 101 | + return true; |
| 102 | + } |
| 103 | + MONTHS.iter().any(|m| lower.contains(m)) |
| 104 | + || WEEKDAYS.iter().any(|d| lower.contains(d)) |
| 105 | + || RELATIVE_TIME.iter().any(|r| lower.contains(r)) |
| 106 | + } |
| 107 | + |
| 108 | + /// Detects a 4-digit year in 1900–2099 (the dominant date signal). |
| 109 | + fn contains_year(lower: &str) -> bool { |
| 110 | + let bytes = lower.as_bytes(); |
| 111 | + let mut i = 0; |
| 112 | + while i + 4 <= bytes.len() { |
| 113 | + // Must be a 4-digit run not bordered by other digits. |
| 114 | + let window = &bytes[i..i + 4]; |
| 115 | + let all_digits = window.iter().all(|b| b.is_ascii_digit()); |
| 116 | + let left_ok = i == 0 || !bytes[i - 1].is_ascii_digit(); |
| 117 | + let right_ok = i + 4 == bytes.len() || !bytes[i + 4].is_ascii_digit(); |
| 118 | + if all_digits && left_ok && right_ok { |
| 119 | + let yr = std::str::from_utf8(window).unwrap_or("0"); |
| 120 | + if matches!(&yr[0..2], "19" | "20") { |
| 121 | + return true; |
| 122 | + } |
| 123 | + } |
| 124 | + i += 1; |
| 125 | + } |
| 126 | + false |
| 127 | + } |
| 128 | + |
| 129 | + /// Detects a clock time like `14:30` or `9:05`. |
| 130 | + fn contains_clock_time(lower: &str) -> bool { |
| 131 | + let bytes = lower.as_bytes(); |
| 132 | + for (i, &b) in bytes.iter().enumerate() { |
| 133 | + if b == b':' && i >= 1 && i + 2 < bytes.len() { |
| 134 | + let before = bytes[i - 1].is_ascii_digit(); |
| 135 | + let after = bytes[i + 1].is_ascii_digit() && bytes[i + 2].is_ascii_digit(); |
| 136 | + if before && after { |
| 137 | + return true; |
| 138 | + } |
| 139 | + } |
| 140 | + } |
| 141 | + false |
| 142 | + } |
| 143 | + |
| 144 | + fn classify(sentence: &str) -> Option<ExtractKind> { |
| 145 | + let lower = sentence.to_lowercase(); |
| 146 | + if Self::has_temporal_marker(&lower) { |
| 147 | + return Some(ExtractKind::TemporalEvent); |
| 148 | + } |
| 149 | + if PREFERENCE_CUES.iter().any(|c| lower.contains(c)) { |
| 150 | + return Some(ExtractKind::Preference); |
| 151 | + } |
| 152 | + if EPISODE_CUES.iter().any(|c| lower.contains(c)) { |
| 153 | + return Some(ExtractKind::Episode); |
| 154 | + } |
| 155 | + if FACT_MARKERS.iter().any(|m| lower.contains(m)) { |
| 156 | + return Some(ExtractKind::Fact); |
| 157 | + } |
| 158 | + None |
| 159 | + } |
| 160 | +} |
| 161 | + |
/// Split text into trimmed sentences on `.`, `!`, `?`, and newlines.
///
/// A `.` with an alphanumeric character directly on both sides is treated as
/// part of a token (decimal, version, domain, file name: `14.2`, `v2.1`,
/// `example.com`) and does not end the sentence, so such tokens are no longer
/// cut in half. The trade-off: two sentences joined without a space
/// (`end.Next`) stay together. Empty fragments are dropped.
fn split_sentences(text: &str) -> Vec<String> {
    /// Moves the trimmed buffer into `out` when non-empty, then resets the buffer.
    fn flush(buf: &mut String, out: &mut Vec<String>) {
        let sentence = buf.trim();
        if !sentence.is_empty() {
            out.push(sentence.to_string());
        }
        buf.clear();
    }

    let mut out = Vec::new();
    let mut buf = String::new();
    let mut prev: Option<char> = None;
    let mut chars = text.chars().peekable();
    // `while let` rather than `for`: the loop needs to peek at the next char.
    while let Some(ch) = chars.next() {
        let ends_sentence = match ch {
            '!' | '?' | '\n' | '\r' => true,
            '.' => {
                let inside_token = prev.map_or(false, char::is_alphanumeric)
                    && chars.peek().copied().map_or(false, char::is_alphanumeric);
                !inside_token
            }
            _ => false,
        };
        if ends_sentence {
            flush(&mut buf, &mut out);
        } else {
            buf.push(ch);
        }
        prev = Some(ch);
    }
    flush(&mut buf, &mut out);
    out
}
| 184 | + |
| 185 | +impl Extractor for RuleBasedExtractor { |
| 186 | + fn extract(&self, text: &str) -> Vec<ExtractedItem> { |
| 187 | + let mut out: Vec<ExtractedItem> = Vec::new(); |
| 188 | + for sentence in split_sentences(text) { |
| 189 | + if let Some(kind) = Self::classify(&sentence) { |
| 190 | + let item = ExtractedItem { kind, text: sentence }; |
| 191 | + if !out.contains(&item) { |
| 192 | + out.push(item); |
| 193 | + } |
| 194 | + } |
| 195 | + } |
| 196 | + out |
| 197 | + } |
| 198 | +} |
| 199 | + |
| 200 | +/// Resolve a strategy name to an extractor. Unknown / "none" → [`NoopExtractor`]. |
| 201 | +pub fn extractor_for(strategy: &str) -> Box<dyn Extractor> { |
| 202 | + match strategy { |
| 203 | + "rule_based" => Box::new(RuleBasedExtractor), |
| 204 | + _ => Box::new(NoopExtractor), |
| 205 | + } |
| 206 | +} |
| 207 | + |
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the rule-based extractor over `text` and returns only the item kinds.
    fn rule_based_kinds(text: &str) -> Vec<ExtractKind> {
        RuleBasedExtractor
            .extract(text)
            .into_iter()
            .map(|item| item.kind)
            .collect()
    }

    #[test]
    fn noop_extracts_nothing() {
        let extracted = NoopExtractor.extract("I prefer dark mode. The db is Postgres.");
        assert!(extracted.is_empty());
    }

    #[test]
    fn classifies_preference_fact_temporal_episode() {
        let text = "I prefer dark mode. The database is PostgreSQL. \
                    We deployed the worker tier. We shipped v2 on 2026-06-20.";
        // The last sentence is dated, so temporal wins over episode.
        let expected = [
            ExtractKind::Preference,
            ExtractKind::Fact,
            ExtractKind::Episode,
            ExtractKind::TemporalEvent,
        ];
        assert_eq!(rule_based_kinds(text), expected);
    }

    #[test]
    fn temporal_marker_takes_priority() {
        // A clock time and a weekday both mark temporal events.
        let found = rule_based_kinds("The standup is at 09:30. We met on Tuesday.");
        assert_eq!(found, [ExtractKind::TemporalEvent; 2]);
    }

    #[test]
    fn year_detection_is_bounded() {
        let cases = [
            ("released in 2026", true),
            ("back in 1998 it shipped", true),
            ("order 12345 failed", false), // 5 digits, not a year
            ("port 8080 is open", false),  // not 19/20xx
        ];
        for &(input, expected) in &cases {
            assert_eq!(
                RuleBasedExtractor::contains_year(input),
                expected,
                "input: {:?}",
                input
            );
        }
    }

    #[test]
    fn unclassifiable_sentences_are_skipped() {
        // No copula, no cue, no date → nothing (precision over recall).
        assert!(RuleBasedExtractor.extract("Hello there. Wow!").is_empty());
    }

    #[test]
    fn deduplicates_identical_items() {
        let extracted = RuleBasedExtractor.extract("The db is Postgres. The db is Postgres.");
        assert_eq!(extracted.len(), 1);
    }

    #[test]
    fn extraction_is_deterministic() {
        let text = "I like Rust. We shipped on 2026-01-02. The cache is an LRU.";
        let first = RuleBasedExtractor.extract(text);
        let second = RuleBasedExtractor.extract(text);
        assert_eq!(first, second);
    }

    #[test]
    fn extractor_for_unknown_is_noop() {
        let sample = "The db is Postgres.";
        assert!(extractor_for("nope").extract(sample).is_empty());
        assert!(!extractor_for("rule_based").extract(sample).is_empty());
    }
}
0 commit comments