Skip to content

Commit 8daf245

Browse files
Merge: Addendum 5 — inverted-index recall (~25x faster, lossless)
* claude/naughty-ptolemy-c0b392: perf(memory): inverted-index recall — 250ms->10ms (~25x), lossless (Addendum 5)
2 parents 8f2d438 + e26a249 commit 8daf245

2 files changed

Lines changed: 155 additions & 3 deletions

File tree

scripts/memory/recall_v2.py

Lines changed: 76 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -163,11 +163,43 @@ def node_id_for(payload: object, path: Path) -> str:
163163
return path.stem
164164

165165

166+
# Perf (Addendum 5): an index entry that carries these keys is a COMPLETE recall
167+
# payload (scoring + hard-reject + render), so recall scores it without opening
168+
# the node file. Older "thin" entries lack them and are lazily loaded per-node.
169+
_FAT_INDEX_KEYS = ("authority", "sensitivity", "project_id")
170+
171+
166172
def iter_node_payloads(store: TreeStore, project_id: str) -> list[tuple[Path, Any]]:
173+
"""Yield (path, payload) for every node, reading index.json ONCE when present.
174+
175+
Perf: the fat ``index.json`` (Addendum 5) lets recall score+reject all nodes
176+
from a single file read — O(1) instead of O(n) node-file opens. Entries that
177+
predate the fat index (``thin``) are lazily loaded so mixed trees, and trees
178+
written by an older codex/claude build, stay correct. Falls back to scanning
179+
node files when no index exists.
180+
"""
167181
directory = store.nodes_dir(project_id)
182+
index = store.load_index(project_id)
183+
entries = index.get("nodes") if isinstance(index, dict) else None
184+
if entries:
185+
payloads: list[tuple[Path, Any]] = []
186+
for entry in entries:
187+
if not isinstance(entry, dict):
188+
continue
189+
node_id = entry.get("node_id")
190+
if not node_id:
191+
continue
192+
path = directory / f"{node_id}.json"
193+
if all(key in entry for key in _FAT_INDEX_KEYS):
194+
payloads.append((path, entry))
195+
else: # thin entry: fall back to the authoritative node file
196+
payloads.append((path, store.load_node(project_id, node_id)))
197+
return payloads
198+
199+
# No index (or empty "nodes"): scan node files (legacy / un-indexed tree).
168200
if not directory.exists():
169201
return []
170-
payloads: list[tuple[Path, Any]] = []
202+
payloads = []
171203
for path in sorted(directory.glob("*.json")):
172204
if path.name.startswith("."):
173205
continue
@@ -178,6 +210,48 @@ def iter_node_payloads(store: TreeStore, project_id: str) -> list[tuple[Path, An
178210
return payloads
179211

180212

213+
def candidate_payloads(
214+
store: TreeStore, project_id: str, analysis: dict[str, Any]
215+
) -> list[tuple[Path, Any]]:
216+
"""Return only the nodes that could score > 0, via the inverted index.
217+
218+
Perf (Addendum 5): ``score_node`` returns 0 unless a query search term is a
219+
SUBSTRING of the node's summary/trigger/entities/paths/tags/links, and recall
220+
drops score<=0 nodes. So the candidate set = nodes whose posting tokens
221+
contain a search term. This is LOSSLESS (same ranked results) but scores only
222+
a handful of nodes instead of all N. Falls back to the full scan when the
223+
index has no ``postings`` (older tree) so behaviour is never worse.
224+
"""
225+
index = store.load_index(project_id)
226+
postings = index.get("postings") if isinstance(index, dict) else None
227+
entries = index.get("nodes") if isinstance(index, dict) else None
228+
if not isinstance(postings, dict) or not isinstance(entries, list):
229+
return iter_node_payloads(store, project_id)
230+
231+
search_terms = [s for s in analysis.get("search_terms", []) if s]
232+
if not search_terms:
233+
return [] # nothing can score > 0 without a search term
234+
235+
candidate_ids: set[str] = set()
236+
for token, ids in postings.items():
237+
if isinstance(ids, list) and any(s in token for s in search_terms):
238+
candidate_ids.update(ids)
239+
if not candidate_ids:
240+
return []
241+
242+
directory = store.nodes_dir(project_id)
243+
by_id = {e.get("node_id"): e for e in entries if isinstance(e, dict)}
244+
out: list[tuple[Path, Any]] = []
245+
for nid in candidate_ids:
246+
entry = by_id.get(nid)
247+
path = directory / f"{nid}.json"
248+
if isinstance(entry, dict) and all(key in entry for key in _FAT_INDEX_KEYS):
249+
out.append((path, entry))
250+
else: # thin / missing entry: authoritative node file
251+
out.append((path, store.load_node(project_id, nid)))
252+
return out
253+
254+
181255
def _as_dict(value: object) -> dict[str, Any]:
182256
return value if isinstance(value, dict) else {}
183257

@@ -391,7 +465,7 @@ def recall(
391465
rejected: list[dict[str, str]] = []
392466
scored: list[tuple[float, dict[str, Any], dict[str, float]]] = []
393467

394-
for path, payload in iter_node_payloads(store, context.project_id):
468+
for path, payload in candidate_payloads(store, context.project_id, analysis):
395469
node_id = node_id_for(payload, path)
396470
reason = hard_reject_reason(payload, context, include_deprecated)
397471
if reason:

scripts/memory/tree_store.py

Lines changed: 79 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -489,15 +489,93 @@ def _index_entry(node: dict[str, Any]) -> dict[str, Any]:
489489
"raw_ref": ref,
490490
"updated_at": node.get("updated_at", ""),
491491
"created_at": node.get("created_at", ""),
492+
# Perf (Addendum 5): the index entry carries EVERY field recall_v2
493+
# scores + hard-rejects + renders on, so recall reads index.json
494+
# once (O(1)) instead of opening all N node files (O(n)). These
495+
# fields make the entry a complete recall payload. RED never reaches
496+
# disk, so none of this leaks secret material.
497+
"salience": node.get("salience", {}),
498+
"sensitivity": node.get("sensitivity", ""),
499+
"authority": node.get("authority", ""),
500+
"project_id": node.get("project_id", ""),
501+
"workspace_instance_id": node.get("workspace_instance_id", ""),
502+
"repo_remote_hash": node.get("repo_remote_hash", ""),
503+
"commit": node.get("commit", ""),
504+
"session_id": node.get("session_id", ""),
505+
"detailed_summary": node.get("detailed_summary", ""),
506+
"source_description": node.get("source_description", ""),
492507
}
493508

509+
def load_index(self, project_id: str) -> dict[str, Any] | None:
510+
"""Return the parsed index.json for a project, or None if absent/corrupt.
511+
512+
Perf (Addendum 5): recall reads this single file instead of opening every
513+
node file. Each entry in ``nodes`` is a complete recall payload (see
514+
``_index_entry``). Returns None on any read/parse error so callers fall
515+
back to the per-node scan and never crash on a damaged index.
516+
"""
517+
try:
518+
path = self.project_tree(project_id) / "index.json"
519+
except TreeStorePathError:
520+
return None
521+
if not path.exists():
522+
return None
523+
try:
524+
data = json.loads(path.read_text(encoding="utf-8"))
525+
except (OSError, json.JSONDecodeError, ValueError):
526+
return None
527+
return data if isinstance(data, dict) else None
528+
529+
@staticmethod
530+
def _entry_tokens(entry: dict[str, Any]) -> set[str]:
531+
"""Maximal ``[A-Za-z0-9_./-]`` runs (len>=3) of an entry's searchable text.
532+
533+
Perf (Addendum 5): these are the posting tokens. They are deliberately
534+
NOT stopword/length-filtered the way ``recall_v2.terms`` filters QUERY
535+
terms -- recall matches a query term as a SUBSTRING of the node text, and
536+
a non-stopword query term can sit inside a stopword run (e.g. ``her`` in
537+
``where``). Indexing every run keeps candidate selection LOSSLESS: any
538+
node that ``score_node`` would rank > 0 shares a token here. Runs shorter
539+
than 3 chars cannot contain a (>=3 char) query term, so they are skipped.
540+
"""
541+
trigger = entry.get("trigger")
542+
trigger_text = " ".join(str(v) for v in trigger.values()) if isinstance(trigger, dict) else str(trigger or "")
543+
parts = [
544+
str(entry.get("summary", "")),
545+
trigger_text,
546+
" ".join(str(x) for x in entry.get("entities", []) or []),
547+
" ".join(str(x) for x in entry.get("source_paths", []) or []),
548+
" ".join(str(x) for x in entry.get("topic_tags", []) or []),
549+
" ".join(str(x) for x in entry.get("links", []) or []),
550+
]
551+
blob = " ".join(parts).lower()
552+
return {tok for tok in re.findall(r"[A-Za-z0-9_./-]+", blob) if len(tok) >= 3}
553+
554+
def _build_postings(self, nodes: list[dict[str, Any]]) -> dict[str, list[str]]:
555+
postings: dict[str, list[str]] = {}
556+
for entry in nodes:
557+
node_id = entry.get("node_id")
558+
if not node_id:
559+
continue
560+
for tok in self._entry_tokens(entry):
561+
postings.setdefault(tok, []).append(node_id)
562+
for tok in postings:
563+
postings[tok] = sorted(set(postings[tok]))
564+
return postings
565+
494566
def _write_index(self, project_id: str) -> None:
495567
root = self.ensure_layout(project_id)
568+
nodes = self.list_nodes(project_id)
496569
index = {
497570
"schema_version": INDEX_SCHEMA_VERSION,
498571
"project_id": project_id,
499572
"updated_at": now_iso(),
500-
"nodes": self.list_nodes(project_id),
573+
"nodes": nodes,
574+
# Perf (Addendum 5): inverted index (token -> node_ids) so recall scores
575+
# only candidate nodes that share a term with the query, not all N.
576+
# Lossless: see _entry_tokens. recall falls back to scoring all nodes
577+
# when "postings" is absent (older index).
578+
"postings": self._build_postings(nodes),
501579
}
502580
atomic_write_json(root / "index.json", index)
503581

0 commit comments

Comments
 (0)