Merge: Addendum 5 — inverted-index recall (~25x faster, lossless)

alfredolopez80 · alfredolopez80 · commit 8daf2457be2e · 2026-06-17T14:48:04.000+02:00
* claude/naughty-ptolemy-c0b392:
  perf(memory): inverted-index recall — 250ms-&gt;10ms (~25x), lossless (Addendum 5)
diff --git a/scripts/memory/recall_v2.py b/scripts/memory/recall_v2.py
@@ -163,11 +163,43 @@ def node_id_for(payload: object, path: Path) -> str:
     return path.stem
 
 
+# Perf (Addendum 5): an index entry that carries these keys is a COMPLETE recall
+# payload (scoring + hard-reject + render), so recall scores it without opening
+# the node file. Older "thin" entries lack them and are lazily loaded per-node.
+_FAT_INDEX_KEYS = ("authority", "sensitivity", "project_id")
+
+
 def iter_node_payloads(store: TreeStore, project_id: str) -> list[tuple[Path, Any]]:
+    """Yield (path, payload) for every node, reading index.json ONCE when present.
+
+    Perf: the fat ``index.json`` (Addendum 5) lets recall score+reject all nodes
+    from a single file read — O(1) instead of O(n) node-file opens. Entries that
+    predate the fat index (``thin``) are lazily loaded so mixed trees, and trees
+    written by an older codex/claude build, stay correct. Falls back to scanning
+    node files when no index exists.
+    """
     directory = store.nodes_dir(project_id)
+    index = store.load_index(project_id)
+    entries = index.get("nodes") if isinstance(index, dict) else None
+    if entries:
+        payloads: list[tuple[Path, Any]] = []
+        for entry in entries:
+            if not isinstance(entry, dict):
+                continue
+            node_id = entry.get("node_id")
+            if not node_id:
+                continue
+            path = directory / f"{node_id}.json"
+            if all(key in entry for key in _FAT_INDEX_KEYS):
+                payloads.append((path, entry))
+            else:  # thin entry: fall back to the authoritative node file
+                payloads.append((path, store.load_node(project_id, node_id)))
+        return payloads
+
+    # No index (or empty "nodes"): scan node files (legacy / un-indexed tree).
     if not directory.exists():
         return []
-    payloads: list[tuple[Path, Any]] = []
+    payloads = []
     for path in sorted(directory.glob("*.json")):
         if path.name.startswith("."):
             continue
@@ -178,6 +210,48 @@ def iter_node_payloads(store: TreeStore, project_id: str) -> list[tuple[Path, An
     return payloads
 
 
+def candidate_payloads(
+    store: TreeStore, project_id: str, analysis: dict[str, Any]
+) -> list[tuple[Path, Any]]:
+    """Return only the nodes that could score > 0, via the inverted index.
+
+    Perf (Addendum 5): ``score_node`` returns 0 unless a query search term is a
+    SUBSTRING of the node's summary/trigger/entities/paths/tags/links, and recall
+    drops score<=0 nodes. So the candidate set = nodes whose posting tokens
+    contain a search term. This is LOSSLESS (same ranked results) but scores only
+    a handful of nodes instead of all N. Falls back to the full scan when the
+    index has no ``postings`` (older tree) so behaviour is never worse.
+    """
+    index = store.load_index(project_id)
+    postings = index.get("postings") if isinstance(index, dict) else None
+    entries = index.get("nodes") if isinstance(index, dict) else None
+    if not isinstance(postings, dict) or not isinstance(entries, list):
+        return iter_node_payloads(store, project_id)
+
+    search_terms = [s for s in analysis.get("search_terms", []) if s]
+    if not search_terms:
+        return []  # nothing can score > 0 without a search term
+
+    candidate_ids: set[str] = set()
+    for token, ids in postings.items():
+        if isinstance(ids, list) and any(s in token for s in search_terms):
+            candidate_ids.update(ids)
+    if not candidate_ids:
+        return []
+
+    directory = store.nodes_dir(project_id)
+    by_id = {e.get("node_id"): e for e in entries if isinstance(e, dict)}
+    out: list[tuple[Path, Any]] = []
+    for nid in candidate_ids:
+        entry = by_id.get(nid)
+        path = directory / f"{nid}.json"
+        if isinstance(entry, dict) and all(key in entry for key in _FAT_INDEX_KEYS):
+            out.append((path, entry))
+        else:  # thin / missing entry: authoritative node file
+            out.append((path, store.load_node(project_id, nid)))
+    return out
+
+
 def _as_dict(value: object) -> dict[str, Any]:
     return value if isinstance(value, dict) else {}
 
@@ -391,7 +465,7 @@ def recall(
     rejected: list[dict[str, str]] = []
     scored: list[tuple[float, dict[str, Any], dict[str, float]]] = []
 
-    for path, payload in iter_node_payloads(store, context.project_id):
+    for path, payload in candidate_payloads(store, context.project_id, analysis):
         node_id = node_id_for(payload, path)
         reason = hard_reject_reason(payload, context, include_deprecated)
         if reason:
diff --git a/scripts/memory/tree_store.py b/scripts/memory/tree_store.py
@@ -489,15 +489,93 @@ def _index_entry(node: dict[str, Any]) -> dict[str, Any]:
             "raw_ref": ref,
             "updated_at": node.get("updated_at", ""),
             "created_at": node.get("created_at", ""),
+            # Perf (Addendum 5): the index entry carries EVERY field recall_v2
+            # scores + hard-rejects + renders on, so recall reads index.json
+            # once (O(1)) instead of opening all N node files (O(n)). These
+            # fields make the entry a complete recall payload. RED never reaches
+            # disk, so none of this leaks secret material.
+            "salience": node.get("salience", {}),
+            "sensitivity": node.get("sensitivity", ""),
+            "authority": node.get("authority", ""),
+            "project_id": node.get("project_id", ""),
+            "workspace_instance_id": node.get("workspace_instance_id", ""),
+            "repo_remote_hash": node.get("repo_remote_hash", ""),
+            "commit": node.get("commit", ""),
+            "session_id": node.get("session_id", ""),
+            "detailed_summary": node.get("detailed_summary", ""),
+            "source_description": node.get("source_description", ""),
         }
 
+    def load_index(self, project_id: str) -> dict[str, Any] | None:
+        """Return the parsed index.json for a project, or None if absent/corrupt.
+
+        Perf (Addendum 5): recall reads this single file instead of opening every
+        node file. Each entry in ``nodes`` is a complete recall payload (see
+        ``_index_entry``). Returns None on any read/parse error so callers fall
+        back to the per-node scan and never crash on a damaged index.
+        """
+        try:
+            path = self.project_tree(project_id) / "index.json"
+        except TreeStorePathError:
+            return None
+        if not path.exists():
+            return None
+        try:
+            data = json.loads(path.read_text(encoding="utf-8"))
+        except (OSError, json.JSONDecodeError, ValueError):
+            return None
+        return data if isinstance(data, dict) else None
+
+    @staticmethod
+    def _entry_tokens(entry: dict[str, Any]) -> set[str]:
+        """Maximal ``[A-Za-z0-9_./-]`` runs (len>=3) of an entry's searchable text.
+
+        Perf (Addendum 5): these are the posting tokens. They are deliberately
+        NOT stopword/length-filtered the way ``recall_v2.terms`` filters QUERY
+        terms -- recall matches a query term as a SUBSTRING of the node text, and
+        a non-stopword query term can sit inside a stopword run (e.g. ``her`` in
+        ``where``). Indexing every run keeps candidate selection LOSSLESS: any
+        node that ``score_node`` would rank > 0 shares a token here. Runs shorter
+        than 3 chars cannot contain a (>=3 char) query term, so they are skipped.
+        """
+        trigger = entry.get("trigger")
+        trigger_text = " ".join(str(v) for v in trigger.values()) if isinstance(trigger, dict) else str(trigger or "")
+        parts = [
+            str(entry.get("summary", "")),
+            trigger_text,
+            " ".join(str(x) for x in entry.get("entities", []) or []),
+            " ".join(str(x) for x in entry.get("source_paths", []) or []),
+            " ".join(str(x) for x in entry.get("topic_tags", []) or []),
+            " ".join(str(x) for x in entry.get("links", []) or []),
+        ]
+        blob = " ".join(parts).lower()
+        return {tok for tok in re.findall(r"[A-Za-z0-9_./-]+", blob) if len(tok) >= 3}
+
+    def _build_postings(self, nodes: list[dict[str, Any]]) -> dict[str, list[str]]:
+        postings: dict[str, list[str]] = {}
+        for entry in nodes:
+            node_id = entry.get("node_id")
+            if not node_id:
+                continue
+            for tok in self._entry_tokens(entry):
+                postings.setdefault(tok, []).append(node_id)
+        for tok in postings:
+            postings[tok] = sorted(set(postings[tok]))
+        return postings
+
     def _write_index(self, project_id: str) -> None:
         root = self.ensure_layout(project_id)
+        nodes = self.list_nodes(project_id)
         index = {
             "schema_version": INDEX_SCHEMA_VERSION,
             "project_id": project_id,
             "updated_at": now_iso(),
-            "nodes": self.list_nodes(project_id),
+            "nodes": nodes,
+            # Perf (Addendum 5): inverted index (token -> node_ids) so recall scores
+            # only candidate nodes that share a term with the query, not all N.
+            # Lossless: see _entry_tokens. recall falls back to scoring all nodes
+            # when "postings" is absent (older index).
+            "postings": self._build_postings(nodes),
         }
         atomic_write_json(root / "index.json", index)