@@ -163,11 +163,43 @@ def node_id_for(payload: object, path: Path) -> str:
163163 return path .stem
164164
165165
166+ # Perf (Addendum 5): an index entry that carries these keys is a COMPLETE recall
167+ # payload (scoring + hard-reject + render), so recall scores it without opening
168+ # the node file. Older "thin" entries lack them and are lazily loaded per-node.
169+ _FAT_INDEX_KEYS = ("authority" , "sensitivity" , "project_id" )
170+
171+
166172def iter_node_payloads (store : TreeStore , project_id : str ) -> list [tuple [Path , Any ]]:
173+ """Yield (path, payload) for every node, reading index.json ONCE when present.
174+
175+ Perf: the fat ``index.json`` (Addendum 5) lets recall score+reject all nodes
176+ from a single file read — O(1) instead of O(n) node-file opens. Entries that
177+ predate the fat index (``thin``) are lazily loaded so mixed trees, and trees
178+ written by an older codex/claude build, stay correct. Falls back to scanning
179+ node files when no index exists.
180+ """
167181 directory = store .nodes_dir (project_id )
182+ index = store .load_index (project_id )
183+ entries = index .get ("nodes" ) if isinstance (index , dict ) else None
184+ if entries :
185+ payloads : list [tuple [Path , Any ]] = []
186+ for entry in entries :
187+ if not isinstance (entry , dict ):
188+ continue
189+ node_id = entry .get ("node_id" )
190+ if not node_id :
191+ continue
192+ path = directory / f"{ node_id } .json"
193+ if all (key in entry for key in _FAT_INDEX_KEYS ):
194+ payloads .append ((path , entry ))
195+ else : # thin entry: fall back to the authoritative node file
196+ payloads .append ((path , store .load_node (project_id , node_id )))
197+ return payloads
198+
199+ # No index (or empty "nodes"): scan node files (legacy / un-indexed tree).
168200 if not directory .exists ():
169201 return []
170- payloads : list [ tuple [ Path , Any ]] = []
202+ payloads = []
171203 for path in sorted (directory .glob ("*.json" )):
172204 if path .name .startswith ("." ):
173205 continue
@@ -178,6 +210,48 @@ def iter_node_payloads(store: TreeStore, project_id: str) -> list[tuple[Path, An
178210 return payloads
179211
180212
def candidate_payloads(
    store: TreeStore, project_id: str, analysis: dict[str, Any]
) -> list[tuple[Path, Any]]:
    """Return only the nodes that could score > 0, via the inverted index.

    Perf (Addendum 5): the candidate set is every node listed under a posting
    token that contains at least one query search term as a substring. The
    intent is that recall scores a handful of nodes instead of all N while
    producing the same ranked results.

    NOTE(review): the "same results" guarantee assumes every substring match
    the scorer can make is also a substring match against some posting token;
    confirm against the code that writes ``postings``.

    Args:
        store: Tree store providing ``load_index``, ``nodes_dir`` and
            ``load_node`` for the project.
        project_id: Project whose index is consulted.
        analysis: Query analysis; only its ``"search_terms"`` entry is read.

    Returns:
        ``(path, payload)`` pairs in a deterministic order: candidates that
        have an index entry come first, in index order, followed by candidates
        known only to ``postings``, sorted by id. Falls back to
        ``iter_node_payloads`` when the index has no usable ``postings`` or
        ``nodes`` (older tree). Returns ``[]`` when there are no search terms
        or no posting matches.
    """
    index = store.load_index(project_id)
    postings = index.get("postings") if isinstance(index, dict) else None
    entries = index.get("nodes") if isinstance(index, dict) else None
    if not isinstance(postings, dict) or not isinstance(entries, list):
        return iter_node_payloads(store, project_id)

    # Tolerate a missing/None "search_terms" and drop anything that cannot be
    # used as the left operand of a substring test.
    raw_terms = analysis.get("search_terms") or []
    search_terms = [term for term in raw_terms if isinstance(term, str) and term]
    if not search_terms:
        return []  # nothing can score > 0 without a search term

    candidate_ids: set[str] = set()
    for token, ids in postings.items():
        if not isinstance(token, str) or not isinstance(ids, list):
            continue
        if any(term in token for term in search_terms):
            # Ignore malformed ids (unhashable or non-string values from a
            # damaged index) instead of raising from set.update().
            candidate_ids.update(
                nid for nid in ids if isinstance(nid, str) and nid
            )
    if not candidate_ids:
        return []

    # Insertion order follows the index; a duplicated node_id keeps its first
    # position but takes the value of its last entry.
    by_id = {
        entry["node_id"]: entry
        for entry in entries
        if isinstance(entry, dict) and isinstance(entry.get("node_id"), str)
    }

    # Never iterate the set directly: string hashing is randomised per process,
    # so the output order (and any order-dependent behaviour in the caller)
    # would change from run to run.
    ordered_ids = [nid for nid in by_id if nid in candidate_ids]
    ordered_ids.extend(sorted(candidate_ids.difference(by_id)))

    directory = store.nodes_dir(project_id)
    out: list[tuple[Path, Any]] = []
    for nid in ordered_ids:
        entry = by_id.get(nid)
        path = directory / f"{nid}.json"
        if entry is not None and all(key in entry for key in _FAT_INDEX_KEYS):
            out.append((path, entry))
        else:  # thin / missing entry: authoritative node file
            out.append((path, store.load_node(project_id, nid)))
    return out
253+
254+
181255def _as_dict (value : object ) -> dict [str , Any ]:
182256 return value if isinstance (value , dict ) else {}
183257
@@ -391,7 +465,7 @@ def recall(
391465 rejected : list [dict [str , str ]] = []
392466 scored : list [tuple [float , dict [str , Any ], dict [str , float ]]] = []
393467
394- for path , payload in iter_node_payloads (store , context .project_id ):
468+ for path , payload in candidate_payloads (store , context .project_id , analysis ):
395469 node_id = node_id_for (payload , path )
396470 reason = hard_reject_reason (payload , context , include_deprecated )
397471 if reason :
0 commit comments