Skip to content

Commit 444c5db

Browse files
authored
Merge pull request #3 from hanamorix/week-3-memory-substrate
feat: Week 3 — brain/memory substrate (store, embeddings, hebbian, search)
2 parents 51aa6f6 + db8cfe8 commit 444c5db

13 files changed

Lines changed: 3973 additions & 0 deletions

File tree

brain/memory/__init__.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
"""The memory substrate — SQLite-backed store + embeddings + Hebbian + search.
2+
3+
Four sub-modules, each with a single responsibility:
4+
- store: Memory dataclass + MemoryStore (SQLite-backed CRUD)
5+
- embeddings: provider abstraction + content-hash cache
6+
- hebbian: connection matrix + spreading activation
7+
- search: semantic + emotional + temporal + spreading queries
8+
9+
See spec Section 4.1 for the file-tree and Section 10.1 for the SQLite
10+
data-layer decision (replaces OG's JSON/numpy files).
11+
"""
12+
13+
from brain.memory.store import Memory, MemoryStore
14+
15+
# Names re-exported at package level; both come from brain.memory.store.
__all__ = ["Memory", "MemoryStore"]

brain/memory/embeddings.py

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
"""Embedding provider abstraction + content-hash cache.
2+
3+
Provider interface: EmbeddingProvider ABC. Two concrete providers:
4+
- FakeEmbeddingProvider: deterministic hash-based, zero network, used in tests.
5+
- OllamaEmbeddingProvider: calls local Ollama /api/embeddings endpoint
6+
(will be added in Week 5 when the bridge lands).
7+
8+
Cache: EmbeddingCache layers a SQLite content-hash cache on top of any
9+
provider. `get_or_compute(content)` returns the vector, hitting cache on
10+
repeat calls. Content hashed via SHA-256; first 32 hex chars used as key.
11+
12+
Design per spec Section 4.1 (brain/memory/embeddings.py) and Section 10.1
13+
(content-hash embedding cache).
14+
"""
15+
16+
from __future__ import annotations
17+
18+
import hashlib
19+
import sqlite3
20+
from abc import ABC, abstractmethod
21+
from pathlib import Path
22+
23+
import numpy as np
24+
25+
# Vector length FakeEmbeddingProvider uses when the caller does not pass `dim`.
_DEFAULT_DIM = 256
26+
27+
28+
class EmbeddingProvider(ABC):
    """Interface every embedding backend must satisfy.

    Concrete subclasses have to supply both `embedding_dim` and `embed`;
    the class cannot be instantiated until they do.
    """

    @abstractmethod
    def embedding_dim(self) -> int:
        """Report the length of the vectors that `embed` produces."""

    @abstractmethod
    def embed(self, text: str) -> np.ndarray:
        """Embed `text` as a 1-D numpy array of length `embedding_dim()`."""
38+
39+
40+
class FakeEmbeddingProvider(EmbeddingProvider):
    """Test double that derives a repeatable embedding from the text itself.

    The SHA-256 digest of the input seeds a NumPy Generator; a standard-normal
    draw from that generator is scaled to unit length. Identical text therefore
    always maps to the identical vector, and the provider needs neither a
    network nor any external service.
    """

    def __init__(self, dim: int = _DEFAULT_DIM) -> None:
        self._dim = dim

    def embedding_dim(self) -> int:
        return self._dim

    def embed(self, text: str) -> np.ndarray:
        digest = hashlib.sha256(text.encode("utf-8")).digest()
        # The leading 8 digest bytes, read as an unsigned big-endian integer,
        # become the generator seed.
        generator = np.random.default_rng(int.from_bytes(digest[:8], "big"))
        raw = generator.standard_normal(self._dim)
        length = np.linalg.norm(raw)
        if length == 0.0:
            raise ValueError(f"FakeEmbeddingProvider produced a zero-norm vector (dim={self._dim})")
        return raw / length
63+
64+
65+
class EmbeddingCache:
    """Content-hash cache on top of any EmbeddingProvider.

    Storage: SQLite table with (content_hash TEXT PRIMARY KEY, vector BLOB,
    dim INTEGER, created_at TEXT). The key is the first 32 hex chars of the
    SHA-256 of the UTF-8 content. Vectors are stored as raw float32 bytes via
    np.ndarray.tobytes().

    The key does not encode which provider produced a vector, so a row is
    only served when its stored dimension matches the current provider's
    `embedding_dim()`; otherwise it is recomputed and replaced.

    Instances can be used as context managers; the connection is closed on
    exit.
    """

    _SCHEMA = """
    CREATE TABLE IF NOT EXISTS embedding_cache (
        content_hash TEXT PRIMARY KEY,
        vector BLOB NOT NULL,
        dim INTEGER NOT NULL,
        created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
    );
    """

    def __init__(self, db_path: str | Path, provider: EmbeddingProvider) -> None:
        """Open (or create) the cache database at `db_path` for `provider`."""
        self._conn = sqlite3.connect(str(db_path))
        try:
            self._conn.executescript(self._SCHEMA)
            self._conn.commit()
        except Exception:
            # Do not leak the connection when the schema cannot be created.
            self._conn.close()
            raise
        self._provider = provider

    def __enter__(self) -> EmbeddingCache:
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        self.close()

    def close(self) -> None:
        """Close the underlying connection."""
        self._conn.close()

    def get_or_compute(self, content: str) -> np.ndarray:
        """Return the embedding for `content`, computing + storing on miss.

        A cached row counts as a hit only if both its recorded `dim` and the
        actual length of the stored vector equal the provider's current
        `embedding_dim()`. Anything else is treated as stale (for example a
        database filled by a provider with a different dimension) and is
        overwritten with a freshly computed vector.

        Returns:
            A new 1-D float32 array owned by the caller.

        Raises:
            ValueError: if the provider returns something that is not 1-D.
        """
        key = self._hash(content)
        expected_dim = self._provider.embedding_dim()
        row = self._conn.execute(
            "SELECT vector, dim FROM embedding_cache WHERE content_hash = ?", (key,)
        ).fetchone()
        if row is not None:
            cached = np.frombuffer(row[0], dtype=np.float32)
            if row[1] == expected_dim and cached.shape[0] == expected_dim:
                # frombuffer yields a read-only view of the blob; hand out a copy.
                return cached.copy()

        vec = np.asarray(self._provider.embed(content), dtype=np.float32)
        if vec.ndim != 1:
            raise ValueError(f"provider returned a {vec.ndim}-D embedding, expected 1-D")
        # OR REPLACE so a stale row for the same content is overwritten
        # instead of tripping the primary-key constraint.
        self._conn.execute(
            "INSERT OR REPLACE INTO embedding_cache (content_hash, vector, dim) VALUES (?, ?, ?)",
            (key, vec.tobytes(), vec.shape[0]),
        )
        self._conn.commit()
        # Return a float32 copy for consistency with cache hits.
        return vec.copy()

    def count(self) -> int:
        """Return the number of cached embeddings."""
        return int(self._conn.execute("SELECT COUNT(*) FROM embedding_cache").fetchone()[0])

    @staticmethod
    def _hash(content: str) -> str:
        """Return the cache key: first 32 hex chars of SHA-256 of `content`."""
        return hashlib.sha256(content.encode("utf-8")).hexdigest()[:32]
117+
118+
119+
def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Return cosine similarity between two vectors. Range [-1, 1].

    If either vector has zero norm the similarity is defined as 0.0 instead
    of dividing by zero. Rounding in the dot product and the norms can push
    the raw quotient a hair past +/-1 (e.g. for a vector compared with
    itself), so the result is clamped to keep the documented range. NaN
    results are returned unchanged.
    """
    denom = float(np.linalg.norm(a) * np.linalg.norm(b))
    if denom == 0.0:
        return 0.0
    similarity = float(np.dot(a, b) / denom)
    # Explicit comparisons (not min/max) so NaN is passed through unchanged.
    if similarity > 1.0:
        return 1.0
    if similarity < -1.0:
        return -1.0
    return similarity

brain/memory/hebbian.py

Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
"""Hebbian connection matrix with spreading activation.
2+
3+
Edges are undirected: edge (a, b) is stored canonically with the lower id
4+
first to avoid duplicate rows. Weight accumulates over repeated strengthen()
5+
calls; decay_all() reduces all weights (floored at 0); garbage_collect()
6+
removes weak edges to keep the graph compact.
7+
8+
Spreading activation is a bounded BFS that propagates seed activation
9+
through the graph, attenuating by (weight * decay_per_hop) at each hop.
10+
Multi-path arrivals take the max (not sum) — prevents an activation
11+
runaway on densely connected graphs.
12+
13+
Design per spec Section 4.1 (brain/memory/hebbian.py) and OG's F32/F33
14+
Hebbian work.
15+
"""
16+
17+
from __future__ import annotations
18+
19+
import sqlite3
20+
from collections.abc import Iterable
21+
from pathlib import Path
22+
23+
24+
class HebbianMatrix:
    """SQLite-backed sparse weighted graph between memory ids.

    Edges are undirected and stored once, with the pair ordered by
    `_canonical`. Weights are kept non-negative: `strengthen` only accepts
    positive deltas and `decay_all` floors at 0.

    Instances can be used as context managers; the connection is closed on
    exit.
    """

    _SCHEMA = """
    CREATE TABLE IF NOT EXISTS hebbian_edges (
        memory_a TEXT NOT NULL,
        memory_b TEXT NOT NULL,
        weight REAL NOT NULL DEFAULT 0.0,
        last_strengthened_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
        PRIMARY KEY (memory_a, memory_b)
    );

    CREATE INDEX IF NOT EXISTS idx_hebbian_a ON hebbian_edges(memory_a);
    CREATE INDEX IF NOT EXISTS idx_hebbian_b ON hebbian_edges(memory_b);
    """

    def __init__(self, db_path: str | Path) -> None:
        """Open (or create) the edge database at `db_path`."""
        self._conn = sqlite3.connect(str(db_path))
        try:
            self._conn.executescript(self._SCHEMA)
            self._conn.commit()
        except Exception:
            # Do not leak the connection when the schema cannot be created.
            self._conn.close()
            raise

    def __enter__(self) -> HebbianMatrix:
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        self.close()

    def close(self) -> None:
        """Close the underlying connection."""
        self._conn.close()

    def strengthen(self, a: str, b: str, delta: float = 0.1) -> None:
        """Add `delta` to the weight of edge (a, b). Creates the edge if new.

        `delta` must be strictly positive — the module contract is that
        weights are non-negative. Callers that want to weaken an edge use
        `decay_all` or `garbage_collect`.

        Raises:
            ValueError: if `delta` is zero, negative, or NaN. The check runs
                before the self-edge short-circuit so an invalid delta is
                reported regardless of which ids were passed.
        """
        # `not delta > 0.0` (rather than `delta <= 0.0`) also rejects NaN.
        if not delta > 0.0:
            raise ValueError(f"delta must be positive, got {delta!r}")
        if a == b:
            return  # self-edges not tracked
        lo, hi = _canonical(a, b)
        self._conn.execute(
            """
            INSERT INTO hebbian_edges (memory_a, memory_b, weight)
            VALUES (?, ?, ?)
            ON CONFLICT(memory_a, memory_b)
            DO UPDATE SET weight = weight + excluded.weight,
                          last_strengthened_at = CURRENT_TIMESTAMP
            """,
            (lo, hi, delta),
        )
        self._conn.commit()

    def weight(self, a: str, b: str) -> float:
        """Return the weight of edge (a, b). Zero if no edge or if a == b."""
        if a == b:
            return 0.0
        lo, hi = _canonical(a, b)
        row = self._conn.execute(
            "SELECT weight FROM hebbian_edges WHERE memory_a = ? AND memory_b = ?",
            (lo, hi),
        ).fetchone()
        return float(row[0]) if row else 0.0

    def neighbors(self, memory_id: str) -> list[tuple[str, float]]:
        """Return [(other_id, weight), ...] for every edge touching memory_id."""
        # An edge may hold memory_id in either column, so query both sides.
        rows = self._conn.execute(
            """
            SELECT memory_b, weight FROM hebbian_edges WHERE memory_a = ?
            UNION ALL
            SELECT memory_a, weight FROM hebbian_edges WHERE memory_b = ?
            """,
            (memory_id, memory_id),
        ).fetchall()
        return [(other, float(weight)) for other, weight in rows]

    def decay_all(self, rate: float) -> None:
        """Subtract `rate` from every weight, floored at 0.

        `rate` must be non-negative. A negative rate would inflate every
        weight in a single scheduled batch — silent corruption for
        dream/heartbeat cycles.

        Raises:
            ValueError: if `rate` is negative or NaN.
        """
        # `not rate >= 0.0` (rather than `rate < 0.0`) also rejects NaN.
        if not rate >= 0.0:
            raise ValueError(f"decay rate must be non-negative, got {rate!r}")
        self._conn.execute("UPDATE hebbian_edges SET weight = MAX(weight - ?, 0.0)", (rate,))
        self._conn.commit()

    def garbage_collect(self, threshold: float = 0.01) -> int:
        """Remove edges with weight < threshold. Returns the count removed."""
        cursor = self._conn.execute("DELETE FROM hebbian_edges WHERE weight < ?", (threshold,))
        self._conn.commit()
        return cursor.rowcount

    def spreading_activation(
        self,
        seed_ids: Iterable[str],
        depth: int = 2,
        decay_per_hop: float = 0.5,
    ) -> dict[str, float]:
        """BFS spreading activation from seed_ids, returning activation by id.

        Seed nodes start at activation 1.0 and propagation cannot lower
        them. Each hop multiplies the source activation by
        (edge_weight * decay_per_hop) to produce the neighbour's
        activation. Multi-path arrivals take the max (not sum), so a node's
        value only changes when a strictly larger activation reaches it.

        Args:
            seed_ids: ids to start from.
            depth: maximum number of hops; the walk stops earlier when a hop
                improves no node.
            decay_per_hop: attenuation factor applied on every hop.

        Returns:
            A dict {memory_id: activation} covering the seeds and every node
            that received activation.
        """
        activation: dict[str, float] = {}
        for sid in seed_ids:
            activation[sid] = 1.0

        frontier = set(activation)
        for _ in range(depth):
            next_frontier: set[str] = set()
            for node in frontier:
                for neighbour, weight in self.neighbors(node):
                    propagated = activation[node] * weight * decay_per_hop
                    # Only a strict improvement updates the node and
                    # schedules it for expansion on the next hop.
                    if propagated > activation.get(neighbour, 0.0):
                        activation[neighbour] = propagated
                        next_frontier.add(neighbour)
            frontier = next_frontier
            if not frontier:
                break
        return activation
146+
147+
148+
def _canonical(a: str, b: str) -> tuple[str, str]:
149+
"""Sort the pair so edge (a, b) and (b, a) hash to the same row."""
150+
return (a, b) if a <= b else (b, a)

0 commit comments

Comments
 (0)