hanamorix
diff --git a/‎brain/memory/__init__.py‎
Lines changed: 15 additions & 0 deletions b/‎brain/memory/__init__.py‎
Lines changed: 15 additions & 0 deletions
diff --git a/‎brain/memory/embeddings.py‎
Lines changed: 124 additions & 0 deletions b/‎brain/memory/embeddings.py‎
Lines changed: 124 additions & 0 deletions
diff --git a/‎brain/memory/hebbian.py‎
Lines changed: 150 additions & 0 deletions b/‎brain/memory/hebbian.py‎
Lines changed: 150 additions & 0 deletions
@@ -0,0 +1,15 @@
+"""The memory substrate — SQLite-backed store + embeddings + Hebbian + search.
+
+Four sub-modules, each with a single responsibility:
+- store: Memory dataclass + MemoryStore (SQLite-backed CRUD)
+- embeddings: provider abstraction + content-hash cache
+- hebbian: connection matrix + spreading activation
+- search: semantic + emotional + temporal + spreading queries
+
+See spec Section 4.1 for the file-tree and Section 10.1 for the SQLite
+data-layer decision (replaces OG's JSON/numpy files).
+"""
+
+from brain.memory.store import Memory, MemoryStore
+
+__all__ = ["Memory", "MemoryStore"]
@@ -0,0 +1,124 @@
+"""Embedding provider abstraction + content-hash cache.
+
+Provider interface: EmbeddingProvider ABC. Two concrete providers:
+- FakeEmbeddingProvider: deterministic hash-based, zero network, used in tests.
+- OllamaEmbeddingProvider: calls local Ollama /api/embeddings endpoint
+  (will be added in Week 5 when the bridge lands).
+
+Cache: EmbeddingCache layers a SQLite content-hash cache on top of any
+provider. `get_or_compute(content)` returns the vector, hitting cache on
+repeat calls. Content hashed via SHA-256; first 32 hex chars used as key.
+
+Design per spec Section 4.1 (brain/memory/embeddings.py) and Section 10.1
+(content-hash embedding cache).
+"""
+
+from __future__ import annotations
+
+import hashlib
+import sqlite3
+from abc import ABC, abstractmethod
+from pathlib import Path
+
+import numpy as np
+
+# Vector length FakeEmbeddingProvider uses when the caller passes no `dim`.
+_DEFAULT_DIM = 256
+
+
+class EmbeddingProvider(ABC):
+    """Interface every embedding backend must satisfy.
+
+    A concrete provider supplies two things: `embed`, which turns text into
+    a vector, and `embedding_dim`, which reports the length of that vector.
+    The class cannot be instantiated until both are implemented.
+    """
+
+    @abstractmethod
+    def embed(self, text: str) -> np.ndarray:
+        """Embed `text` as a 1-D numpy array of length `embedding_dim()`."""
+
+    @abstractmethod
+    def embedding_dim(self) -> int:
+        """Report the dimension of the vectors produced by `embed`."""
+
+
+class FakeEmbeddingProvider(EmbeddingProvider):
+    """Test-only provider that derives a repeatable vector from the text itself.
+
+    The leading 8 bytes of the text's SHA-256 digest seed a NumPy Generator;
+    a standard-normal draw from that generator is scaled to unit length.
+    Identical text therefore always maps to the identical vector, with no
+    network access and no dependencies beyond hashlib and NumPy.
+    """
+
+    def __init__(self, dim: int = _DEFAULT_DIM) -> None:
+        self._dim = dim
+
+    def embedding_dim(self) -> int:
+        return self._dim
+
+    def embed(self, text: str) -> np.ndarray:
+        digest = hashlib.sha256(text.encode("utf-8")).digest()
+        # The seed is the first 8 digest bytes read as an unsigned big-endian int.
+        rng = np.random.default_rng(int.from_bytes(digest[:8], "big"))
+        raw = rng.standard_normal(self._dim)
+        length = np.linalg.norm(raw)
+        if length == 0.0:
+            # Cannot normalise a zero vector (e.g. dim == 0 yields an empty draw).
+            raise ValueError(f"FakeEmbeddingProvider produced a zero-norm vector (dim={self._dim})")
+        return raw / length
+
+
+class EmbeddingCache:
+    """Content-hash cache on top of any EmbeddingProvider.
+
+    Storage: SQLite table with (content_hash TEXT PRIMARY KEY, vector BLOB,
+    dim INTEGER, created_at TEXT). Hash is SHA-256 hex (first 32 chars).
+    Vector stored as raw float32 bytes via np.ndarray.tobytes().
+
+    The key is derived from the content alone, so a database file can outlive
+    the provider that filled it. A stored row is therefore only trusted when
+    its `dim` equals the current provider's `embedding_dim()` and its blob
+    holds exactly `dim` float32 values; any other row is recomputed and
+    overwritten.
+
+    Instances can be used as context managers; the connection is closed on
+    exit.
+    """
+
+    _SCHEMA = """
+    CREATE TABLE IF NOT EXISTS embedding_cache (
+        content_hash TEXT PRIMARY KEY,
+        vector BLOB NOT NULL,
+        dim INTEGER NOT NULL,
+        created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
+    );
+    """
+
+    def __init__(self, db_path: str | Path, provider: EmbeddingProvider) -> None:
+        """Open (or create) the cache database at `db_path` and wrap `provider`."""
+        self._conn = sqlite3.connect(str(db_path))
+        self._conn.executescript(self._SCHEMA)
+        self._conn.commit()
+        self._provider = provider
+
+    def __enter__(self) -> "EmbeddingCache":
+        return self
+
+    def __exit__(self, *exc_info: object) -> None:
+        self.close()
+
+    def close(self) -> None:
+        """Close the underlying connection."""
+        self._conn.close()
+
+    def get_or_compute(self, content: str) -> np.ndarray:
+        """Return the embedding for `content` as a fresh 1-D float32 array.
+
+        A valid cached row is returned without calling the provider. On a
+        miss, or when the cached row does not fit the current provider, the
+        provider is called and the row is written (replacing any stale one).
+        """
+        key = self._hash(content)
+        row = self._conn.execute(
+            "SELECT vector, dim FROM embedding_cache WHERE content_hash = ?", (key,)
+        ).fetchone()
+        if row is not None:
+            blob, dim = row
+            expected_bytes = dim * np.dtype(np.float32).itemsize
+            # Reject rows written for a different dimension or with a
+            # truncated/corrupt blob instead of handing back a wrong-shaped vector.
+            if dim == self._provider.embedding_dim() and len(blob) == expected_bytes:
+                return np.frombuffer(blob, dtype=np.float32).copy()
+
+        # Flatten so the stored `dim` always equals the number of stored floats.
+        vec = np.asarray(self._provider.embed(content), dtype=np.float32).reshape(-1)
+        # OR REPLACE: the key may already hold a stale row that failed the check above.
+        self._conn.execute(
+            "INSERT OR REPLACE INTO embedding_cache (content_hash, vector, dim) VALUES (?, ?, ?)",
+            (key, vec.tobytes(), int(vec.shape[0])),
+        )
+        self._conn.commit()
+        # Return a float32 copy for consistency with cache hits.
+        return vec.copy()
+
+    def count(self) -> int:
+        """Return the number of cached embeddings."""
+        return int(self._conn.execute("SELECT COUNT(*) FROM embedding_cache").fetchone()[0])
+
+    @staticmethod
+    def _hash(content: str) -> str:
+        """Return the cache key: first 32 hex chars of SHA-256 over UTF-8 `content`."""
+        return hashlib.sha256(content.encode("utf-8")).hexdigest()[:32]
+
+
+def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
+    """Compute the cosine of the angle between `a` and `b`.
+
+    Returns 0.0 when the product of the two norms is zero (so a zero vector
+    never causes a division by zero); otherwise dot(a, b) divided by that
+    product, nominally within [-1, 1].
+    """
+    magnitude = float(np.linalg.norm(a) * np.linalg.norm(b))
+    return float(np.dot(a, b) / magnitude) if magnitude != 0.0 else 0.0
@@ -0,0 +1,150 @@
+"""Hebbian connection matrix with spreading activation.
+
+Edges are undirected: edge (a, b) is stored canonically with the lower id
+first to avoid duplicate rows. Weight accumulates over repeated strengthen()
+calls; decay_all() reduces all weights (floored at 0); garbage_collect()
+removes weak edges to keep the graph compact.
+
+Spreading activation is a bounded BFS that propagates seed activation
+through the graph, attenuating by (weight * decay_per_hop) at each hop.
+Multi-path arrivals take the max (not sum) — prevents an activation
+runaway on densely connected graphs.
+
+Design per spec Section 4.1 (brain/memory/hebbian.py) and OG's F32/F33
+Hebbian work.
+"""
+
+from __future__ import annotations
+
+import sqlite3
+from collections.abc import Iterable
+from pathlib import Path
+
+
+class HebbianMatrix:
+    """SQLite-backed sparse weighted graph between memory ids.
+
+    Edges are undirected and stored once, with the pair ordered by
+    `_canonical`. Every mutating method commits immediately. Instances can
+    be used as context managers; the connection is closed on exit.
+    """
+
+    _SCHEMA = """
+    CREATE TABLE IF NOT EXISTS hebbian_edges (
+        memory_a TEXT NOT NULL,
+        memory_b TEXT NOT NULL,
+        weight REAL NOT NULL DEFAULT 0.0,
+        last_strengthened_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
+        PRIMARY KEY (memory_a, memory_b)
+    );
+
+    CREATE INDEX IF NOT EXISTS idx_hebbian_a ON hebbian_edges(memory_a);
+    CREATE INDEX IF NOT EXISTS idx_hebbian_b ON hebbian_edges(memory_b);
+    """
+
+    def __init__(self, db_path: str | Path) -> None:
+        """Open (or create) the edge database at `db_path`."""
+        self._conn = sqlite3.connect(str(db_path))
+        self._conn.executescript(self._SCHEMA)
+        self._conn.commit()
+
+    def __enter__(self) -> "HebbianMatrix":
+        return self
+
+    def __exit__(self, *exc_info: object) -> None:
+        self.close()
+
+    def close(self) -> None:
+        """Close the underlying connection."""
+        self._conn.close()
+
+    def strengthen(self, a: str, b: str, delta: float = 0.1) -> None:
+        """Add `delta` to the weight of edge (a, b). Creates the edge if new.
+
+        `delta` must be positive — the module contract is that weights are
+        non-negative. Callers that want to weaken an edge use `decay_all`
+        or `garbage_collect`. Zero or negative delta raises ValueError.
+        A self-edge (a == b) is silently ignored, before `delta` is checked.
+        """
+        if a == b:
+            return  # self-edges not tracked
+        if delta <= 0.0:
+            raise ValueError(f"delta must be positive, got {delta!r}")
+        lo, hi = _canonical(a, b)
+        # Upsert: a new edge starts at `delta`; an existing one accumulates it
+        # and has its timestamp refreshed.
+        self._conn.execute(
+            """
+            INSERT INTO hebbian_edges (memory_a, memory_b, weight)
+            VALUES (?, ?, ?)
+            ON CONFLICT(memory_a, memory_b)
+                DO UPDATE SET weight = weight + excluded.weight,
+                              last_strengthened_at = CURRENT_TIMESTAMP
+            """,
+            (lo, hi, delta),
+        )
+        self._conn.commit()
+
+    def weight(self, a: str, b: str) -> float:
+        """Return the weight of edge (a, b). Zero if no edge or if a == b."""
+        if a == b:
+            return 0.0
+        lo, hi = _canonical(a, b)
+        row = self._conn.execute(
+            "SELECT weight FROM hebbian_edges WHERE memory_a = ? AND memory_b = ?",
+            (lo, hi),
+        ).fetchone()
+        return float(row[0]) if row else 0.0
+
+    def neighbors(self, memory_id: str) -> list[tuple[str, float]]:
+        """Return [(other_id, weight), ...] for every edge touching memory_id."""
+        # An edge is stored once, so memory_id may sit in either column.
+        rows = self._conn.execute(
+            """
+            SELECT memory_b, weight FROM hebbian_edges WHERE memory_a = ?
+            UNION ALL
+            SELECT memory_a, weight FROM hebbian_edges WHERE memory_b = ?
+            """,
+            (memory_id, memory_id),
+        ).fetchall()
+        return [(other, float(weight)) for other, weight in rows]
+
+    def decay_all(self, rate: float) -> None:
+        """Subtract `rate` from every weight, floored at 0.
+
+        `rate` must be non-negative. A negative rate would inflate every
+        weight in a single scheduled batch — silent corruption for
+        dream/heartbeat cycles. ValueError guards the sign.
+        """
+        if rate < 0.0:
+            raise ValueError(f"decay rate must be non-negative, got {rate!r}")
+        self._conn.execute("UPDATE hebbian_edges SET weight = MAX(weight - ?, 0.0)", (rate,))
+        self._conn.commit()
+
+    def garbage_collect(self, threshold: float = 0.01) -> int:
+        """Remove edges with weight < threshold. Returns the count removed."""
+        cursor = self._conn.execute("DELETE FROM hebbian_edges WHERE weight < ?", (threshold,))
+        self._conn.commit()
+        return cursor.rowcount
+
+    def spreading_activation(
+        self,
+        seed_ids: Iterable[str],
+        depth: int = 2,
+        decay_per_hop: float = 0.5,
+    ) -> dict[str, float]:
+        """BFS spreading activation from seed_ids, returning activation by id.
+
+        Seed nodes start at activation 1.0 and propagation never lowers any
+        node: a value is only replaced by a strictly larger one. Each hop
+        multiplies the source activation by (edge_weight * decay_per_hop)
+        to produce the neighbour's activation. Multi-path arrivals take the
+        max (not sum) — prevents activation runaway on densely connected
+        graphs.
+
+        Hops are level-synchronous: every node in the frontier propagates
+        the activation it held when the hop began. This keeps the result
+        independent of set iteration order and guarantees that activation
+        travels at most `depth` edges from a seed.
+
+        Returns a dict {memory_id: activation}; seeds are always present.
+        """
+        activation: dict[str, float] = {sid: 1.0 for sid in seed_ids}
+
+        frontier = set(activation)
+        for _ in range(depth):
+            # Freeze source levels so an update made during this hop cannot
+            # be forwarded again within the same hop.
+            levels = {node: activation[node] for node in frontier}
+            next_frontier: set[str] = set()
+            for node, level in levels.items():
+                for neighbour, weight in self.neighbors(node):
+                    propagated = level * weight * decay_per_hop
+                    if propagated > activation.get(neighbour, 0.0):
+                        activation[neighbour] = propagated
+                        next_frontier.add(neighbour)
+            frontier = next_frontier
+            if not frontier:
+                break
+        return activation
+
+
+def _canonical(a: str, b: str) -> tuple[str, str]:
+    """Order the pair ascending so (a, b) and (b, a) address the same row."""
+    if b < a:
+        return (b, a)
+    return (a, b)