perf: reuse sentence embeddings for chunk vectors

zizhaof · claude · zizhaof · commit 61750ace1660 · 2026-04-23T18:44:59.000-07:00
On an Oracle Free Tier 4-vCPU ARM box with no GPU, bge-m3 encode is the
single biggest cost of attachment upload. Measured in-container on a
synthetic 7KB, 150-sentence text: semantic-chunking + chunk-storage needed
two separate embed_texts passes (~8.5s each) = ~17s wall time total, which
dominated the user-visible extract latency.

Fold the per-chunk embed into the sentence-level pass:

- New chunk_and_embed() does the same sentence-boundary detection as
  chunk_text_semantic, but returns the resulting (chunks, embeddings) pair.
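  Per-chunk embeddings are the L2-renormalized sum of their constituent
  sentence vectors. Normalizing the sum is equivalent to mean-pool +
  renormalize (the 1/n factor cancels under normalization), and because
  bge-m3 sentence embeddings are already unit-norm, every sentence
  contributes with equal weight.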
- process_attachment uses chunk_and_embed instead of chunk_text_semantic +
  a second embed_texts(chunks) call.
- chunk_text_semantic stays as a thin text-only wrapper (kept for other
  call sites and test compat).
- Fallback path: when the single sentence-embed pass fails, fall back to
  _chunk_fixed and re-embed the fixed chunks once (same cost shape as the
  old double-pass, acceptable for the error path only).

Also set TOKENIZERS_PARALLELISM=true in the backend compose env so the HF
tokenizer can parallelize encode's tokenize step across threads (tiny
speedup, zero risk — we never fork the process after the tokenizer is used).

Expected staging wall-time: extract ~17s → ~9s, dominated by the now-single
sentence embed pass. Will verify on staging.

Tests
- Updated process_attachment tests to mock chunk_and_embed.
- Updated chunk_text_semantic fallback test (fallback path now re-embeds).
- Added TestChunkAndEmbed covering: aligned (chunks, embeddings) lengths,
  unit-norm of pooled chunk vectors, re-embed fallback on failure,
  empty-input returns ([], []).

250 pytest tests passed (246 existing + 4 new).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
diff --git a/backend/services/attachment_processor.py b/backend/services/attachment_processor.py
@@ -13,6 +13,7 @@
 import base64
 import io
 import logging
+import math
 import re
 from typing import Optional
 
@@ -217,67 +218,131 @@ def _split_sentences(text: str) -> list[str]:
     return sentences
 
 
+def _group_sentences_into_chunks(
+    sentences: list[str],
+    sent_embs: list[list[float]],
+) -> tuple[list[str], list[list[int]]]:
+    """
+    Walk sentences with precomputed embeddings, cutting on semantic jumps or size overflow.
+    Returns (chunk_texts, chunk_sentence_indices).
+    """
+    chunks: list[str] = []
+    chunk_idx: list[list[int]] = []
+    current_sents: list[str] = [sentences[0]]
+    current_idx: list[int] = [0]
+    current_len: int = len(sentences[0])
+
+    for i in range(1, len(sentences)):
+        sent = sentences[i]
+        sent_len = len(sent)
+
+        # Dot product equals cosine similarity because vectors are L2-normalized
+        sim: float = sum(a * b for a, b in zip(sent_embs[i - 1], sent_embs[i]))
+
+        # Break on semantic jump or size overflow (only if current chunk meets MIN_CHUNK_CHARS)
+        should_break = (
+            sim < SEMANTIC_THRESHOLD or current_len + sent_len > MAX_CHUNK_CHARS
+        ) and current_len >= MIN_CHUNK_CHARS
+
+        if should_break:
+            chunks.append("".join(current_sents))
+            chunk_idx.append(current_idx)
+            current_sents = [sent]
+            current_idx = [i]
+            current_len = sent_len
+        else:
+            current_sents.append(sent)
+            current_idx.append(i)
+            current_len += sent_len
+
+    if current_sents:
+        tail = "".join(current_sents)
+        # Merge a too-short tail chunk into the previous one to avoid tiny orphan chunks
+        if chunks and len(tail) < MIN_CHUNK_CHARS:
+            chunks[-1] += tail
+            chunk_idx[-1].extend(current_idx)
+        else:
+            chunks.append(tail)
+            chunk_idx.append(current_idx)
+
+    return chunks, chunk_idx
+
+
+def _pool_chunk_embedding(sent_embs: list[list[float]], idxs: list[int]) -> list[float]:
+    """
+    Derive a chunk embedding by L2-normalizing the sum of its sentence embeddings.
+    Equivalent to mean-pool + renormalize since bge-m3 sentence vectors are unit-length.
+    """
+    if not idxs:
+        raise ValueError("empty sentence index list")
+    dim = len(sent_embs[idxs[0]])
+    acc = [0.0] * dim
+    for i in idxs:
+        v = sent_embs[i]
+        for j in range(dim):
+            acc[j] += v[j]
+    norm = math.sqrt(sum(x * x for x in acc)) or 1.0
+    return [x / norm for x in acc]
+
+
 async def chunk_text_semantic(text: str) -> list[str]:
     """
-    Semantic chunking based on embedding cosine similarity.
+    Semantic chunking based on embedding cosine similarity; returns text chunks only.
+    Thin wrapper around chunk_and_embed for call sites that don't need the embeddings.
+    """
+    chunks, _ = await chunk_and_embed(text)
+    return chunks
 
-    Flow:
-      1. Split by sentence boundaries -> sentences
-      2. Batch-embed all sentences once (bge-m3 is normalized; dot product = cosine similarity)
-      3. When adjacent-sentence similarity < SEMANTIC_THRESHOLD, treat as a semantic break and cut
-      4. Merge sentences into chunks while keeping length under MAX_CHUNK_CHARS
 
-    Falls back to fixed-size chunking on any error.
+async def chunk_and_embed(text: str) -> tuple[list[str], list[list[float]]]:
+    """
+    Semantic chunking + per-chunk embeddings in a single embed pass.
+
+    Flow:
+      1. Split into sentences.
+      2. Batch-embed every sentence once (bge-m3 is L2-normalized; dot product = cosine sim).
+      3. Group adjacent sentences into chunks, breaking on semantic jumps or size overflow.
+      4. Derive each chunk's embedding by summing + L2-renormalizing the sentence vectors
+         that compose it. Saves a full second embed pass that process_attachment used to
+         do on the joined chunk text — embedding is ~half the extract wall time on CPU.
+
+    Returns (chunks, chunk_embeddings) with aligned lengths.
+    Falls back to fixed-size chunking + a re-embed on any error.
     """
     from services.embedding_service import embed_texts
 
     sentences = _split_sentences(text)
     if not sentences:
-        return []
+        return [], []
     if len(sentences) == 1:
-        return [sentences[0]] if len(sentences[0]) >= MIN_CHUNK_CHARS else []
+        sent = sentences[0]
+        if len(sent) < MIN_CHUNK_CHARS:
+            return [], []
+        embs = await embed_texts([sent])
+        return [sent], list(embs)
 
     try:
-        # Single batch embed call — avoids repeated executor dispatch overhead
-        embeddings = await embed_texts(sentences)
-
-        chunks: list[str] = []
-        current: list[str] = [sentences[0]]
-        current_len: int = len(sentences[0])
-
-        for i in range(1, len(sentences)):
-            sent = sentences[i]
-            sent_len = len(sent)
-
-            # Dot product equals cosine similarity because vectors are L2-normalized
-            sim: float = sum(a * b for a, b in zip(embeddings[i - 1], embeddings[i]))
-
-            # Break on semantic jump or size overflow (only if current chunk meets MIN_CHUNK_CHARS)
-            should_break = (
-                sim < SEMANTIC_THRESHOLD or current_len + sent_len > MAX_CHUNK_CHARS
-            ) and current_len >= MIN_CHUNK_CHARS
-
-            if should_break:
-                chunks.append("".join(current))
-                current = [sent]
-                current_len = sent_len
-            else:
-                current.append(sent)
-                current_len += sent_len
-
-        if current:
-            tail = "".join(current)
-            # Merge a too-short tail chunk into the previous one to avoid tiny orphan chunks
-            if chunks and len(tail) < MIN_CHUNK_CHARS:
-                chunks[-1] += tail
-            else:
-                chunks.append(tail)
-
-        return [c for c in chunks if c.strip()]
+        sent_embs = await embed_texts(sentences)
+
+        chunks, chunk_idx = _group_sentences_into_chunks(sentences, sent_embs)
+
+        # Drop empty-after-strip chunks and their embeddings in lockstep
+        kept: list[tuple[str, list[float]]] = []
+        for text_chunk, idxs in zip(chunks, chunk_idx):
+            if text_chunk.strip() and idxs:
+                kept.append((text_chunk, _pool_chunk_embedding(sent_embs, idxs)))
+        if not kept:
+            return [], []
+        kept_chunks, kept_embs = zip(*kept)
+        return list(kept_chunks), list(kept_embs)
 
     except Exception as e:
-        logger.warning("语义切分失败，fallback 到固定切分 / Semantic chunking failed, falling back: %s", e)
-        return _chunk_fixed(text)
+        logger.warning("Semantic chunk+embed failed, falling back to fixed chunking + re-embed: %s", e)
+        fallback_chunks = _chunk_fixed(text)
+        if not fallback_chunks:
+            return [], []
+        fallback_embs = await embed_texts(fallback_chunks)
+        return fallback_chunks, list(fallback_embs)
 
 
 def _chunk_fixed(text: str) -> list[str]:
@@ -372,14 +437,11 @@ async def process_attachment(
             logger.info("附件内联模式 / Attachment inline: %s (%d chars)", filename, len(text))
             return {"chunk_count": 0, "inline_text": text.strip()}
 
-        # Long text: semantic chunk → embed → store
-        chunks = await chunk_text_semantic(text)
+        # Long text: semantic chunk + embed in a single pass → store
+        chunks, embeddings = await chunk_and_embed(text)
         if not chunks:
             return {"chunk_count": 0, "inline_text": None}
 
-        from services.embedding_service import embed_texts
-        embeddings = await embed_texts(chunks)
-
         await _store_chunks(session_id, filename, chunks, embeddings)
         logger.info("附件 RAG 模式 / Attachment RAG: %s (%d chunks, session=%s)",
                     filename, len(chunks), session_id)
diff --git a/backend/tests/test_attachment_processor.py b/backend/tests/test_attachment_processor.py
@@ -118,18 +118,92 @@ async def test_max_chunk_size_triggers_break(self):
 
     @pytest.mark.asyncio
     async def test_embed_failure_falls_back_to_fixed(self):
-        """Falls back to fixed chunking on embed failure."""
-        from services.attachment_processor import chunk_text_semantic, CHUNK_SIZE
+        """Falls back to fixed chunking on semantic-pass embed failure."""
+        from services.attachment_processor import chunk_text_semantic
 
-        # Text needs sentence punctuation so _split_sentences yields >1 sentence and embed_texts is called
         single_sent = "这是一个用于测试的句子，内容并不重要只是用来触发分块逻辑。"
         text = single_sent * 20   # Repeated 20 times; each ends with a period, totaling 20 sentences
-        with patch("services.embedding_service.embed_texts", new=AsyncMock(side_effect=Exception("model error"))):
+
+        # First call (sentence embed for semantic-boundary detection) blows up;
+        # fallback path does a second embed on the fixed chunks, which succeeds.
+        calls: list[int] = []
+        async def fake_embed(texts):
+            calls.append(len(texts))
+            if len(calls) == 1:
+                raise Exception("model error")
+            return [[0.1] * 1024 for _ in texts]
+
+        with patch("services.embedding_service.embed_texts", new=fake_embed):
             result = await chunk_text_semantic(text)
 
         assert len(result) > 1  # Fallback fixed-split still produces multiple chunks
 
 
+# ── chunk_and_embed ───────────────────────────────────────────────────
+
+class TestChunkAndEmbed:
+    @pytest.mark.asyncio
+    async def test_returns_aligned_lists(self):
+        """chunks and embeddings are same length and per-chunk."""
+        from services.attachment_processor import chunk_and_embed
+        text = (
+            "First sentence about dogs. Second about dogs too. "
+            "Totally unrelated topic: quantum computing basics. "
+            "More on quantum. Yet more on quantum theory."
+        ) * 10
+        # Two clusters of sentences — embeddings differ per cluster to trigger a break
+        dog_vec = [1.0, 0.0] + [0.0] * 1022
+        qc_vec = [0.0, 1.0] + [0.0] * 1022
+        sent_embs = [dog_vec, dog_vec, qc_vec, qc_vec, qc_vec] * 10
+        with patch("services.embedding_service.embed_texts",
+                   new=AsyncMock(return_value=sent_embs)):
+            chunks, embs = await chunk_and_embed(text)
+        assert len(chunks) == len(embs) > 0
+
+    @pytest.mark.asyncio
+    async def test_chunk_embeddings_are_unit_norm(self):
+        """Pooled chunk embeddings are L2-normalized (unit length)."""
+        from services.attachment_processor import chunk_and_embed
+        # Two similar sentences → one chunk covering both. Vectors unit-norm but not identical.
+        text = "Alpha statement. Beta statement."
+        v1 = [1.0, 0.0] + [0.0] * 1022
+        v2 = [0.8, 0.6] + [0.0] * 1022  # unit length
+        with patch("services.embedding_service.embed_texts",
+                   new=AsyncMock(return_value=[v1, v2])):
+            chunks, embs = await chunk_and_embed(text)
+        assert len(embs) == 1
+        norm_sq = sum(x * x for x in embs[0])
+        assert abs(norm_sq - 1.0) < 1e-6
+
+    @pytest.mark.asyncio
+    async def test_fallback_reembeds_fixed_chunks(self):
+        """On semantic-embed failure, falls back to fixed chunks + a re-embed call."""
+        from services.attachment_processor import chunk_and_embed
+        text = "Sentence one. Sentence two. Sentence three." * 200
+
+        call_log: list[str] = []
+
+        async def fake_embed(texts):
+            # First call (sentence embed during semantic path) fails;
+            # second call (fallback chunk embed) succeeds and returns one vec per chunk.
+            call_log.append("call")
+            if len(call_log) == 1:
+                raise RuntimeError("embed model exploded")
+            return [[0.1] * 1024 for _ in texts]
+
+        with patch("services.embedding_service.embed_texts", new=fake_embed):
+            chunks, embs = await chunk_and_embed(text)
+        assert len(chunks) == len(embs) > 0
+        assert len(call_log) == 2  # Sentence pass failed, fallback pass succeeded
+
+    @pytest.mark.asyncio
+    async def test_empty_text_returns_empty(self):
+        """Empty / whitespace-only text returns ([], [])."""
+        from services.attachment_processor import chunk_and_embed
+        assert await chunk_and_embed("") == ([], [])
+        assert await chunk_and_embed("   \n   ") == ([], [])
+
+
 # ── _chunk_fixed ──────────────────────────────────────────────────────
 
 class TestChunkFixed:
@@ -355,9 +429,9 @@ async def test_long_text_goes_to_rag(self):
         fake_embeddings = [[0.1] * 1024] * 3
 
         with patch("services.attachment_processor.extract_text", new=AsyncMock(return_value=long_text)), \
-             patch("services.attachment_processor.chunk_text_semantic", new=AsyncMock(return_value=fake_chunks)), \
-             patch("services.attachment_processor._store_chunks", new=AsyncMock()), \
-             patch("services.embedding_service.embed_texts", new=AsyncMock(return_value=fake_embeddings)):
+             patch("services.attachment_processor.chunk_and_embed",
+                   new=AsyncMock(return_value=(fake_chunks, fake_embeddings))), \
+             patch("services.attachment_processor._store_chunks", new=AsyncMock()):
             result = await process_attachment("session-1", "doc.pdf", b"pdf bytes")
 
         assert result["chunk_count"] == len(fake_chunks)
@@ -371,7 +445,8 @@ async def test_no_chunks_returns_failure(self):
         long_text = "X" * (INLINE_THRESHOLD + 100)
 
         with patch("services.attachment_processor.extract_text", new=AsyncMock(return_value=long_text)), \
-             patch("services.attachment_processor.chunk_text_semantic", new=AsyncMock(return_value=[])):
+             patch("services.attachment_processor.chunk_and_embed",
+                   new=AsyncMock(return_value=([], []))):
             result = await process_attachment("session-1", "file.txt", b"content")
 
         assert result == {"chunk_count": 0, "inline_text": None}
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -7,6 +7,9 @@ services:
     env_file: ./backend/.env
     environment:
       - SEARXNG_URL=${SEARXNG_INTERNAL}
+      # Let HuggingFace tokenizers use thread-level parallelism for batch encode (small
+      # speedup in bge-m3's tokenize step; safe since we never fork after the tokenizer is used).
+      - TOKENIZERS_PARALLELISM=true
     volumes:
       - ./logs:/app/logs   # 日志持久化到宿主机 / persist logs to host
       - hf-cache:/root/.cache/huggingface   # 共享 HuggingFace 模型缓存（prod/staging 复用同一份 bge-m3）