fix: close bypass vectors from adversarial audit and harden pipeline

Fieldnote-Echo · claude · Fieldnote-Echo · commit 28d136c06b12 · 2026-02-28T19:28:50.000-06:00
Security fixes (2 CRITICAL, 3 HIGH, 4 MEDIUM):
- Strip supplementary variation selectors U+E0100-U+E01EF (invisible bypass)
- Strip U+E0000 LANGUAGE TAG (expand tag block range to U+E0000-U+E007F)
- Strip LRM U+200E and RLM U+200F (zero-width directional marks)
- Add Greek homoglyphs: iota U+03B9→i, nu U+03BD→v, rho U+03C1→p
- Normalize backslashes in path escaper (Windows traversal bypass)
- Sanitize dictionary keys in walk() (previously only values)
- Add runtime TypeError on non-str input to clean()
- Validate escaper return type (must be str)
- Null-byte warnings now include the number of bytes removed, per project convention

Tests: 44 new regression tests (240 total), placeholder tests filled,
weak assertions fixed. Classifiers and project.urls added to pyproject.toml.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -43,12 +43,15 @@ pre-commit run --all-files
 
 ## CI
 
-GitHub Actions runs on push to `main` and all PRs. Four parallel jobs:
+GitHub Actions runs on push to `main` and all PRs. Five parallel jobs:
 
 - **lint** — `ruff check` + `ruff format --check`
 - **typecheck** — `mypy --strict src/navi_sanitize/`
 - **test** — pytest across Python 3.12 + 3.13, `--benchmark-disable`
-- **build** — gates on all three above; builds wheel, smoke-tests public API, uploads artifact
+- **security** — `pip-audit` dependency vulnerability scan
+- **build** — gates on all four above; builds wheel, smoke-tests public API, uploads artifact
+
+Additional security workflows: Semgrep SAST, CodeQL (`python` + `actions`), OpenSSF Scorecard.
 
 Benchmarks run via manual dispatch only (`.github/workflows/benchmark.yml`).
 
@@ -96,7 +99,7 @@ Each stage returns `(cleaned_string, changed: bool)`. Stages have no side effect
 ## Gotchas
 
 - **`ruff` rules `RUF001`/`RUF003`** fire on intentional Cyrillic/Greek/Armenian/Cherokee in test and data files — use `# ruff: noqa: RUF001, RUF003` or `# ruff: noqa: RUF003` at top of those files
-- **Tag block range starts at `U+E0001`**, not `U+E0000`
+- **Tag block range starts at `U+E0000`** (includes the deprecated LANGUAGE TAG character)
 - **pytest-benchmark `pedantic()`** required for large payloads (100KB) — standard mode runs too many iterations
 - **No CLI, no config files, no framework dependencies** — this is a library only
 - **No LLM prompt escaper** — vendor syntax moves too fast; pluggable design lets users build their own
diff --git a/pyproject.toml b/pyproject.toml
@@ -18,6 +18,21 @@ authors = [
 maintainers = [
     { name = "Nelson Spence" }
 ]
+classifiers = [
+    "Development Status :: 3 - Alpha",
+    "Intended Audience :: Developers",
+    "License :: OSI Approved :: MIT License",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
+    "Topic :: Security",
+    "Topic :: Text Processing :: Filters",
+    "Typing :: Typed",
+]
+
+[project.urls]
+Repository = "https://github.com/Project-Navi/navi-sanitize"
+Issues = "https://github.com/Project-Navi/navi-sanitize/issues"
 
 [dependency-groups]
 dev = [
diff --git a/src/navi_sanitize/_homoglyphs.py b/src/navi_sanitize/_homoglyphs.py
@@ -58,7 +58,10 @@
     "\u03a7": "X",
     # Greek → Latin (lowercase)
     "\u03b1": "a",
+    "\u03b9": "i",  # iota ι
+    "\u03bd": "v",  # nu ν
     "\u03bf": "o",
+    "\u03c1": "p",  # rho ρ
     # Typographic
     "\u2212": "-",  # minus sign
     "\u2013": "-",  # en dash
diff --git a/src/navi_sanitize/_invisible.py b/src/navi_sanitize/_invisible.py
@@ -14,6 +14,8 @@
     "\u200b",  # zero-width space
     "\u200c",  # zero-width non-joiner
     "\u200d",  # zero-width joiner
+    "\u200e",  # left-to-right mark
+    "\u200f",  # right-to-left mark
     "\u2060",  # word joiner
     "\ufeff",  # BOM / zero-width no-break space
     "\u180e",  # Mongolian vowel separator
@@ -34,14 +36,17 @@
     "\ufffc",  # object replacement character
 }
 
-# --- Variation selectors (U+FE00-U+FE0F) ---
-# Invisible modifiers that change glyph presentation.
+# --- Variation selectors ---
+# BMP range (U+FE00-U+FE0F) = VS1-VS16.
+# Supplementary range (U+E0100-U+E01EF) = VS17-VS256.
+# Both are invisible modifiers that change glyph presentation.
 VARIATION_SELECTOR_RANGE = (0xFE00, 0xFE0F)
+VARIATION_SELECTOR_SUPPLEMENT_RANGE = (0xE0100, 0xE01EF)
 
-# --- Unicode Tag block (U+E0001-U+E007F) ---
-# Encodes invisible ASCII that tokenizers read but humans can't see.
-# Used in tag smuggling attacks against LLMs.
-TAG_BLOCK_RANGE = (0xE0001, 0xE007F)
+# --- Unicode Tag block (U+E0000-U+E007F) ---
+# U+E0000 is the deprecated LANGUAGE TAG; U+E0001-U+E007F encode invisible
+# ASCII that tokenizers read but humans can't see (tag smuggling attacks).
+TAG_BLOCK_RANGE = (0xE0000, 0xE007F)
 
 # --- Bidirectional override/isolate characters ---
 # Used to reorder displayed text, hiding malicious content.
@@ -79,6 +84,12 @@
     + "-"
     + chr(TAG_BLOCK_RANGE[1])
     + "]"
+    # Variation selectors supplement (range)
+    + "|["
+    + chr(VARIATION_SELECTOR_SUPPLEMENT_RANGE[0])
+    + "-"
+    + chr(VARIATION_SELECTOR_SUPPLEMENT_RANGE[1])
+    + "]"
     # Bidi controls (individual chars)
     + "|["
     + "".join(BIDI_CONTROL_CHARS)
diff --git a/src/navi_sanitize/_pipeline.py b/src/navi_sanitize/_pipeline.py
@@ -24,11 +24,12 @@
 Escaper = Callable[[str], str]
 
 
-def _strip_null_bytes(s: str) -> tuple[str, bool]:
-    """Strip null bytes. Returns (cleaned, changed)."""
-    if "\x00" in s:
-        return s.replace("\x00", ""), True
-    return s, False
+def _strip_null_bytes(s: str) -> tuple[str, int]:
+    """Strip null bytes. Returns (cleaned, count_removed)."""
+    count = s.count("\x00")
+    if count:
+        return s.replace("\x00", ""), count
+    return s, 0
 
 
 def _strip_invisible(s: str) -> tuple[str, int]:
@@ -68,10 +69,13 @@ def clean(text: str, *, escaper: Escaper | None = None) -> str:
 
     Always returns output. Logs warnings when input is modified.
     """
+    if not isinstance(text, str):
+        raise TypeError(f"clean() requires str, got {type(text).__name__}")
+
     # Stage 1: Null bytes
-    text, had_nulls = _strip_null_bytes(text)
-    if had_nulls:
-        logger.warning("Removed null byte(s) from value")
+    text, null_count = _strip_null_bytes(text)
+    if null_count:
+        logger.warning("Removed %d null byte(s) from value", null_count)
 
     # Stage 2: Invisible characters
     text, invis_count = _strip_invisible(text)
@@ -91,6 +95,8 @@ def clean(text: str, *, escaper: Escaper | None = None) -> str:
     # Stage 5: Escaper
     if escaper is not None:
         text = escaper(text)
+        if not isinstance(text, str):
+            raise TypeError(f"Escaper must return str, got {type(text).__name__}")
 
     return text
 
@@ -107,8 +113,12 @@ def walk[T](data: T, *, escaper: Escaper | None = None) -> T:
 def _walk_inner(obj: object, *, escaper: Escaper | None = None) -> object:
     """Walk and sanitize in place on the deep-copied structure."""
     if isinstance(obj, dict):
+        new_dict: dict[object, object] = {}
         for k, v in obj.items():
-            obj[k] = _walk_inner(v, escaper=escaper)
+            clean_key = clean(k, escaper=escaper) if isinstance(k, str) else k
+            new_dict[clean_key] = _walk_inner(v, escaper=escaper)
+        obj.clear()
+        obj.update(new_dict)
         return obj
     if isinstance(obj, list):
         for i, item in enumerate(obj):
diff --git a/src/navi_sanitize/escapers/_path.py b/src/navi_sanitize/escapers/_path.py
@@ -10,6 +10,7 @@ def path_escaper(text: str) -> str:
     Strips ../ and ./ segments, leading /, and embedded .. within segments
     (which can appear when earlier pipeline stages concatenate fragments).
     """
+    text = text.replace("\\", "/")
     text = text.lstrip("/")
     parts = text.split("/")
     clean_parts: list[str] = []
diff --git a/tests/test_audit_remediation.py b/tests/test_audit_remediation.py
diff --git a/tests/test_bypass_attempts.py b/tests/test_bypass_attempts.py
diff --git a/tests/test_garbage.py b/tests/test_garbage.py