feat: add decode_evasion, detect_scripts, is_mixed_script (#2)

Fieldnote-Echo · claude · web-flow · commit 6cac6a57331c · 2026-03-01T12:44:34.000-06:00
* feat: add decode_evasion(), detect_scripts(), and is_mixed_script()

Opt-in primitives for multi-encoding evasion decode and mixed-script
detection. Core clean() pipeline is unchanged.

- decode_evasion() iteratively peels URL, HTML entity, and hex escape
  layers (up to max_layers, default 3). Never logs decoded content.
- detect_scripts() returns script buckets (latin, cyrillic, greek, etc.)
  present in text. Unknown scripts silently ignored.
- is_mixed_script() returns True when 2+ scripts are detected.
- 62 new tests (34 scripts, 28 decode) — 303 total, all passing.
- Zero new dependencies (stdlib only: html, urllib.parse, unicodedata).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* docs: add opt-in utilities to README

Document decode_evasion(), detect_scripts(), and is_mixed_script()
with usage examples and composition pattern.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* fix: preserve malformed percent-encoded bytes in decode_evasion

Use unquote_to_bytes() with surrogateescape to keep invalid UTF-8
sequences (e.g. %FF, lone %80) as literal %XX instead of replacing
them with U+FFFD. Also adds test for unnamed-but-alphabetic chars
in detect_scripts to close the codecov gap.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/README.md b/README.md
@@ -18,6 +18,8 @@ clean("price:\u200b 0")  # "price: 0" — zero-width space stripped
 clean("file\x00.txt")  # "file.txt" — null byte removed
 ```
 
+Opt-in utilities for deeper analysis: `decode_evasion()` peels nested URL/HTML/hex encodings, `detect_scripts()` and `is_mixed_script()` flag mixed-script spoofing.
+
 ## Why This Matters
 
 Untrusted text contains invisible attacks: homoglyph substitution, zero-width characters, null bytes, fullwidth encoding, template/prompt injection delimiters. These bypass validation, poison templates, and fool humans.
@@ -106,6 +108,33 @@ from navi_sanitize import walk
 spec = walk(untrusted_json)
 ```
 
+## Opt-in Utilities
+
+**These utilities are not part of `clean()` and are never run automatically.** You must call them explicitly.
+
+```python
+from navi_sanitize import decode_evasion, clean, detect_scripts, is_mixed_script, path_escaper
+
+# Double-encoded path traversal
+raw = "%252e%252e%252fetc%252fpasswd"
+
+# 1. Peel nested encodings (URL → HTML entities → hex escapes)
+peeled = decode_evasion(raw)           # "../etc/passwd"
+
+# 2. Sanitize through the universal pipeline
+cleaned = clean(peeled, escaper=path_escaper)  # "etc/passwd"
+
+# 3. Check for mixed-script spoofing (useful on raw or pre-clean input)
+if is_mixed_script(raw) or is_mixed_script(peeled):
+    flag_for_review(raw)
+```
+
+- **`decode_evasion(text, *, max_layers=3)`** — iterative URL/HTML/hex decoding; stops when a pass produces no change or `max_layers` is reached
+- **`detect_scripts(text)`** — returns script buckets present in text (`latin`, `cyrillic`, `greek`, etc.)
+- **`is_mixed_script(text)`** — `True` when 2+ scripts detected
+
+Script detection can be applied pre-clean too — most useful on raw input for phishing detection.
+
 ## Warnings
 
 The pipeline never errors. It always produces output. When it changes something, it logs a warning.
diff --git a/src/navi_sanitize/__init__.py b/src/navi_sanitize/__init__.py
@@ -6,13 +6,24 @@
 import logging
 from collections.abc import Callable
 
+from navi_sanitize._decode import decode_evasion
 from navi_sanitize._pipeline import clean, walk
+from navi_sanitize._scripts import detect_scripts, is_mixed_script
 from navi_sanitize.escapers import jinja2_escaper, path_escaper
 
 Escaper = Callable[[str], str]
 
 __version__ = "0.1.0"
-__all__ = ["Escaper", "clean", "jinja2_escaper", "path_escaper", "walk"]
+__all__ = [
+    "Escaper",
+    "clean",
+    "decode_evasion",
+    "detect_scripts",
+    "is_mixed_script",
+    "jinja2_escaper",
+    "path_escaper",
+    "walk",
+]
 
 # Library logging best practice: NullHandler
 logging.getLogger("navi_sanitize").addHandler(logging.NullHandler())
diff --git a/src/navi_sanitize/_decode.py b/src/navi_sanitize/_decode.py
@@ -0,0 +1,97 @@
+# SPDX-License-Identifier: MIT
+"""Multi-encoding evasion decoder for untrusted text.
+
+Opt-in pre-processor — not part of the default ``clean()`` pipeline.
+Callers compose it with ``clean()`` explicitly::
+
+    text = decode_evasion(user_input)       # peel encoding layers
+    cleaned = clean(text, escaper=...)      # sanitize
+
+Iteratively decodes URL encoding, HTML entities, and hex escapes
+(``\\xHH``). Stops when a full pass produces no changes or
+*max_layers* is reached.
+"""
+
+from __future__ import annotations
+
+import html
+import logging
+import re
+import urllib.parse
+
+logger = logging.getLogger("navi_sanitize")
+
+MAX_DECODE_LAYERS: int = 3
+
+_HEX_RE = re.compile(r"\\x([0-9a-fA-F]{2})")
+
+
+def _decode_url(s: str) -> str:
+    """Decode URL percent-encoding, preserving malformed byte sequences.
+
+    Valid UTF-8 percent-encoded sequences decode normally.
+    Invalid byte sequences (e.g. ``%FF``, lone ``%80``) are re-emitted
+    as percent-encoded text (normalized to uppercase ``%XX``) instead
+    of being replaced with U+FFFD.
+    """
+    raw = urllib.parse.unquote_to_bytes(s)
+    # Decode valid UTF-8; map invalid bytes to surrogates so we can
+    # re-encode them back to %XX in the next step.
+    text = raw.decode("utf-8", errors="surrogateescape")
+    # Re-encode any lone surrogates (from invalid bytes) back to %XX
+    parts: list[str] = []
+    for ch in text:
+        if "\udc80" <= ch <= "\udcff":
+            parts.append(f"%{ord(ch) & 0xFF:02X}")
+        else:
+            parts.append(ch)
+    return "".join(parts)
+
+
+def _decode_html_entities(s: str) -> str:
+    """Decode HTML/XML character entities."""
+    return html.unescape(s)
+
+
+def _decode_hex_escapes(s: str) -> str:
+    r"""Decode literal ``\xHH`` escape sequences."""
+
+    def _replace(m: re.Match[str]) -> str:
+        return chr(int(m.group(1), 16))
+
+    return _HEX_RE.sub(_replace, s)
+
+
+def decode_evasion(text: str, *, max_layers: int = MAX_DECODE_LAYERS) -> str:
+    """Iteratively decode nested encodings from *text*.
+
+    Runs URL decoding, HTML entity unescaping, and hex escape decoding
+    in sequence as a single pass. A pass counts as one layer if the
+    output differs from the input. Stops when a pass produces no
+    changes or *max_layers* is reached.
+
+    Logs a warning with the layer count when decoding occurs. Never
+    includes decoded content in log messages.
+
+    Never errors on invalid or partial encodings — they pass through
+    unchanged.
+    """
+    if max_layers <= 0:
+        return text
+
+    layers = 0
+    for _ in range(max_layers):
+        # Run all three decoders in sequence (one pass)
+        decoded = _decode_url(text)
+        decoded = _decode_html_entities(decoded)
+        decoded = _decode_hex_escapes(decoded)
+        # "changed" = output differs from input for this pass
+        if decoded == text:
+            break
+        text = decoded
+        layers += 1
+
+    if layers:
+        logger.warning("Decoded %d encoding layer(s) from value", layers)
+
+    return text
diff --git a/src/navi_sanitize/_scripts.py b/src/navi_sanitize/_scripts.py
@@ -0,0 +1,63 @@
+# SPDX-License-Identifier: MIT
+"""Mixed-script detection for untrusted text.
+
+Opt-in analysis primitive — no transformation, no blocking. Callers use the
+results to decide whether to warn, block, or confirm. Not part of the default
+``clean()`` pipeline.
+
+Only known script buckets are returned; characters whose Unicode name doesn't
+match any known prefix are silently ignored.
+"""
+
+from __future__ import annotations
+
+import unicodedata
+
+# First token of unicodedata.name() → bucket
+_SCRIPT_PREFIXES: dict[str, str] = {
+    "LATIN": "latin",
+    "CYRILLIC": "cyrillic",
+    "GREEK": "greek",
+    "ARABIC": "arabic",
+    "HEBREW": "hebrew",
+    "ARMENIAN": "armenian",
+    "CHEROKEE": "cherokee",
+    "CJK": "cjk",
+    "HIRAGANA": "cjk",
+    "KATAKANA": "cjk",
+    "HANGUL": "cjk",
+}
+
+
+def detect_scripts(text: str) -> set[str]:
+    """Return the set of script buckets present in *text*.
+
+    Only alphabetic characters are considered; digits, punctuation, emoji,
+    and characters with no Unicode name are skipped. Unknown scripts (not in
+    the prefix map) are silently ignored.
+
+    Buckets: ``latin``, ``cyrillic``, ``greek``, ``arabic``, ``hebrew``,
+    ``armenian``, ``cherokee``, ``cjk`` (covers CJK Unified, Hiragana,
+    Katakana, and Hangul).
+    """
+    scripts: set[str] = set()
+    for ch in text:
+        if not ch.isalpha():
+            continue
+        name = unicodedata.name(ch, "")
+        if not name:
+            continue
+        head = name.split(" ", 1)[0]
+        bucket = _SCRIPT_PREFIXES.get(head)
+        if bucket is not None:
+            scripts.add(bucket)
+    return scripts
+
+
+def is_mixed_script(text: str) -> bool:
+    """Return ``True`` if *text* contains characters from two or more scripts.
+
+    Non-alphabetic characters (digits, punctuation, emoji) are not counted,
+    so ``"hello 123"`` is *not* considered mixed.
+    """
+    return len(detect_scripts(text)) >= 2
diff --git a/tests/test_audit_remediation.py b/tests/test_audit_remediation.py
@@ -348,6 +348,9 @@ def test_public_exports(self) -> None:
         assert hasattr(navi_sanitize, "__version__")
         assert set(navi_sanitize.__all__) == {
             "clean",
+            "decode_evasion",
+            "detect_scripts",
+            "is_mixed_script",
             "walk",
             "jinja2_escaper",
             "path_escaper",
diff --git a/tests/test_decode.py b/tests/test_decode.py
diff --git a/tests/test_scripts.py b/tests/test_scripts.py