# Source: keyhog/.keyhog.toml.example at main (santhsecurity/keyhog on GitHub)

# KeyHog Configuration File
# Place this file as `.keyhog.toml` in your project root
# All options are optional - KeyHog uses sensible defaults

# ============================================================================
# BASIC OPTIONS
# ============================================================================

# Path to the directory containing detector TOML files (default: "detectors").
detectors = "detectors"

# Minimum severity to report. One of: info, low, medium, high, critical.
severity = "medium"

# Output format. One of: text, json, jsonl, sarif.
format = "text"

# Number of parallel scanning threads (default: number of CPU cores).
# NOTE: the value below pins the count to 8; delete or comment out this line
# to fall back to the default.
threads = 8

# Deduplication scope. One of: credential, file, none.
dedup = "credential"

# ============================================================================
# SCANNING MODES
# ============================================================================

# Fast mode: pattern matching only, maximum speed.
# Use for pre-commit hooks and quick scans.
# WARNING: fast mode (CLI: --fast) disables decode + entropy + ML all at once,
# which is the largest detection blind spot keyhog ships. Encoded secrets
# (base64, hex, URL), novel high-entropy strings, and ML-only catches will
# all be missed.
# REFUSED in --lockdown mode (kimi-wave3 §5).
fast = false

# Deep mode: all features enabled, maximum recall.
# Use for security audits and CI/CD pipelines.
deep = false

# Skip decode-through scanning (base64, hex, URL decoding).
# WARNING: encoded secrets like base64('AKIA…') will slip through.
# REFUSED in --lockdown mode.
no_decode = false

# Skip entropy-based detection for unknown patterns.
# WARNING: entropy detection is the only catch for novel / unknown
# high-entropy secrets; with it skipped, they will be missed.
# REFUSED in --lockdown mode.
no_entropy = false

# ============================================================================
# ADVANCED ENCODING OPTIONS
# ============================================================================

# Maximum depth for recursive decoding (1-10, default: 10).
# - 2: fast, basic detection
# - 4: catches double/triple encoding
# - 8: catches sophisticated evasion
# - 10: maximum (default), catches state-level adversaries
decode_depth = 10

# Maximum file size for decode-through scanning (default: 512KB).
# Files larger than this skip encoding detection for performance.
# Accepts size strings, e.g. "32KB", "512KB", "1MB", "10MB", "100MB".
decode_size_limit = "512KB"

# ============================================================================
# ENTROPY DETECTION OPTIONS
# ============================================================================

# Enable entropy scanning in source code files (.py, .js, .go, etc.).
# Disabled by default to reduce false positives in code.
entropy_source_files = false

# Entropy threshold in bits per byte (default: 4.5).
# - 3.5: aggressive (more findings, more false positives)
# - 4.5: balanced (default)
# - 5.5: conservative (fewer findings, fewer false positives)
entropy_threshold = 4.5

# Admit values anchored by a credential keyword (PASSWORD=, *_PASS=, secret:,
# api_key= ...) at a far lower entropy floor than the bare generic gate. The
# keyword is the evidence; precision is carried by the ML model + shape
# filters. This is what surfaces real-world low-entropy config passwords (the
# dominant credential shape in real corpora).
# Set to false (CLI: --no-keyword-low-entropy) to restore the
# high-entropy-only generic gate: higher precision, far lower real-world
# recall. Default: true.
generic_keyword_low_entropy = true

# ============================================================================
# CONFIDENCE & ML OPTIONS
# ============================================================================

# Minimum confidence score to report findings (0.0 - 1.0, default: 0.40).
# Bench-tuned on the SecretBench mirror for maximum F1. NOTE: precision is
# currently non-monotonic in this floor around 0.50 (a scan-time/ML
# entanglement, tracked in backlog DET-08), so a higher floor does not
# always mean higher precision; 0.40 is the measured optimum.
# - 0.30: aggressive (more findings, more false positives)
# - 0.40: balanced (default)
# - 0.85: conservative (fewer findings, fewer false positives)
min_confidence = 0.40

# Disable ML-based confidence scoring.
# WARNING: increases false positives and hurts recall on hand-crafted
# near-misses. REFUSED in --lockdown mode.
no_ml = false

# ML weight for confidence scoring (0.0-1.0, default: 0.5).
# Higher values trust ML more; lower values favor heuristics.
ml_weight = 0.5

# ============================================================================
# UNICODE & SECURITY OPTIONS
# ============================================================================

# Disable Unicode normalization (NOT recommended).
# Keeping normalization enabled (no_unicode_norm = false) prevents homoglyph
# and evasion attacks where an attacker uses Cyrillic/Greek lookalikes
# (e.g. `аpі_key` with Cyrillic а) to evade detection.
# REFUSED in --lockdown mode.
no_unicode_norm = false

# ============================================================================
# VERIFICATION OPTIONS (requires --verify feature)
# ============================================================================

# Verify discovered credentials via API calls.
# WARNING: enabling this sends every detected secret to a third-party
# HTTPS endpoint. Requests go only to domains on the per-detector allowlist
# or the builtin service map (see `verifier::domain_allowlist`). Use only in
# isolated environments, and never where CI logs capture verifier output.
# REFUSED in --lockdown mode.
verify = false

# Verification timeout, in seconds.
timeout = 5

# Maximum concurrent verification requests per service.
rate = 5

# ============================================================================
# GIT OPTIONS
# ============================================================================

# Maximum number of git commits to traverse in a history scan.
max_commits = 1000

# ============================================================================
# OUTPUT OPTIONS
# ============================================================================

# Show full credentials in output (default: redacted).
# WARNING: enabling this prints plaintext credentials to stdout, log
# files, CI artifacts, and anywhere else keyhog's output is routed. NEVER
# enable in CI, on shared hosts, or with --output pointing at a path that
# survives the run. REFUSED in --lockdown mode.
show_secrets = false

# ============================================================================
# INCREMENTAL SCAN (v0.3.0+)
# ============================================================================

# Persist a BLAKE3 Merkle index of file hashes; subsequent runs skip
# unchanged files entirely. 10–100× speedup on CI re-runs.
# CLI override: `--incremental` (with optional `--incremental-cache <path>`).
# Default cache path: `~/.cache/keyhog/merkle.idx` (platform-aware via `dirs`).
# incremental = false
# incremental_cache = "~/.cache/keyhog/merkle.idx"

# ============================================================================
# BAYESIAN CALIBRATION (v0.3.0+)
# ============================================================================

# Per-detector Beta(α, β) confidence multiplier. Update with
#   keyhog calibrate --tp <detector-id>     # record true positive
#   keyhog calibrate --fp <detector-id>     # record false positive
#   keyhog calibrate --show                  # posterior-mean bar chart
# Persisted at `~/.cache/keyhog/calibration.json`. Detectors with zero
# observations are not multiplied (no penalty on a fresh install).

# ============================================================================
# DIFF-AWARE SEVERITY (v0.3.0+)
# ============================================================================

# When scanning git history (`--git-history`), findings whose blob OID is
# reachable from HEAD keep their detector's declared severity (live leak).
# Findings only present in older commits drop one tier (Critical → High,
# etc.). No config knob — always on for git sources.

# ============================================================================
# WATCH DAEMON (v0.3.0+)
# ============================================================================

# `keyhog watch <path>` starts a notify-based file watcher and re-scans
# any modified file with the same compiled detector corpus. Sub-100ms
# detection on file save. No config knob — invoke as a separate command.

# ============================================================================
# NESTED TABLES  (must come AFTER every flat key above — TOML rule)
# ============================================================================
# All flat keys above also have a nested equivalent. Use either shape; if
# both forms set the same key, the flat form wins. Tables go last because
# TOML assigns every key written after a [table] header to that table.
# Full reference: docs/src/reference/configuration.md

# [scan] — runtime scan policy (mirrors the flat scalars above).
# [scan]
# severity = "high"
# min_confidence = 0.40
# format = "sarif"
# exclude = ["**/test/fixtures/**", "vendor/"]
# threads = 8
# dedup = "credential"

# [detector.<id>] — per-detector overrides keyed by detector id
# (`keyhog detectors` lists ids; `keyhog explain <id>` shows one).
#   enabled = false          drops the detector from the corpus entirely
#   min_confidence = <0..1>  per-detector floor; OVERRIDES the global
#                            min_confidence for findings from this detector.
# Compiled Tier-A floors ship in the binary (config.rs SHIPPED_DETECTOR_FLOORS)
# and apply even with no .keyhog.toml; a file entry here overrides the
# compiled value for that id. (Tier-B = the per-detector `min_confidence`
# inside each detector TOML under detectors/.)
# [detector.generic-api-key]
# enabled = false
# [detector.twilio-api-key]
# min_confidence = 0.6

# [lockdown] — refuse to run unless the operator passes --lockdown.
# A repo that demands maximum-security scanning sets this so a plain
# `keyhog scan` fails closed rather than silently running unhardened.
# [lockdown]
# require = true