Skip to content

Commit dc2648a

Browse files
committed
DetTrace: add replay failure similarity search
1 parent e795fc9 commit dc2648a

5 files changed

Lines changed: 197 additions & 0 deletions

File tree

README.md

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2238,3 +2238,31 @@ Example:
22382238
python3 regression_intelligence/build_regression_radar.py --build candidate_42 --signals retry_storm,timeout_chain,duplicate_retry_window,config_read_timeout
22392239
22402240
Safe scope: heuristic replay-based build regression analysis for diagnostics review. This does not claim production release automation, CI ownership, hardware-lab testing, drivers, firmware, kernel, or real hardware emulation.
2241+
2242+
---
2243+
2244+
## Failure Similarity Search
2245+
2246+
DetTrace includes replay failure-similarity search under `failure_similarity/`.
2247+
2248+
The search compares a new trace against the historical failure patterns catalogued in `failure_similarity/failure_patterns.json`:
2249+
2250+
- timeout
2251+
- disconnect
2252+
- retry storm
2253+
- state corruption
2254+
- enumeration failure
2255+
2256+
Output includes:
2257+
2258+
- most similar failure family
2259+
- similarity score (fraction of the pattern's signal keywords found in the trace, 0.0–1.0)
2260+
- confidence score (heuristic: `0.60 + 0.35 × similarity`, capped at 0.99; 0.0 when nothing matched)
2261+
- matched evidence
2262+
- likely root cause
2263+
2264+
Example:
2265+
2266+
python3 failure_similarity/search_similar_failures.py io_transport_validation/timeout_retry_chain.json
2267+
2268+
Safe scope: heuristic replay failure-similarity search for diagnostics review. This does not claim ML, production incident ranking, hardware-lab testing, drivers, firmware, kernel, or real hardware emulation.
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
{
2+
"safe_claim": "heuristic replay failure-similarity search; not ML or production incident ranking",
3+
"patterns": [
4+
{
5+
"id": "timeout_pattern",
6+
"family": "timeout",
7+
"signals": ["timeout", "deadline_exceeded", "ack_missing", "missed_deadline"],
8+
"likely_root_cause": "timeout_chain"
9+
},
10+
{
11+
"id": "disconnect_pattern",
12+
"family": "disconnect",
13+
"signals": ["disconnect", "reconnect", "session_ready", "device_ready"],
14+
"likely_root_cause": "reconnect_ordering_issue"
15+
},
16+
{
17+
"id": "retry_storm_pattern",
18+
"family": "retry_storm",
19+
"signals": ["retry_storm", "retry_send", "duplicate_retry_window", "bounded_retry"],
20+
"likely_root_cause": "unbounded_or_duplicate_retry"
21+
},
22+
{
23+
"id": "state_corruption_pattern",
24+
"family": "state_corruption",
25+
"signals": ["stale_device_state", "stale_session", "state_refresh", "checksum_error"],
26+
"likely_root_cause": "stale_or_corrupt_state"
27+
},
28+
{
29+
"id": "enumeration_failure_pattern",
30+
"family": "enumeration_failure",
31+
"signals": ["config_read_timeout", "missing_bar_assignment", "interrupt_route_missing"],
32+
"likely_root_cause": "enumeration_sequence_failure"
33+
}
34+
]
35+
}
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
{
2+
"input_trace": "io_transport_validation/timeout_retry_chain.json",
3+
"safe_claim": "heuristic replay failure-similarity search; not ML or production incident ranking",
4+
"most_similar_failure": {
5+
"pattern_id": "timeout_pattern",
6+
"failure_family": "timeout",
7+
"similarity": 0.25,
8+
"confidence": 0.69,
9+
"matched_evidence": [
10+
"timeout"
11+
],
12+
"likely_root_cause": "timeout_chain"
13+
},
14+
"top_matches": [
15+
{
16+
"pattern_id": "timeout_pattern",
17+
"failure_family": "timeout",
18+
"similarity": 0.25,
19+
"confidence": 0.69,
20+
"matched_evidence": [
21+
"timeout"
22+
],
23+
"likely_root_cause": "timeout_chain"
24+
},
25+
{
26+
"pattern_id": "retry_storm_pattern",
27+
"failure_family": "retry_storm",
28+
"similarity": 0.25,
29+
"confidence": 0.69,
30+
"matched_evidence": [
31+
"retry_send"
32+
],
33+
"likely_root_cause": "unbounded_or_duplicate_retry"
34+
}
35+
]
36+
}
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# Failure Similarity Search Report
2+
3+
## Safe claim
4+
5+
heuristic replay failure-similarity search; not ML or production incident ranking
6+
7+
## Most similar failure
8+
9+
- family: `timeout`
10+
- similarity: `0.25`
11+
- confidence: `0.69`
12+
- likely root cause: `timeout_chain`
13+
- evidence: `['timeout']`
14+
15+
## Top matches
16+
17+
- `timeout` similarity=`0.25` confidence=`0.69` evidence=`['timeout']` root_cause=`timeout_chain`
18+
- `retry_storm` similarity=`0.25` confidence=`0.69` evidence=`['retry_send']` root_cause=`unbounded_or_duplicate_retry`
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
#!/usr/bin/env python3
2+
import argparse
3+
import json
4+
from pathlib import Path
5+
6+
# Resolve the data files relative to this script instead of the current
# working directory, so the tool works no matter where it is invoked from.
# NOTE(review): assumes this script lives in failure_similarity/ next to
# failure_patterns.json (as the README example invokes it) -- confirm.
_BASE_DIR = Path(__file__).resolve().parent

# Catalogue of known failure patterns (input).
PATTERNS = _BASE_DIR / "failure_patterns.json"
# Machine-readable report written by main() (output).
REPORT_JSON = _BASE_DIR / "failure_similarity_report.json"
# Human-readable Markdown report written by main() (output).
REPORT_MD = _BASE_DIR / "failure_similarity_report.md"
9+
10+
11+
def text_for(path):
    """Return the lower-cased text content of the file at *path*.

    The file is decoded as UTF-8 explicitly (rather than with the
    platform's locale encoding) so results are the same on every machine.
    Undecodable bytes are replaced instead of raising, because the text is
    only used for case-insensitive substring matching and a stray byte in
    a trace artifact should not abort the search.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    raw = Path(path).read_text(encoding="utf-8", errors="replace")
    return raw.lower()
13+
14+
15+
def score_pattern(trace_text, pattern):
    """Score how well one failure pattern matches a trace.

    Args:
        trace_text: Trace content to search. Signals are lower-cased before
            the substring test, so this text should already be lower-case
            (``text_for`` returns it that way).
        pattern: Mapping with the keys ``id``, ``family``, ``signals``
            (list of keyword strings) and ``likely_root_cause``.

    Returns:
        A dict with ``pattern_id``, ``failure_family``, ``similarity``
        (fraction of signals found, rounded to 2 decimals), ``confidence``
        (``0.60 + 0.35 * similarity`` capped at 0.99, or 0.0 with no hits),
        ``matched_evidence`` (the signals found, in catalogue order) and
        ``likely_root_cause``.
    """
    signals = pattern["signals"]
    hits = [signal for signal in signals if signal.lower() in trace_text]

    # A pattern with no signals can never match; score it 0.0 instead of
    # dividing by zero.
    score = round(len(hits) / len(signals), 2) if signals else 0.0

    # Any hit starts at a 0.60 floor and scales with similarity; no hits
    # means no confidence at all.
    confidence = round(min(0.99, 0.60 + score * 0.35), 2) if hits else 0.0

    return {
        "pattern_id": pattern["id"],
        "failure_family": pattern["family"],
        "similarity": score,
        "confidence": confidence,
        "matched_evidence": hits,
        "likely_root_cause": pattern["likely_root_cause"],
    }
27+
28+
29+
def _render_markdown(report):
    """Render the similarity report dict as the Markdown summary text."""
    best = report["most_similar_failure"]
    top_lines = "\n".join(
        f"- `{m['failure_family']}` similarity=`{m['similarity']}` confidence=`{m['confidence']}` evidence=`{m['matched_evidence']}` root_cause=`{m['likely_root_cause']}`"
        for m in report["top_matches"]
    )
    return (
        "# Failure Similarity Search Report\n\n"
        "## Safe claim\n\n"
        f"{report['safe_claim']}\n\n"
        "## Most similar failure\n\n"
        f"- family: `{best['failure_family']}`\n"
        f"- similarity: `{best['similarity']}`\n"
        f"- confidence: `{best['confidence']}`\n"
        f"- likely root cause: `{best['likely_root_cause']}`\n"
        f"- evidence: `{best['matched_evidence']}`\n\n"
        "## Top matches\n\n"
        + top_lines
        + "\n"
    )


def main():
    """Score a trace against the pattern catalogue and write the reports.

    Reads the trace path from the command line, scores it against every
    pattern in ``PATTERNS``, writes the result to ``REPORT_JSON`` and
    ``REPORT_MD``, and prints the JSON report to stdout.
    """
    parser = argparse.ArgumentParser(description="Search similar DetTrace replay failure patterns")
    parser.add_argument("trace", help="Path to trace or replay artifact")
    args = parser.parse_args()

    trace_text = text_for(args.trace)
    patterns = json.loads(PATTERNS.read_text(encoding="utf-8"))["patterns"]

    scored = [score_pattern(trace_text, p) for p in patterns]
    # Filter on actual evidence, not on the rounded similarity: one hit in a
    # pattern with very many signals rounds to 0.0 but is still a match.
    matches = [m for m in scored if m["matched_evidence"]]
    # Best first: confidence, then similarity, then amount of evidence. The
    # sort is stable, so ties keep catalogue order.
    matches.sort(
        key=lambda m: (m["confidence"], m["similarity"], len(m["matched_evidence"])),
        reverse=True,
    )

    # Placeholder entry used when nothing in the catalogue matched.
    best = matches[0] if matches else {
        "pattern_id": "none",
        "failure_family": "unknown",
        "similarity": 0.0,
        "confidence": 0.0,
        "matched_evidence": [],
        "likely_root_cause": "unknown",
    }

    report = {
        "input_trace": args.trace,
        "safe_claim": "heuristic replay failure-similarity search; not ML or production incident ranking",
        "most_similar_failure": best,
        "top_matches": matches[:5],
    }

    report_json = json.dumps(report, indent=2)
    # Explicit UTF-8 so the written reports do not depend on the locale.
    REPORT_JSON.write_text(report_json, encoding="utf-8")
    REPORT_MD.write_text(_render_markdown(report), encoding="utf-8")

    print(report_json)


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)