Skip to content

Commit a18c606

Browse files
author
Perseus Hermes
committed
fix(#57228): idle watchdog self-terminates orphaned stdio MCP servers
A long-lived Hermes worker reconnects its MCP client per turn but leaks the write end of each prior stdio child's pipe, so those children never receive EOF and block forever in reader.lines(). One orphan leaks per reconnect until SQLite handle contention makes the vault appear 'down' (~50 procs / 1.4GB observed).

Rewrite run_server() into a reader thread plus a recv_timeout loop. A stdio server with zero client traffic for MIMIR_IDLE_TIMEOUT_SECS (default 600, 0 = disable) self-terminates and frees its DB handle. Active clients always send a request within the window and are unaffected; orphans self-reap. This fixes the leak at the source instead of relying on an external process reaper.

- parse_idle_timeout() extracted and unit-tested (idle_timeout_parsing_*)
- runtime-verified: idle exits on schedule, active survives, =0 disables
1 parent 6de3e11 commit a18c606

1 file changed

Lines changed: 86 additions & 3 deletions

File tree

src/mcp.rs

Lines changed: 86 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,20 +48,82 @@ impl MCPState {
4848
}
4949
}
5050

51+
/// Translate the raw `MIMIR_IDLE_TIMEOUT_SECS` value into the idle-watchdog window.
///
/// * `None` (variable unset), or text that does not parse as a `u64` once
///   surrounding whitespace is trimmed, yields the 600-second default.
/// * A parsed value of `0` yields `None`, meaning the watchdog is disabled.
/// * Any other parsed value `N` yields `Some` of an `N`-second duration.
///
/// Kept separate from `run_server` so the orphan-leak guard (#57228) is unit-tested.
pub fn parse_idle_timeout(raw: Option<&str>) -> Option<std::time::Duration> {
    // Fallback window applied whenever the value is missing or malformed.
    const DEFAULT_IDLE_SECS: u64 = 600;

    let secs = raw
        .and_then(|text| text.trim().parse::<u64>().ok())
        .unwrap_or(DEFAULT_IDLE_SECS);

    // Zero is the explicit "off" switch; every other value is a real timeout.
    if secs == 0 {
        None
    } else {
        Some(std::time::Duration::from_secs(secs))
    }
}
68+
5169
/// Run the MCP server loop: read JSON-RPC from stdin, write responses to stdout.
5270
///
5371
/// Takes `Arc<Database>` (#402) so main.rs can hand the SAME pooled Database
5472
/// to the web dashboard / gRPC surfaces instead of each opening a second
5573
/// `Database` (a second 16-conn pool) on the same file.
5674
pub fn run_server(db: std::sync::Arc<Database>) {
57-
let stdin = std::io::stdin();
5875
let mut stdout = std::io::stdout();
59-
let reader = BufReader::new(stdin.lock());
6076
let state = MCPState::new();
6177

78+
// Idle watchdog (fixes NousResearch/hermes-agent#57228 from the server side).
79+
//
80+
// A stdio MCP server that receives ZERO traffic for `idle_timeout` is, by
81+
// definition, an abandoned/orphaned child: its client (a long-lived Hermes
82+
// worker) reconnected and leaked the write-end of this pipe, so we will never
83+
// see EOF and would otherwise block in the read forever — accumulating one
84+
// orphan per reconnect until SQLite handle contention makes the vault appear
85+
// "down". An ACTIVE client always issues a tools/call (or at least a ping)
86+
// well within the window, so it is never affected; an orphan self-terminates
87+
// and frees its DB handle. Override with MIMIR_IDLE_TIMEOUT_SECS (0 disables).
88+
let idle_timeout: Option<std::time::Duration> =
89+
parse_idle_timeout(std::env::var("MIMIR_IDLE_TIMEOUT_SECS").ok().as_deref());
90+
91+
// Read stdin on a dedicated thread so the main loop can time out on silence.
92+
let (tx, rx) = std::sync::mpsc::channel::<std::io::Result<String>>();
93+
std::thread::spawn(move || {
94+
let stdin = std::io::stdin();
95+
let reader = BufReader::new(stdin.lock());
96+
for line in reader.lines() {
97+
// If the main loop has exited (idle timeout), the receiver is dropped
98+
// and send() errors — stop reading and let this thread end.
99+
if tx.send(line).is_err() {
100+
break;
101+
}
102+
}
103+
// EOF: closing tx makes the main loop's recv return Disconnected.
104+
});
105+
62106
eprintln!("mimir: MCP server ready");
63107

64-
for line in reader.lines() {
108+
loop {
109+
let line = match idle_timeout {
110+
Some(timeout) => match rx.recv_timeout(timeout) {
111+
Ok(l) => l,
112+
Err(std::sync::mpsc::RecvTimeoutError::Timeout) => {
113+
eprintln!(
114+
"mimir: no client activity for {}s — exiting idle stdio server (orphan-leak guard, #57228)",
115+
timeout.as_secs()
116+
);
117+
break;
118+
}
119+
Err(std::sync::mpsc::RecvTimeoutError::Disconnected) => break,
120+
},
121+
None => match rx.recv() {
122+
Ok(l) => l,
123+
Err(_) => break,
124+
},
125+
};
126+
65127
let line = match line {
66128
Ok(l) => l,
67129
Err(e) => {
@@ -4127,4 +4189,25 @@ mod tests {
41274189

41284190
let _ = fs::remove_file(db_path);
41294191
}
4192+
4193+
#[test]
fn idle_timeout_parsing_covers_orphan_guard_cases() {
    use std::time::Duration;

    // Each row pairs a raw env value with the watchdog window it must produce.
    let cases: [(Option<&str>, Option<Duration>); 5] = [
        // Unset -> 10-minute default (guard ON).
        (None, Some(Duration::from_secs(600))),
        // Explicit "0" -> disabled (guard OFF, for interactive/debug use).
        (Some("0"), None),
        // Explicit value -> honored.
        (Some("30"), Some(Duration::from_secs(30))),
        // Whitespace tolerated.
        (Some(" 120 "), Some(Duration::from_secs(120))),
        // Garbage -> safe default, never panics.
        (Some("banana"), Some(Duration::from_secs(600))),
    ];

    for &(raw, expected) in cases.iter() {
        assert_eq!(parse_idle_timeout(raw), expected, "input: {:?}", raw);
    }
}
41304213
}

0 commit comments

Comments
 (0)