From 655560e2e561c41a6ec98a08d4c99da55ed5a717 Mon Sep 17 00:00:00 2001 From: Stefan Steiner Date: Sun, 7 Jun 2026 11:11:41 -0700 Subject: [PATCH 1/2] fix(daemon): stop redundant off-base daemon after concurrent cold-start MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When two MCP clients cold-start simultaneously, both scan and find the base port free, both call spawn_detached(base). One daemon wins the bind (stays on base); the other fails bind and exits. A third (or the same race victim) scans again, finds base now occupied, lands on base+1 — and writes daemon.json overwriting the base-port daemon's file, producing two live daemons on adjacent ports and doubling the idle hyperd CPU overhead. Fix: after wait_for_daemon() returns a daemon on a non-base port, re-scan the ports below it. If a lower-port daemon is live, STOP the off-base daemon (best-effort) and adopt the base-port one instead. The lower-port daemon wins because it bound the socket first and is the canonical single instance. --- hyperdb-mcp/src/daemon/spawn.rs | 25 ++++++++++++++++++++++- hyperdb-mcp/tests/daemon_tests.rs | 34 +++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 1 deletion(-) diff --git a/hyperdb-mcp/src/daemon/spawn.rs b/hyperdb-mcp/src/daemon/spawn.rs index bda2a43..b88efb5 100644 --- a/hyperdb-mcp/src/daemon/spawn.rs +++ b/hyperdb-mcp/src/daemon/spawn.rs @@ -40,7 +40,30 @@ pub fn ensure_daemon(scan: PortScan) -> io::Result { ScanOutcome::FreePort(port) => { info!(port, "no running daemon detected, spawning on free port"); spawn_detached(port)?; - wait_for_daemon() + let info = wait_for_daemon()?; + // If the daemon we just spawned bound a port above the scan base (because + // concurrent clients raced and one of them grabbed the base port first), + // prefer the lower-port daemon so we don't accumulate redundant + // daemon+hyperd pairs on adjacent ports. 
The lower-port daemon wins + // because it bound first and is the canonical single instance. + if info.health_port > scan.base { + let lower_scan = PortScan { + base: scan.base, + span: info.health_port.saturating_sub(scan.base), + }; + if let ScanOutcome::Found(lower_info) = discovery::scan_for_daemon(lower_scan) { + debug!( + prefer_port = lower_info.health_port, + stop_port = info.health_port, + "found lower-port daemon from concurrent spawn; stopping off-base daemon" + ); + // Best-effort STOP — if it fails the off-base daemon idles + // harmlessly (it has no clients and will only cost background CPU). + let _ = super::health::send_command(info.health_port, "STOP"); + return Ok(*lower_info); + } + } + Ok(info) } ScanOutcome::AllOccupied => Err(io::Error::new( io::ErrorKind::AddrInUse, diff --git a/hyperdb-mcp/tests/daemon_tests.rs b/hyperdb-mcp/tests/daemon_tests.rs index 60c0331..26ddabf 100644 --- a/hyperdb-mcp/tests/daemon_tests.rs +++ b/hyperdb-mcp/tests/daemon_tests.rs @@ -753,6 +753,40 @@ fn discover_finds_live_daemon() { assert_eq!(discovered.health_port, port); } +// ─── Unit tests: concurrent-spawn dedup (no env vars, safe parallel) ──────────── + +#[test] +fn scan_for_daemon_prefers_lower_port_daemon() { + // Simulates the concurrent-spawn race: two daemons ended up on adjacent ports + // (base and base+1). The client that landed on base+1 should prefer the + // base-port daemon when it re-scans the lower range. + // + // We model this by starting two real health listeners (which answer PING with + // the identifying token), then asserting that scan_for_daemon with a 2-port + // range returns the LOWER port's daemon as Found. + let (lower_port, _lower_handle, _lower_state) = start_health_listener(); + let (higher_port, _higher_handle, _higher_state) = start_health_listener(); + + // Make sure lower_port < higher_port for a predictable result. 
+ let (base, _top) = if lower_port < higher_port { + (lower_port, higher_port) + } else { + (higher_port, lower_port) + }; + + let scan = PortScan { base, span: 2 }; + + match discovery::scan_for_daemon(scan) { + discovery::ScanOutcome::Found(info) => { + assert_eq!( + info.health_port, base, + "scan should return the LOWEST-port daemon (the first one bound)" + ); + } + other => panic!("expected Found, got {other:?}"), + } +} + // ─── Unit tests: Version takeover decision (no env vars, safe parallel) ───────── #[test] From 8de3b0e8c2ec23eeae15477cf6731877dc3d074f Mon Sep 17 00:00:00 2001 From: Stefan Steiner Date: Sun, 7 Jun 2026 15:57:55 -0700 Subject: [PATCH 2/2] fix(client): enable TCP keepalive on hyperd connections MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A long-lived idle connection to hyperd that goes half-open (laptop sleep, network blip, or a hyperd that vanished without a FIN) had no way to be detected: the next blocking read would hang until the OS default keepalive idle timeout (7200s / 2h on macOS and Linux). Because the MCP serializes all tool calls behind a single engine mutex with no per-op timeout, one such stalled connection makes EVERY tool call — including `status` — appear to hang indefinitely. This became materially more likely in 0.5.0, where the daemon went resident-by-default: connections now live indefinitely across laptop suspends instead of being torn down by the old 30-minute idle shutdown. Fix: set TCP keepalive (60s idle / 10s interval / 3 probes -> dead peer in ~90s) at both the sync and async TCP connect sites, alongside the existing nodelay/buffer-size socket options. Best-effort (.ok()): a kernel that rejects a knob leaves the connection at OS defaults. Probe count is Linux-only (macOS honors idle+interval). Deliberately NOT a query timeout — HyperDB runs legitimately long analytics queries, and keepalive only probes during true silence, so it can't abort live work. 
Follow-up (not in this commit): the single global engine mutex means a slow/stalled op blocks unrelated tool calls. That is an architectural change warranting its own design pass. --- hyperdb-api-core/src/client/async_client.rs | 12 ++++++++++ hyperdb-api-core/src/client/client.rs | 25 +++++++++++++++++++++ hyperdb-mcp/CHANGELOG.md | 14 ++++++++++++ 3 files changed, 51 insertions(+) diff --git a/hyperdb-api-core/src/client/async_client.rs b/hyperdb-api-core/src/client/async_client.rs index bfaaf68..acdfe3b 100644 --- a/hyperdb-api-core/src/client/async_client.rs +++ b/hyperdb-api-core/src/client/async_client.rs @@ -130,6 +130,18 @@ impl AsyncClient { let sock = socket2::SockRef::from(&tcp_stream); sock.set_recv_buffer_size(4 * 1024 * 1024).ok(); sock.set_send_buffer_size(4 * 1024 * 1024).ok(); + // TCP keepalive: detect a half-open peer (laptop sleep, network blip, + // a hyperd that vanished without a FIN) in ~90s instead of the 2h OS + // idle default. See the rationale on `super::client::apply_tcp_keepalive` + // (the sync mirror). Best-effort: a rejected knob leaves OS defaults. 
+ { + let keepalive = socket2::TcpKeepalive::new() + .with_time(std::time::Duration::from_secs(60)) + .with_interval(std::time::Duration::from_secs(10)); + #[cfg(not(any(target_os = "macos", target_os = "windows")))] + let keepalive = keepalive.with_retries(3); + sock.set_tcp_keepalive(&keepalive).ok(); + } let stream = AsyncStream::tcp(tcp_stream); let mut connection = AsyncRawConnection::new(stream); diff --git a/hyperdb-api-core/src/client/client.rs b/hyperdb-api-core/src/client/client.rs index 9aeeced..8933e8d 100644 --- a/hyperdb-api-core/src/client/client.rs +++ b/hyperdb-api-core/src/client/client.rs @@ -36,9 +36,33 @@ use std::net::TcpStream; use std::sync::{Arc, Mutex, MutexGuard}; +use std::time::Duration; use tracing::{debug, info, trace, warn}; +/// Enable TCP keepalive on a connection socket so a half-open peer (laptop +/// sleep, network blip, a hyperd that vanished without a FIN) is detected in +/// ~90s instead of stalling a blocking `read()` until the OS default idle +/// timeout (7200s / 2h on macOS and Linux). +/// +/// This matters most for long-lived idle connections — e.g. an MCP client +/// holding a connection to a resident daemon's hyperd across a laptop suspend. +/// Without keepalive, the next query on a silently-dead socket hangs for hours. +/// +/// Tuning: 60s idle before the first probe, 10s between probes, 3 probes → +/// the peer is declared dead ~90s after it goes silent. Probe count is only +/// honored on platforms whose `socket2` build exposes `with_retries`; macOS +/// honors idle+interval. All calls are best-effort (`.ok()`): a kernel that +/// rejects a knob leaves the connection working at OS defaults. 
+fn apply_tcp_keepalive(sock: &socket2::SockRef<'_>) { + let keepalive = socket2::TcpKeepalive::new() + .with_time(Duration::from_secs(60)) + .with_interval(Duration::from_secs(10)); + #[cfg(not(any(target_os = "macos", target_os = "windows")))] + let keepalive = keepalive.with_retries(3); + sock.set_tcp_keepalive(&keepalive).ok(); +} + #[cfg(unix)] use std::os::unix::net::UnixStream; @@ -192,6 +216,7 @@ impl Client { let sock = socket2::SockRef::from(&tcp_stream); sock.set_recv_buffer_size(4 * 1024 * 1024).ok(); sock.set_send_buffer_size(4 * 1024 * 1024).ok(); + apply_tcp_keepalive(&sock); let stream = SyncStream::tcp(tcp_stream); let mut connection = RawConnection::new(stream); diff --git a/hyperdb-mcp/CHANGELOG.md b/hyperdb-mcp/CHANGELOG.md index 0356e3e..3fa3b12 100644 --- a/hyperdb-mcp/CHANGELOG.md +++ b/hyperdb-mcp/CHANGELOG.md @@ -5,6 +5,20 @@ All notable changes to the `hyperdb-mcp` crate will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/), and this project adheres to [Semantic Versioning](https://semver.org/). +## [Unreleased] + +### Fixed + +- **TCP keepalive on the `hyperd` connection.** Connections to `hyperd` now + enable TCP keepalive (60s idle, 10s interval, ~90s to declare a dead peer) + instead of relying on the OS default 2-hour idle timeout. Without it, a + long-lived idle connection that went half-open — laptop sleep, a network + blip, or a `hyperd` that vanished without sending a FIN — would make the + next query (including the `status` tool) block for up to two hours instead + of failing fast and reconnecting. This is most visible since the daemon + became resident-by-default in 0.5.0, which made long-lived idle connections + the norm. (Fixed in `hyperdb-api-core` for both the sync and async clients.) + ## [0.5.0] - 2026-06-07 ### Fixed