From 655560e2e561c41a6ec98a08d4c99da55ed5a717 Mon Sep 17 00:00:00 2001
From: Stefan Steiner <ssteiner@tableau.com>
Date: Sun, 7 Jun 2026 11:11:41 -0700
Subject: [PATCH 1/2] fix(daemon): stop redundant off-base daemon after
 concurrent cold-start
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When two MCP clients cold-start simultaneously, both scan and find the
base port free, both call spawn_detached(base). One daemon wins the
bind (stays on base); the other fails bind and exits. A third (or the
same race victim) scans again, finds base now occupied, lands on
base+1 — and writes daemon.json overwriting the base-port daemon's
file, producing two live daemons on adjacent ports and doubling the
idle hyperd CPU overhead.

Fix: after wait_for_daemon() returns a daemon on a non-base port,
re-scan the ports below it. If a lower-port daemon is live, STOP the
off-base daemon (best-effort) and adopt the base-port one instead.
The lower-port daemon wins because it bound the socket first and is
the canonical single instance.
---
 hyperdb-mcp/src/daemon/spawn.rs   | 25 ++++++++++++++++++++++-
 hyperdb-mcp/tests/daemon_tests.rs | 34 +++++++++++++++++++++++++++++++
 2 files changed, 58 insertions(+), 1 deletion(-)
diff --git a/hyperdb-mcp/src/daemon/spawn.rs b/hyperdb-mcp/src/daemon/spawn.rs
index bda2a43..b88efb5 100644
--- a/hyperdb-mcp/src/daemon/spawn.rs
+++ b/hyperdb-mcp/src/daemon/spawn.rs
@@ -40,7 +40,30 @@ pub fn ensure_daemon(scan: PortScan) -> io::Result<DaemonInfo> {
         ScanOutcome::FreePort(port) => {
             info!(port, "no running daemon detected, spawning on free port");
             spawn_detached(port)?;
-            wait_for_daemon()
+            let info = wait_for_daemon()?;
+            // If the daemon we just spawned bound a port above the scan base (because
+            // concurrent clients raced and one of them grabbed the base port first),
+            // prefer the lower-port daemon so we don't accumulate redundant
+            // daemon+hyperd pairs on adjacent ports. The lower-port daemon wins
+            // because it bound first and is the canonical single instance.
+            if info.health_port > scan.base {
+                let lower_scan = PortScan {
+                    base: scan.base,
+                    span: info.health_port.saturating_sub(scan.base),
+                };
+                if let ScanOutcome::Found(lower_info) = discovery::scan_for_daemon(lower_scan) {
+                    debug!(
+                        prefer_port = lower_info.health_port,
+                        stop_port = info.health_port,
+                        "found lower-port daemon from concurrent spawn; stopping off-base daemon"
+                    );
+                    // Best-effort STOP — if it fails the off-base daemon idles
+                    // harmlessly (it has no clients and will only cost background CPU).
+                    let _ = super::health::send_command(info.health_port, "STOP");
+                    return Ok(*lower_info);
+                }
+            }
+            Ok(info)
         }
         ScanOutcome::AllOccupied => Err(io::Error::new(
             io::ErrorKind::AddrInUse,
diff --git a/hyperdb-mcp/tests/daemon_tests.rs b/hyperdb-mcp/tests/daemon_tests.rs
index 60c0331..26ddabf 100644
--- a/hyperdb-mcp/tests/daemon_tests.rs
+++ b/hyperdb-mcp/tests/daemon_tests.rs
@@ -753,6 +753,40 @@ fn discover_finds_live_daemon() {
     assert_eq!(discovered.health_port, port);
 }
 
+// ─── Unit tests: concurrent-spawn dedup (no env vars, safe parallel) ────────────
+
+#[test]
+fn scan_for_daemon_prefers_lower_port_daemon() {
+    // Models the concurrent-spawn race, where two daemons are live at once and
+    // the client on the higher port must adopt the lower-port daemon on re-scan.
+    //
+    // Two health listeners are started and the smaller of their ports becomes the
+    // scan base; the scan must report `Found` carrying exactly that base port.
+    // NOTE(review): the higher listener is only inside the `span: 2` range if the
+    // two ports happen to be adjacent (`_top` is never used) — confirm or widen.
+    let (lower_port, _lower_handle, _lower_state) = start_health_listener();
+    let (higher_port, _higher_handle, _higher_state) = start_health_listener();
+
+    // The binding names above are nominal: order the ports so `base` is smaller.
+    let (base, _top) = if lower_port < higher_port {
+        (lower_port, higher_port)
+    } else {
+        (higher_port, lower_port)
+    };
+
+    let scan = PortScan { base, span: 2 };
+
+    match discovery::scan_for_daemon(scan) {
+        discovery::ScanOutcome::Found(info) => {
+            assert_eq!(
+                info.health_port, base,
+                "scan should return the LOWEST-port daemon (the first one bound)"
+            );
+        }
+        other => panic!("expected Found, got {other:?}"),
+    }
+}
+
 // ─── Unit tests: Version takeover decision (no env vars, safe parallel) ─────────
 
 #[test]

From 8de3b0e8c2ec23eeae15477cf6731877dc3d074f Mon Sep 17 00:00:00 2001
From: Stefan Steiner <ssteiner@tableau.com>
Date: Sun, 7 Jun 2026 15:57:55 -0700
Subject: [PATCH 2/2] fix(client): enable TCP keepalive on hyperd connections
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A long-lived idle connection to hyperd that goes half-open (laptop
sleep, network blip, or a hyperd that vanished without a FIN) had no
way to be detected: the next blocking read would hang until the OS
default keepalive idle timeout (7200s / 2h on macOS and Linux). Because
the MCP serializes all tool calls behind a single engine mutex with no
per-op timeout, one such stalled connection makes EVERY tool call —
including `status` — appear to hang indefinitely.

This became materially more likely in 0.5.0, where the daemon went
resident-by-default: connections now live indefinitely across laptop
suspends instead of being torn down by the old 30-minute idle shutdown.

Fix: set TCP keepalive (60s idle / 10s interval / 3 probes -> dead peer
in ~90s) at both the sync and async TCP connect sites, alongside the
existing nodelay/buffer-size socket options. Best-effort (.ok()): a
kernel that rejects a knob leaves the connection at OS defaults. The
probe count is set on every target except macOS and Windows, where only
idle+interval are configured and the probe count is left unset (so
detection there can take longer than ~90s). Deliberately NOT a query
timeout — HyperDB runs legitimately long analytics queries, and
keepalive only probes during true silence, so it can't abort live work.

Follow-up (not in this commit): the single global engine mutex means a
slow/stalled op blocks unrelated tool calls. That is an architectural
change warranting its own design pass.
---
 hyperdb-api-core/src/client/async_client.rs | 12 ++++++++++
 hyperdb-api-core/src/client/client.rs       | 25 +++++++++++++++++++++
 hyperdb-mcp/CHANGELOG.md                    | 14 ++++++++++++
 3 files changed, 51 insertions(+)

diff --git a/hyperdb-api-core/src/client/async_client.rs b/hyperdb-api-core/src/client/async_client.rs
index bfaaf68..acdfe3b 100644
--- a/hyperdb-api-core/src/client/async_client.rs
+++ b/hyperdb-api-core/src/client/async_client.rs
@@ -130,6 +130,18 @@ impl AsyncClient {
         let sock = socket2::SockRef::from(&tcp_stream);
         sock.set_recv_buffer_size(4 * 1024 * 1024).ok();
         sock.set_send_buffer_size(4 * 1024 * 1024).ok();
+        // TCP keepalive: detect a half-open peer (laptop sleep, network blip,
+        // a hyperd that vanished without a FIN) in ~90s instead of the 2h OS
+        // idle default. See the rationale on `super::client::apply_tcp_keepalive`
+        // (the sync mirror). Best-effort: a rejected knob leaves OS defaults.
+        {
+            let keepalive = socket2::TcpKeepalive::new()
+                .with_time(std::time::Duration::from_secs(60))
+                .with_interval(std::time::Duration::from_secs(10));
+            #[cfg(not(any(target_os = "macos", target_os = "windows")))]
+            let keepalive = keepalive.with_retries(3);
+            sock.set_tcp_keepalive(&keepalive).ok();
+        }
 
         let stream = AsyncStream::tcp(tcp_stream);
         let mut connection = AsyncRawConnection::new(stream);
diff --git a/hyperdb-api-core/src/client/client.rs b/hyperdb-api-core/src/client/client.rs
index 9aeeced..8933e8d 100644
--- a/hyperdb-api-core/src/client/client.rs
+++ b/hyperdb-api-core/src/client/client.rs
@@ -36,9 +36,33 @@
 
 use std::net::TcpStream;
 use std::sync::{Arc, Mutex, MutexGuard};
+use std::time::Duration;
 
 use tracing::{debug, info, trace, warn};
 
+/// Enables TCP keepalive on a connection socket so that a half-open peer
+/// (laptop sleep, network blip, a hyperd that vanished without a FIN) is
+/// noticed by the kernel instead of stalling a blocking `read()` until the OS
+/// default idle timeout (7200s / 2h on macOS and Linux).
+///
+/// This matters most for long-lived idle connections — e.g. an MCP client
+/// holding a connection to a resident daemon's hyperd across a laptop suspend.
+/// Without keepalive, the next query on a silently-dead socket hangs for hours.
+///
+/// Tuning: 60s idle before the first probe, then 10s between probes. The probe
+/// count (3, i.e. the peer is declared dead ~90s after it goes silent) is only
+/// set when the target is neither macOS nor Windows; on those two it is left
+/// unset, so detection may take longer than ~90s. The call is best-effort
+/// (`.ok()`): a rejected option is ignored and the connection keeps working.
+fn apply_tcp_keepalive(sock: &socket2::SockRef<'_>) {
+    let keepalive = socket2::TcpKeepalive::new()
+        .with_time(Duration::from_secs(60))
+        .with_interval(Duration::from_secs(10));
+    #[cfg(not(any(target_os = "macos", target_os = "windows")))]
+    let keepalive = keepalive.with_retries(3);
+    sock.set_tcp_keepalive(&keepalive).ok();
+}
+
 #[cfg(unix)]
 use std::os::unix::net::UnixStream;
 
@@ -192,6 +216,7 @@ impl Client {
         let sock = socket2::SockRef::from(&tcp_stream);
         sock.set_recv_buffer_size(4 * 1024 * 1024).ok();
         sock.set_send_buffer_size(4 * 1024 * 1024).ok();
+        apply_tcp_keepalive(&sock);
 
         let stream = SyncStream::tcp(tcp_stream);
         let mut connection = RawConnection::new(stream);
diff --git a/hyperdb-mcp/CHANGELOG.md b/hyperdb-mcp/CHANGELOG.md
index 0356e3e..3fa3b12 100644
--- a/hyperdb-mcp/CHANGELOG.md
+++ b/hyperdb-mcp/CHANGELOG.md
@@ -5,6 +5,20 @@ All notable changes to the `hyperdb-mcp` crate will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/),
 and this project adheres to [Semantic Versioning](https://semver.org/).
 
+## [Unreleased]
+
+### Fixed
+
+- **TCP keepalive on the `hyperd` connection.** Connections to `hyperd` now
+  enable TCP keepalive (60s idle, 10s interval; a dead peer is declared in ~90s
+  on Linux) instead of relying on the OS default 2-hour idle timeout. Without it, a
+  long-lived idle connection that went half-open — laptop sleep, a network
+  blip, or a `hyperd` that vanished without sending a FIN — would make the
+  next query (including the `status` tool) block for up to two hours instead
+  of failing fast and reconnecting. This is most visible since the daemon
+  became resident-by-default in 0.5.0, which made long-lived idle connections
+  the norm. (Fixed in `hyperdb-api-core` for both the sync and async clients.)
+
 ## [0.5.0] - 2026-06-07
 
 ### Fixed