Skip to content

Commit 4d96941

Browse files
dsarno and claude authored
Fix WebSocket connection reliability and domain reload recovery (#656)
* Add server-side ping/pong heartbeat to detect dead WebSocket connections On Windows, WebSocket connections can die silently (OSError 64) without either side being notified. This causes commands to fail with "Unity session not available" until Unity eventually detects the dead connection. Changes: - Add PingMessage model for server->client pings - Add ping loop in PluginHub that sends pings every 10 seconds - Track last pong time per session; close connection if no pong within 20s - Include session_id in pong messages from Unity for server-side tracking - Clean up debug/timing logs from Issue #654 investigation The server will now proactively detect dead connections within 20 seconds instead of waiting indefinitely for the next command to fail. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> * Fix connection recovery after Unity domain reloads (#654) When Unity performs a domain reload (after script changes, test runs, or large payload transfers), the MCP connection drops and needs to reconnect. The previous reconnection timeout (2s) was too short for domain reloads which can take 10-30s. 
Changes: - Increase UNITY_MCP_RELOAD_MAX_WAIT_S default from 2s to 30s - Increase backoff cap when reloading from 0.8s to 5.0s - Skip PluginHub session resolution for stdio transport (was causing unnecessary waits on every command) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> * Fix ping/pong heartbeat, reduce timeout to 20s, fix test flakiness - Add server-side ping loop to detect dead WebSocket connections - Include session_id in pong messages for tracking - Reduce domain reload timeout from 30s to 20s - Add ClassVar annotations for mutable class attributes - Add lock protection for _last_pong access - Change debug stack trace log from Warn to Debug level - Remove unused TIMING-STDIO variable - Fix flaky async duration test (allow 20% timer variance) - Fix Python test that cleared HOME env var on Windows - Skip Unix-path test on Windows (path separator difference) - Add LogAssert.Expect to PropertyConversion tests Fixes #654, #643 Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 61284cc commit 4d96941

13 files changed

Lines changed: 579 additions & 25 deletions

MCPForUnity/Editor/Services/Transport/TransportCommandDispatcher.cs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,13 +36,15 @@ public PendingCommand(
3636
CompletionSource = completionSource;
3737
CancellationToken = cancellationToken;
3838
CancellationRegistration = registration;
39+
QueuedAt = DateTime.UtcNow;
3940
}
4041

4142
public string CommandJson { get; }
4243
public TaskCompletionSource<string> CompletionSource { get; }
4344
public CancellationToken CancellationToken { get; }
4445
public CancellationTokenRegistration CancellationRegistration { get; }
4546
public bool IsExecuting { get; set; }
47+
public DateTime QueuedAt { get; }
4648

4749
public void Dispose()
4850
{

MCPForUnity/Editor/Services/Transport/Transports/WebSocketTransportClient.cs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -619,6 +619,7 @@ private Task SendPongAsync(CancellationToken token)
619619
var payload = new JObject
620620
{
621621
["type"] = "pong",
622+
["session_id"] = _sessionId // Include session ID for server-side tracking
622623
};
623624
return SendJsonAsync(payload, token);
624625
}
@@ -652,6 +653,10 @@ private async Task SendJsonAsync(JObject payload, CancellationToken token)
652653

653654
private async Task HandleSocketClosureAsync(string reason)
654655
{
656+
// Capture stack trace for debugging disconnection triggers
657+
var stackTrace = new System.Diagnostics.StackTrace(true);
658+
McpLog.Debug($"[WebSocket] HandleSocketClosureAsync called. Reason: {reason}\nStack trace:\n{stackTrace}");
659+
655660
if (_lifecycleCts == null || _lifecycleCts.IsCancellationRequested)
656661
{
657662
return;

Server/src/transport/legacy/unity_connection.py

Lines changed: 29 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -306,8 +306,10 @@ def read_status_file(target_hash: str | None = None) -> dict | None:
306306
for attempt in range(attempts + 1):
307307
try:
308308
# Ensure connected (handshake occurs within connect())
309+
t_conn_start = time.time()
309310
if not self.sock and not self.connect():
310311
raise ConnectionError("Could not connect to Unity")
312+
logger.info("[TIMING-STDIO] connect took %.3fs command=%s", time.time() - t_conn_start, command_type)
311313

312314
# Build payload
313315
if command_type == 'ping':
@@ -324,20 +326,24 @@ def read_status_file(target_hash: str | None = None) -> dict | None:
324326
with contextlib.suppress(Exception):
325327
logger.debug(
326328
f"send {len(payload)} bytes; mode={mode}; head={payload[:32].decode('utf-8', 'ignore')}")
329+
t_send_start = time.time()
327330
if self.use_framing:
328331
header = struct.pack('>Q', len(payload))
329332
self.sock.sendall(header)
330333
self.sock.sendall(payload)
331334
else:
332335
self.sock.sendall(payload)
336+
logger.info("[TIMING-STDIO] sendall took %.3fs command=%s", time.time() - t_send_start, command_type)
333337

334338
# During retry bursts use a short receive timeout and ensure restoration
335339
restore_timeout = None
336340
if attempt > 0 and last_short_timeout is None:
337341
restore_timeout = self.sock.gettimeout()
338342
self.sock.settimeout(1.0)
339343
try:
344+
t_recv_start = time.time()
340345
response_data = self.receive_full_response(self.sock)
346+
logger.info("[TIMING-STDIO] receive took %.3fs command=%s len=%d", time.time() - t_recv_start, command_type, len(response_data))
341347
with contextlib.suppress(Exception):
342348
logger.debug(
343349
f"recv {len(response_data)} bytes; mode={mode}")
@@ -419,7 +425,8 @@ def read_status_file(target_hash: str | None = None) -> dict | None:
419425

420426
# Cap backoff depending on state
421427
if status and status.get('reloading'):
422-
cap = 0.8
428+
# Domain reload can take 10-20s; use longer waits
429+
cap = 5.0
423430
elif fast_error:
424431
cap = 0.25
425432
else:
@@ -761,22 +768,36 @@ def send_command_with_retry(
761768
Uses config.reload_retry_ms and config.reload_max_retries by default. Preserves the
762769
structured failure if retries are exhausted.
763770
"""
771+
t_retry_start = time.time()
772+
logger.info("[TIMING-STDIO] send_command_with_retry START command=%s", command_type)
773+
t_get_conn = time.time()
764774
conn = get_unity_connection(instance_id)
775+
logger.info("[TIMING-STDIO] get_unity_connection took %.3fs command=%s", time.time() - t_get_conn, command_type)
765776
if max_retries is None:
766777
max_retries = getattr(config, "reload_max_retries", 40)
767778
if retry_ms is None:
768779
retry_ms = getattr(config, "reload_retry_ms", 250)
780+
# Default to 20s to handle domain reloads (which can take 10-20s after tests or script changes).
781+
#
782+
# NOTE: This wait can impact agentic workflows where domain reloads happen
783+
# frequently (e.g., after test runs, script compilation). The 20s default
784+
# balances handling slow reloads vs. avoiding unnecessary delays.
785+
#
786+
# TODO: Make this more deterministic by detecting Unity's actual reload state
787+
# rather than blindly waiting up to 20s. See Issue #657.
788+
#
789+
# Configurable via: UNITY_MCP_RELOAD_MAX_WAIT_S (default: 20.0, max: 20.0)
769790
try:
770791
max_wait_s = float(os.environ.get(
771-
"UNITY_MCP_RELOAD_MAX_WAIT_S", "2.0"))
792+
"UNITY_MCP_RELOAD_MAX_WAIT_S", "20.0"))
772793
except ValueError as e:
773-
raw_val = os.environ.get("UNITY_MCP_RELOAD_MAX_WAIT_S", "2.0")
794+
raw_val = os.environ.get("UNITY_MCP_RELOAD_MAX_WAIT_S", "20.0")
774795
logger.warning(
775-
"Invalid UNITY_MCP_RELOAD_MAX_WAIT_S=%r, using default 2.0: %s",
796+
"Invalid UNITY_MCP_RELOAD_MAX_WAIT_S=%r, using default 20.0: %s",
776797
raw_val, e)
777-
max_wait_s = 2.0
778-
# Clamp to [0, 30] to prevent misconfiguration from causing excessive waits
779-
max_wait_s = max(0.0, min(max_wait_s, 30.0))
798+
max_wait_s = 20.0
799+
# Clamp to [0, 20] to prevent misconfiguration from causing excessive waits
800+
max_wait_s = max(0.0, min(max_wait_s, 20.0))
780801

781802
# If retry_on_reload=False, disable connection-level retries too (issue #577)
782803
# Commands that trigger compilation/reload shouldn't retry on disconnect
@@ -847,6 +868,7 @@ def send_command_with_retry(
847868
instance_id or "default",
848869
waited,
849870
)
871+
logger.info("[TIMING-STDIO] send_command_with_retry DONE total=%.3fs command=%s", time.time() - t_retry_start, command_type)
850872
return response
851873

852874

Server/src/transport/models.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,11 @@ class ExecuteCommandMessage(BaseModel):
2323
params: dict[str, Any]
2424
timeout: float
2525

26+
27+
class PingMessage(BaseModel):
28+
"""Server-initiated ping to detect dead connections."""
29+
type: str = "ping"
30+
2631
# Incoming (Plugin -> Server)
2732

2833

Server/src/transport/plugin_hub.py

Lines changed: 114 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import os
88
import time
99
import uuid
10-
from typing import Any
10+
from typing import Any, ClassVar
1111

1212
from starlette.endpoints import WebSocketEndpoint
1313
from starlette.websockets import WebSocket
@@ -21,6 +21,7 @@
2121
WelcomeMessage,
2222
RegisteredMessage,
2323
ExecuteCommandMessage,
24+
PingMessage,
2425
RegisterMessage,
2526
RegisterToolsMessage,
2627
PongMessage,
@@ -29,7 +30,7 @@
2930
SessionDetails,
3031
)
3132

32-
logger = logging.getLogger("mcp-for-unity-server")
33+
logger = logging.getLogger(__name__)
3334

3435

3536
class PluginDisconnectedError(RuntimeError):
@@ -63,6 +64,10 @@ class PluginHub(WebSocketEndpoint):
6364
KEEP_ALIVE_INTERVAL = 15
6465
SERVER_TIMEOUT = 30
6566
COMMAND_TIMEOUT = 30
67+
# Server-side ping interval (seconds) - how often to send pings to Unity
68+
PING_INTERVAL = 10
69+
# Max time (seconds) to wait for pong before considering connection dead
70+
PING_TIMEOUT = 20
6671
# Timeout (seconds) for fast-fail commands like ping/read_console/get_editor_state.
6772
# Keep short so MCP clients aren't blocked during Unity compilation/reload/unfocused throttling.
6873
FAST_FAIL_TIMEOUT = 2.0
@@ -78,6 +83,10 @@ class PluginHub(WebSocketEndpoint):
7883
_pending: dict[str, dict[str, Any]] = {}
7984
_lock: asyncio.Lock | None = None
8085
_loop: asyncio.AbstractEventLoop | None = None
86+
# session_id -> last pong timestamp (monotonic)
87+
_last_pong: ClassVar[dict[str, float]] = {}
88+
# session_id -> ping task
89+
_ping_tasks: ClassVar[dict[str, asyncio.Task]] = {}
8190

8291
@classmethod
8392
def configure(
@@ -176,12 +185,20 @@ async def on_disconnect(self, websocket: WebSocket, close_code: int) -> None:
176185
(sid for sid, ws in cls._connections.items() if ws is websocket), None)
177186
if session_id:
178187
cls._connections.pop(session_id, None)
188+
# Stop the ping loop for this session
189+
ping_task = cls._ping_tasks.pop(session_id, None)
190+
if ping_task and not ping_task.done():
191+
ping_task.cancel()
192+
# Clean up last pong tracking
193+
cls._last_pong.pop(session_id, None)
179194
# Fail-fast any in-flight commands for this session to avoid waiting for COMMAND_TIMEOUT.
180195
pending_ids = [
181196
command_id
182197
for command_id, entry in cls._pending.items()
183198
if entry.get("session_id") == session_id
184199
]
200+
if pending_ids:
201+
logger.debug(f"Cancelling {len(pending_ids)} pending commands for disconnected session")
185202
for command_id in pending_ids:
186203
entry = cls._pending.get(command_id)
187204
future = entry.get("future") if isinstance(
@@ -364,10 +381,18 @@ async def _handle_register(self, websocket: WebSocket, payload: RegisterMessage)
364381
session = await registry.register(session_id, project_name, project_hash, unity_version, project_path, user_id=user_id)
365382
async with lock:
366383
cls._connections[session.session_id] = websocket
384+
# Initialize last pong time and start ping loop for this session
385+
cls._last_pong[session_id] = time.monotonic()
386+
# Cancel any existing ping task for this session (shouldn't happen, but be safe)
387+
old_task = cls._ping_tasks.pop(session_id, None)
388+
if old_task and not old_task.done():
389+
old_task.cancel()
390+
# Start the server-side ping loop
391+
ping_task = asyncio.create_task(cls._ping_loop(session_id, websocket))
392+
cls._ping_tasks[session_id] = ping_task
367393

368394
if user_id:
369-
logger.info(
370-
f"Plugin registered: {project_name} ({project_hash}) for user {user_id}")
395+
logger.info(f"Plugin registered: {project_name} ({project_hash}) for user {user_id}")
371396
else:
372397
logger.info(f"Plugin registered: {project_name} ({project_hash})")
373398

@@ -429,11 +454,77 @@ async def _handle_command_result(self, payload: CommandResultMessage) -> None:
429454
async def _handle_pong(self, payload: PongMessage) -> None:
430455
cls = type(self)
431456
registry = cls._registry
457+
lock = cls._lock
432458
if registry is None:
433459
return
434460
session_id = payload.session_id
435461
if session_id:
436462
await registry.touch(session_id)
463+
# Record last pong time for staleness detection (under lock for consistency)
464+
if lock is not None:
465+
async with lock:
466+
cls._last_pong[session_id] = time.monotonic()
467+
468+
@classmethod
469+
async def _ping_loop(cls, session_id: str, websocket: WebSocket) -> None:
470+
"""Server-initiated ping loop to detect dead connections.
471+
472+
Sends periodic pings to the Unity client. If no pong is received within
473+
PING_TIMEOUT seconds, the connection is considered dead and closed.
474+
This helps detect connections that die silently (e.g., Windows OSError 64).
475+
"""
476+
logger.debug(f"[Ping] Starting ping loop for session {session_id}")
477+
try:
478+
while True:
479+
await asyncio.sleep(cls.PING_INTERVAL)
480+
481+
# Check if we're still supposed to be running and get last pong time (under lock)
482+
lock = cls._lock
483+
if lock is None:
484+
break
485+
async with lock:
486+
if session_id not in cls._connections:
487+
logger.debug(f"[Ping] Session {session_id} no longer in connections, stopping ping loop")
488+
break
489+
# Read last pong time under lock for consistency
490+
last_pong = cls._last_pong.get(session_id, 0)
491+
492+
# Check staleness: has it been too long since we got a pong?
493+
elapsed = time.monotonic() - last_pong
494+
if elapsed > cls.PING_TIMEOUT:
495+
logger.warning(
496+
f"[Ping] Session {session_id} stale: no pong for {elapsed:.1f}s "
497+
f"(timeout={cls.PING_TIMEOUT}s). Closing connection."
498+
)
499+
try:
500+
await websocket.close(code=1001) # Going away
501+
except Exception as close_ex:
502+
logger.debug(f"[Ping] Error closing stale websocket: {close_ex}")
503+
break
504+
505+
# Send a ping to the client
506+
try:
507+
ping_msg = PingMessage()
508+
await websocket.send_json(ping_msg.model_dump())
509+
logger.debug(f"[Ping] Sent ping to session {session_id}")
510+
except Exception as send_ex:
511+
# Send failed - connection is dead
512+
logger.warning(
513+
f"[Ping] Failed to send ping to session {session_id}: {send_ex}. "
514+
"Connection likely dead."
515+
)
516+
try:
517+
await websocket.close(code=1006) # Abnormal closure
518+
except Exception:
519+
pass
520+
break
521+
522+
except asyncio.CancelledError:
523+
logger.debug(f"[Ping] Ping loop cancelled for session {session_id}")
524+
except Exception as ex:
525+
logger.warning(f"[Ping] Ping loop error for session {session_id}: {ex}")
526+
finally:
527+
logger.debug(f"[Ping] Ping loop ended for session {session_id}")
437528

438529
@classmethod
439530
async def _get_connection(cls, session_id: str) -> WebSocket:
@@ -465,19 +556,30 @@ async def _resolve_session_id(cls, unity_instance: str | None, user_id: str | No
465556
if cls._registry is None:
466557
raise RuntimeError("Plugin registry not configured")
467558

468-
# Bound waiting for Unity sessions so calls fail fast when editors are not ready.
559+
# Bound waiting for Unity sessions. Default to 20s to handle domain reloads
560+
# (which can take 10-20s after test runs or script changes).
561+
#
562+
# NOTE: This wait can impact agentic workflows where domain reloads happen
563+
# frequently (e.g., after test runs, script compilation). The 20s default
564+
# balances handling slow reloads vs. avoiding unnecessary delays.
565+
#
566+
# TODO: Make this more deterministic by detecting Unity's actual reload state
567+
# (e.g., via status file, heartbeat, or explicit "reloading" signal from Unity)
568+
# rather than blindly waiting up to 20s. See Issue #657.
569+
#
570+
# Configurable via: UNITY_MCP_SESSION_RESOLVE_MAX_WAIT_S (default: 20.0, max: 20.0)
469571
try:
470572
max_wait_s = float(
471-
os.environ.get("UNITY_MCP_SESSION_RESOLVE_MAX_WAIT_S", "2.0"))
573+
os.environ.get("UNITY_MCP_SESSION_RESOLVE_MAX_WAIT_S", "20.0"))
472574
except ValueError as e:
473575
raw_val = os.environ.get(
474-
"UNITY_MCP_SESSION_RESOLVE_MAX_WAIT_S", "2.0")
576+
"UNITY_MCP_SESSION_RESOLVE_MAX_WAIT_S", "20.0")
475577
logger.warning(
476-
"Invalid UNITY_MCP_SESSION_RESOLVE_MAX_WAIT_S=%r, using default 2.0: %s",
578+
"Invalid UNITY_MCP_SESSION_RESOLVE_MAX_WAIT_S=%r, using default 20.0: %s",
477579
raw_val, e)
478-
max_wait_s = 2.0
479-
# Clamp to [0, 30] to prevent misconfiguration from causing excessive waits
480-
max_wait_s = max(0.0, min(max_wait_s, 30.0))
580+
max_wait_s = 20.0
581+
# Clamp to [0, 20] to prevent misconfiguration from causing excessive waits
582+
max_wait_s = max(0.0, min(max_wait_s, 20.0))
481583
retry_ms = float(getattr(config, "reload_retry_ms", 250))
482584
sleep_seconds = max(0.05, min(0.25, retry_ms / 1000.0))
483585

@@ -613,7 +715,7 @@ async def send_command_for_instance(
613715
"Invalid UNITY_MCP_SESSION_READY_WAIT_SECONDS=%r, using default 6.0: %s",
614716
raw_val, e)
615717
max_wait_s = 6.0
616-
max_wait_s = max(0.0, min(max_wait_s, 30.0))
718+
max_wait_s = max(0.0, min(max_wait_s, 20.0))
617719
if max_wait_s > 0:
618720
deadline = time.monotonic() + max_wait_s
619721
while time.monotonic() < deadline:

Server/src/transport/unity_instance_middleware.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -214,9 +214,10 @@ async def _inject_unity_instance(self, context: MiddlewareContext) -> None:
214214
# The 'active_instance' (Name@hash) might be valid for stdio even if PluginHub fails.
215215

216216
session_id: str | None = None
217-
# Only validate via PluginHub if we are actually using HTTP transport
218-
# OR if we want to support hybrid mode. For now, let's be permissive.
219-
if PluginHub.is_configured():
217+
# Only validate via PluginHub if we are actually using HTTP transport.
218+
# For stdio transport, skip PluginHub entirely - we only need the instance ID.
219+
from transport.unity_transport import _is_http_transport
220+
if _is_http_transport() and PluginHub.is_configured():
220221
try:
221222
# resolving session_id might fail if the plugin disconnected
222223
# We only need session_id for HTTP transport routing.

Server/src/transport/unity_transport.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@
1111
from models.models import MCPResponse
1212
from models.unity_response import normalize_unity_response
1313

14+
logger = logging.getLogger(__name__)
1415
T = TypeVar("T")
15-
logger = logging.getLogger("mcp-for-unity-server")
1616

1717

1818
def _is_http_transport() -> bool:

0 commit comments

Comments (0)