From 52708acc4d4fb313e395d8d58fb71bfbb1c49b42 Mon Sep 17 00:00:00 2001 From: jack Date: Sat, 4 Jul 2026 23:38:33 +0800 Subject: [PATCH] fix(browser): keep the extension bridge connection alive and its status in sync MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The jcode Browser Bridge extension flapped between connected and "Reconnecting…" every 1–3 minutes, and could get stranded showing "Reconnecting…" (with "Could not reach the jcode server") even while the socket was up and browser tools worked. Two root causes: 1. No server-side keepalive. After the hello handshake the bridge cleared the read deadline and never sent anything, so the only thing holding the extension's MV3 service worker awake was its own 30s chrome.alarms ping — which races the 30s idle-kill (and is clamped to 60s on some builds). When the worker napped the socket dropped and only came back on the next alarm tick. Fix: the server now sends an application-level {"type":"ping"} every 15s (an inbound ws message resets the worker's idle timer) and drops the peer if no frame arrives within 40s. 2. Stale-socket state clobber. All ws event handlers referenced the module- level `ws`, so a superseded socket's late-firing onclose set connected=false after a newer socket had already welcomed; the keepalive alarm's reconnect then early-returned because the (live) global ws was OPEN, leaving connected stuck false forever. Fix: capture each socket in a local `sock` and guard every handler with `ws === sock`; also self-heal `connected` when a server ping arrives (receiving one proves we're connected). Also: quiet the popup so a routine reconnect shows a neutral "Reconnecting to jcode…" instead of the red "Could not reach the jcode server" error, which now only appears after several failed attempts in a row. Adds TestBridgeKeepAlivePing covering the server ping + read watchdog. Generated with Jack AI bot --- extension/background.js | 53 ++++++++++++++++++++++++++------- extension/popup/popup.html | 1 + extension/popup/popup.js | 7 ++++- internal/browser/bridge.go | 53 +++++++++++++++++++++++++++++++-- internal/browser/bridge_test.go | 31 +++++++++++++++++++ 5 files changed, 130 insertions(+), 15 deletions(-) diff --git a/extension/background.js b/extension/background.js index f336257..0e99005 100644 --- a/extension/background.js +++ b/extension/background.js @@ -98,13 +98,21 @@ async function connect() { desired = false; // no token yet — wait for Auto-connect to fetch one. return; } + let sock; try { - ws = new WebSocket(serverUrl); + sock = new WebSocket(serverUrl); } catch (e) { lastError = "Bad server URL: " + String(e && e.message ? e.message : e); scheduleReconnect(); return; } + ws = sock; + // Every handler below guards on `ws === sock`. Handlers are bound to this + // specific socket but mutate module-level state (`connected`, `lastError`); + // once a newer connect() has replaced the global `ws`, this socket is + // superseded and a late-firing event from it (especially onclose) must NOT + // clobber the live connection's state — that desync is what stranded the + // popup on "Reconnecting…" while the socket was actually up. // Connect-stall watchdog. When the extension lacks host access to the target // (e.g. 127.0.0.1 site access is off in edge://extensions), the WebSocket @@ -113,21 +121,23 @@ async function connect() { if (connectTimer) clearTimeout(connectTimer); connectTimer = setTimeout(() => { connectTimer = null; - if (!connected && ws && ws.readyState !== WebSocket.OPEN) { + if (ws === sock && !connected && sock.readyState !== WebSocket.OPEN) { lastError = "Connection stalled (no response from " + serverUrl + "). " + "Most likely the extension lacks access to this host — open the extensions page › this extension › " + "Site access and allow 127.0.0.1 / localhost (set to 'On all sites'). Then reload the extension and Auto-connect again."; - try { ws.close(); } catch {} + try { sock.close(); } catch {} } }, CONNECT_TIMEOUT_MS); - ws.onopen = () => { + sock.onopen = () => { + if (ws !== sock) return; lastError = ""; - ws.send(JSON.stringify({ type: "hello", token })); + sock.send(JSON.stringify({ type: "hello", token })); }; - ws.onmessage = async (ev) => { + sock.onmessage = async (ev) => { + if (ws !== sock) return; let msg; try { msg = JSON.parse(ev.data); } catch { return; } if (msg.type === "welcome") { @@ -150,21 +160,42 @@ async function connect() { stop(true); return; } + // Server keepalive: the inbound frame itself is what keeps this MV3 worker + // awake (Chrome resets the idle timer on any ws message); the pong lets the + // server confirm we're alive. Receiving a ping also proves we're connected, + // so self-heal the flag in case a stale event left it out of sync. + if (msg.type === "ping") { + if (!connected) { connected = true; chrome.action.setBadgeText({ text: "on" }); } + send({ type: "pong" }); + return; + } + if (msg.type === "pong") { return; } await handleEnvelope(msg); }; - ws.onclose = () => { + sock.onclose = () => { + if (ws !== sock) return; // superseded socket — leave the live one alone if (connectTimer) { clearTimeout(connectTimer); connectTimer = null; } + const wasConnected = connected; connected = false; - if (!lastError) { + chrome.action.setBadgeText({ text: "" }); + // Dropping a live connection is routine (MV3 worker nap, jcode restart) — + // don't cry wolf. Reconnect quietly and let the badge/pill show "Reconnecting…". + // Only surface the hard "can't reach" error once a few attempts in a row + // have failed, i.e. the server really is gone or on a stale port. + if (wasConnected) { + lastError = ""; + reconnectDelay = 1000; + attempts = 0; + } else if (!lastError && attempts >= 3) { lastError = "Could not reach the jcode server. Check that jcode is running and the URL/port is right."; } - chrome.action.setBadgeText({ text: "" }); scheduleReconnect(); }; - ws.onerror = () => { + sock.onerror = () => { + if (ws !== sock) return; lastError = "WebSocket error connecting to " + serverUrl + " — is jcode running there, and does the extension have site access to it?"; - try { ws.close(); } catch {} + try { sock.close(); } catch {} }; } diff --git a/extension/popup/popup.html b/extension/popup/popup.html index 3f112da..a4026f4 100644 --- a/extension/popup/popup.html +++ b/extension/popup/popup.html @@ -42,6 +42,7 @@ .msg { margin-top: 8px; font-size: 12px; line-height: 1.4; padding: 7px 9px; border-radius: 7px; } .msg.err { background: #fdeceb; color: var(--red); } .msg.ok { background: #e6f6ec; color: var(--green); } + .msg.info { background: #f2f0ed; color: var(--ink2); } diff --git a/extension/popup/popup.js b/extension/popup/popup.js index 7ba6c48..7292146 100644 --- a/extension/popup/popup.js +++ b/extension/popup/popup.js @@ -41,7 +41,12 @@ async function refresh() { pill.className = "pill off"; pill.innerHTML = 'Reconnecting…'; $("autoConnect").textContent = "Auto-connect to jcode"; - showMsg((st.lastError ? st.lastError + " " : "") + "Click Disconnect to stop trying.", "err"); + // Only go red on a real failure; a routine reconnect stays neutral. + if (st.lastError) { + showMsg(st.lastError + " Click Disconnect to stop trying.", "err"); + } else { + showMsg("Reconnecting to jcode…", "info"); + } } else { pill.className = "pill off"; pill.innerHTML = 'Offline'; diff --git a/internal/browser/bridge.go b/internal/browser/bridge.go index e9032e0..bd7bb79 100644 --- a/internal/browser/bridge.go +++ b/internal/browser/bridge.go @@ -98,6 +98,7 @@ func (b *Bridge) HandleWS(w http.ResponseWriter, r *http.Request) { b.mu.Unlock() config.Logger().Printf("[browser] extension connected") + go bc.keepAlive() bc.readLoop() b.mu.Lock() @@ -135,6 +136,19 @@ type bridgeEnvelope struct { URL string `json:"url,omitempty"` } +// Keepalive timing. Chrome/Edge kill an MV3 service worker after ~30s idle, and +// an inbound websocket message resets that timer — so a steady server→extension +// ping is what actually keeps the extension worker (and thus the whole bridge) +// alive between browser commands. Without it the worker naps, the socket drops, +// and the popup flaps to "Reconnecting…". keepAliveWait is the read side: if no +// frame (pong, alarm ping, or command reply) arrives within two ping periods, +// treat the extension as dead and tear the socket down. +// vars, not consts, so tests can shrink them. +var ( + keepAlivePing = 15 * time.Second + keepAliveWait = 40 * time.Second +) + type bridgeConn struct { ws *websocket.Conn writeMu sync.Mutex @@ -147,6 +161,35 @@ type bridgeConn struct { closeErr error } +// writeJSON serializes all writes to the socket (gorilla forbids concurrent +// writers) and bounds a stuck write so a wedged peer can't block a ping or a +// command forever. +func (c *bridgeConn) writeJSON(v any) error { + c.writeMu.Lock() + defer c.writeMu.Unlock() + _ = c.ws.SetWriteDeadline(time.Now().Add(10 * time.Second)) + return c.ws.WriteJSON(v) +} + +// keepAlive pings the extension on an interval so its service worker stays awake +// and the socket stays up between commands. It exits when the read loop closes +// the conn. +func (c *bridgeConn) keepAlive() { + t := time.NewTicker(keepAlivePing) + defer t.Stop() + for { + select { + case <-c.closed: + return + case <-t.C: + if err := c.writeJSON(bridgeEnvelope{Type: "ping"}); err != nil { + c.close() // wake the read loop so it tears the conn down + return + } + } + } +} + func newBridgeConn(ws *websocket.Conn) *bridgeConn { ws.SetReadLimit(256 << 20) return &bridgeConn{ @@ -158,6 +201,7 @@ func newBridgeConn(ws *websocket.Conn) *bridgeConn { } func (c *bridgeConn) readLoop() { + _ = c.ws.SetReadDeadline(time.Now().Add(keepAliveWait)) for { var env bridgeEnvelope if err := c.ws.ReadJSON(&env); err != nil { @@ -171,7 +215,12 @@ func (c *bridgeConn) readLoop() { close(c.closed) return } + // Any inbound frame proves the extension is alive; extend the window. + _ = c.ws.SetReadDeadline(time.Now().Add(keepAliveWait)) switch env.Type { + case "ping", "pong": + // Keepalive traffic (the extension's own alarm ping, or a pong to + // ours) — nothing to route. case "cdp.result", "cdp.error", "tabs.result", "tab.result": c.mu.Lock() ch := c.pending[env.ID] @@ -204,9 +253,7 @@ func (c *bridgeConn) request(ctx context.Context, env bridgeEnvelope) (bridgeEnv c.pending[id] = ch c.mu.Unlock() - c.writeMu.Lock() - err := c.ws.WriteJSON(env) - c.writeMu.Unlock() + err := c.writeJSON(env) if err != nil { c.mu.Lock() delete(c.pending, id) diff --git a/internal/browser/bridge_test.go b/internal/browser/bridge_test.go index b19fe06..8424700 100644 --- a/internal/browser/bridge_test.go +++ b/internal/browser/bridge_test.go @@ -144,6 +144,37 @@ func TestBridgeCDPForwarding(t *testing.T) { } } +// TestBridgeKeepAlivePing verifies the server proactively pings the extension so +// its MV3 service worker can't nap the socket shut. A silent extension (no alarm +// ping, no command traffic) must still receive server pings; and if it never +// answers, the read watchdog must eventually drop it. +func TestBridgeKeepAlivePing(t *testing.T) { + oldPing, oldWait := keepAlivePing, keepAliveWait + keepAlivePing, keepAliveWait = 20*time.Millisecond, 120*time.Millisecond + t.Cleanup(func() { keepAlivePing, keepAliveWait = oldPing, oldWait }) + + b, wsURL := bridgeServer(t) + token := b.IssueToken() + fe, ok := dialExtension(t, wsURL, map[string]any{"type": "hello", "token": token}) + if !ok { + t.Fatal("token auth failed") + } + + // Read the first frame the server sends after welcome; it must be a ping. + _ = fe.conn.SetReadDeadline(time.Now().Add(2 * time.Second)) + var env bridgeEnvelope + if err := fe.conn.ReadJSON(&env); err != nil { + t.Fatalf("expected a server ping, got read error: %v", err) + } + if env.Type != "ping" { + t.Fatalf("first server keepalive frame = %q, want \"ping\"", env.Type) + } + + // A silent extension (we stop reading/answering) must be dropped by the read + // watchdog rather than lingering forever. + waitUntil(t, func() bool { return !b.Connected() }) +} + func TestBridgeStableTokenIsStable(t *testing.T) { t.Setenv("HOME", t.TempDir()) b := NewBridge()