From 26c6c3a134e33efece3c0b88ca3c6406a5a3f528 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 10 Mar 2026 01:30:44 +0000
Subject: [PATCH 1/6] Initial plan


From ef0a926c8208e76ce28bdcf524c12e8c08be312d Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 10 Mar 2026 01:42:36 +0000
Subject: [PATCH 2/6] fix: BrowserToolSet Playwright cache thread-safety (Bug
 1/2/3)

- Bug 1 (P0): _get_playwright now checks if the creating thread is still
  alive before returning the cached Playwright instance. When the creating
  thread has exited (e.g. after a LangGraph ToolNode executor shuts down),
  the stale greenlet-bound instance is discarded and a fresh one is created
  in the current thread. Concurrent callers in the same executor (creating
  thread still alive) continue to share one connection.

- Bug 2 (P1): Change browser_navigate default wait_until from 'load' to
  'domcontentloaded'. Sites where external sub-resources are unreachable
  from the sandbox never fire the 'load' event, causing a permanent 90s
  timeout. 'domcontentloaded' fires reliably once the DOM is ready.

- Bug 3 (P2): Detect 'cannot switch to a different thread' errors in
  _run_in_sandbox's catch-all Exception handler and reset the cached
  Playwright instance + sandbox so the next call can recover cleanly.

Also:
- _reset_playwright now also clears _playwright_thread
- Updated all test fixtures to set _playwright_thread = None
- Fixed concurrent caching test to use a hold barrier keeping threads alive
- Added TestBrowserToolSetThreadAwareness (dead-thread recreation, live-
  thread reuse) and TestBrowserToolSetGreenletErrorHandling test classes

Type check: passed (297 source files, no issues)

Co-authored-by: OhYee <13498329+OhYee@users.noreply.github.com>
---
 agentrun/integration/builtin/sandbox.py       |  46 ++++-
 .../test_browser_toolset_error_handling.py    | 187 +++++++++++++++++-
 2 files changed, 220 insertions(+), 13 deletions(-)

diff --git a/agentrun/integration/builtin/sandbox.py b/agentrun/integration/builtin/sandbox.py
index 0456799..171a30a 100644
--- a/agentrun/integration/builtin/sandbox.py
+++ b/agentrun/integration/builtin/sandbox.py
@@ -727,24 +727,40 @@ def __init__(
             polar_fs_config=polar_fs_config,
         )
         self._playwright_sync: Optional["BrowserPlaywrightSync"] = None
+        self._playwright_thread: Optional[threading.Thread] = None
 
     def _get_playwright(self, sb: BrowserSandbox) -> "BrowserPlaywrightSync":
         """获取或创建 Playwright 连接 / Get or create Playwright connection
 
         复用已有连接以减少连接建立开销和瞬态错误。
         使用双重检查锁定避免并发调用时创建多个连接导致资源泄漏。
+        当创建连接的线程已退出时，自动重建连接（Playwright greenlet 绑定到创建它的线程）。
+
         Reuses existing connection to reduce connection overhead and transient errors.
         Uses double-checked locking to avoid leaking connections under concurrent calls.
+        Automatically recreates the connection when the thread that created it has exited,
+        because Playwright's internal greenlet is bound to the thread that created it.
         """
-        if self._playwright_sync is not None:
-            return self._playwright_sync
+        if (
+            self._playwright_sync is not None
+            and self._playwright_thread is not None
+            and not self._playwright_thread.is_alive()
+        ):
+            logger.debug(
+                "Playwright creating thread (id=%s) has exited, recreating"
+                " connection",
+                self._playwright_thread.ident,
+            )
+            self._reset_playwright()
 
-        with self.lock:
-            if self._playwright_sync is None:
-                playwright_sync = sb.sync_playwright()
-                playwright_sync.open()
-                self._playwright_sync = playwright_sync
-            return self._playwright_sync
+        if self._playwright_sync is None:
+            with self.lock:
+                if self._playwright_sync is None:
+                    playwright_sync = sb.sync_playwright()
+                    playwright_sync.open()
+                    self._playwright_sync = playwright_sync
+                    self._playwright_thread = threading.current_thread()
+        return self._playwright_sync
 
     def _reset_playwright(self) -> None:
         """重置 Playwright 连接 / Reset Playwright connection
@@ -763,6 +779,7 @@ def _reset_playwright(self) -> None:
                         exc_info=True,
                     )
                 self._playwright_sync = None
+            self._playwright_thread = None
 
     def _run_in_sandbox(self, callback: Callable[[Sandbox], Any]) -> Any:
         """在沙箱中执行操作，智能区分错误类型 / Execute in sandbox with smart error handling
@@ -813,7 +830,16 @@ def _run_in_sandbox(self, callback: Callable[[Sandbox], Any]) -> Any:
                 )
                 return {"error": f"{e!s}"}
         except Exception as e:
-            logger.debug("Unexpected error in browser sandbox: %s", e)
+            error_msg = str(e)
+            if "cannot switch to" in error_msg:
+                logger.debug(
+                    "Greenlet thread-binding error, resetting Playwright: %s",
+                    e,
+                )
+                self._reset_playwright()
+                self.sandbox = None
+            else:
+                logger.debug("Unexpected error in browser sandbox: %s", e)
             return {"error": f"{e!s}"}
 
     def _is_infrastructure_error(self, error_msg: str) -> bool:
@@ -881,7 +907,7 @@ def inner(sb: Sandbox):
     def browser_navigate(
         self,
         url: str,
-        wait_until: str = "load",
+        wait_until: str = "domcontentloaded",
         timeout: Optional[float] = None,
     ) -> Dict[str, Any]:
         """导航到 URL / Navigate to URL"""
diff --git a/tests/unittests/integration/test_browser_toolset_error_handling.py b/tests/unittests/integration/test_browser_toolset_error_handling.py
index fb1ea71..ff2de13 100644
--- a/tests/unittests/integration/test_browser_toolset_error_handling.py
+++ b/tests/unittests/integration/test_browser_toolset_error_handling.py
@@ -91,6 +91,7 @@ def toolset(self, mock_sandbox):
         with patch.object(BrowserToolSet, "__init__", lambda self: None):
             ts = BrowserToolSet()
             ts._playwright_sync = None
+            ts._playwright_thread = None
             ts.sandbox = mock_sandbox
             ts.sandbox_id = "test-sandbox-id"
             ts.lock = MagicMock()
@@ -218,6 +219,7 @@ def toolset(self, mock_sandbox):
         with patch.object(BrowserToolSet, "__init__", lambda self: None):
             ts = BrowserToolSet()
             ts._playwright_sync = None
+            ts._playwright_thread = None
             ts.sandbox = mock_sandbox
             ts.sandbox_id = "test-sandbox-id"
             ts.lock = threading.Lock()
@@ -255,14 +257,22 @@ def test_reset_playwright_handles_close_error(self, toolset, mock_sandbox):
     def test_concurrent_get_playwright_creates_only_one_connection(
         self, toolset, mock_sandbox
     ):
-        """测试并发调用 _get_playwright 只创建一个连接，不会泄漏"""
-        barrier = threading.Barrier(5)
+        """测试并发调用 _get_playwright 只创建一个连接，不会泄漏
+
+        所有工作线程在同一 executor 内并发运行（即创建线程仍存活），
+        应复用同一连接，不会触发重建。
+        """
+        start_barrier = threading.Barrier(5)
+        # Keep all threads alive until every thread has obtained playwright,
+        # simulating concurrent workers within the same executor context.
+        hold_barrier = threading.Barrier(5)
         results: list = []
 
         def worker():
-            barrier.wait()
+            start_barrier.wait()
             p = toolset._get_playwright(mock_sandbox)
             results.append(p)
+            hold_barrier.wait()  # stay alive so is_alive() == True for peers
 
         threads = [threading.Thread(target=worker) for _ in range(5)]
         for t in threads:
@@ -289,6 +299,7 @@ def toolset(self, mock_sandbox):
         with patch.object(BrowserToolSet, "__init__", lambda self: None):
             ts = BrowserToolSet()
             ts._playwright_sync = MagicMock()
+            ts._playwright_thread = threading.current_thread()
             ts.sandbox = mock_sandbox
             ts.sandbox_id = "test-sandbox-id"
             ts.lock = threading.Lock()
@@ -307,3 +318,173 @@ def test_close_cleans_up_playwright_and_sandbox(
         mock_sandbox.stop.assert_called_once()
         assert toolset.sandbox is None
         assert toolset.sandbox_id == ""
+
+
+class TestBrowserToolSetThreadAwareness:
+    """测试 _get_playwright 的线程感知行为 / Tests for thread-aware Playwright caching"""
+
+    @pytest.fixture
+    def mock_sandbox(self):
+        """创建模拟的沙箱"""
+        sb = MagicMock()
+        sb.sync_playwright.return_value = MagicMock()
+        return sb
+
+    @pytest.fixture
+    def toolset(self, mock_sandbox):
+        """创建带有模拟沙箱的 BrowserToolSet 实例"""
+        with patch.object(BrowserToolSet, "__init__", lambda self: None):
+            ts = BrowserToolSet()
+            ts._playwright_sync = None
+            ts._playwright_thread = None
+            ts.sandbox = mock_sandbox
+            ts.sandbox_id = "test-sandbox-id"
+            ts.lock = threading.Lock()
+            return ts
+
+    def test_get_playwright_records_creating_thread(self, toolset, mock_sandbox):
+        """测试 _get_playwright 记录创建连接的线程"""
+        toolset._get_playwright(mock_sandbox)
+
+        assert toolset._playwright_thread is threading.current_thread()
+
+    def test_get_playwright_same_thread_reuses_connection(
+        self, toolset, mock_sandbox
+    ):
+        """测试同一线程多次调用复用连接"""
+        p1 = toolset._get_playwright(mock_sandbox)
+        p2 = toolset._get_playwright(mock_sandbox)
+
+        assert p1 is p2
+        mock_sandbox.sync_playwright.assert_called_once()
+
+    def test_get_playwright_dead_thread_recreates_connection(
+        self, toolset, mock_sandbox
+    ):
+        """测试创建线程退出后重建 Playwright 连接（Bug 1 修复）
+
+        模拟 LangGraph ToolNode 的行为：每次工具调用在不同的线程上执行。
+        当创建连接的工作线程退出后，缓存的 Playwright 实例必须重建，
+        因为 Playwright 内部 greenlet 绑定到创建它的线程。
+        """
+        first_instance: list = []
+        second_instance: list = []
+
+        def first_call():
+            p = toolset._get_playwright(mock_sandbox)
+            first_instance.append(p)
+
+        t1 = threading.Thread(target=first_call)
+        t1.start()
+        t1.join()
+        # t1 has now exited — its greenlet binding is dead
+
+        def second_call():
+            p = toolset._get_playwright(mock_sandbox)
+            second_instance.append(p)
+
+        t2 = threading.Thread(target=second_call)
+        t2.start()
+        t2.join()
+
+        assert len(first_instance) == 1
+        assert len(second_instance) == 1
+        # A new connection must have been created for the second call
+        assert mock_sandbox.sync_playwright.call_count == 2
+
+    def test_get_playwright_live_thread_not_recreated(
+        self, toolset, mock_sandbox
+    ):
+        """测试创建线程仍存活时不重建连接（并发安全）
+
+        即使在不同线程中调用，只要创建线程仍然存活，就应复用同一连接。
+        这模拟同一 executor 内并发工具调用的场景。
+        """
+        results: list = []
+
+        # Create connection in main thread first
+        toolset._get_playwright(mock_sandbox)
+        # The creating thread (main test thread) is still alive
+
+        # Another thread should reuse the same connection
+        def worker():
+            p = toolset._get_playwright(mock_sandbox)
+            results.append(p)
+
+        t = threading.Thread(target=worker)
+        t.start()
+        t.join()
+
+        assert len(results) == 1
+        assert results[0] is toolset._playwright_sync
+        mock_sandbox.sync_playwright.assert_called_once()
+
+    def test_reset_playwright_clears_thread(self, toolset, mock_sandbox):
+        """测试 _reset_playwright 清理线程引用"""
+        toolset._get_playwright(mock_sandbox)
+        assert toolset._playwright_thread is not None
+
+        toolset._reset_playwright()
+
+        assert toolset._playwright_thread is None
+        assert toolset._playwright_sync is None
+
+
+class TestBrowserToolSetGreenletErrorHandling:
+    """测试 _run_in_sandbox 对 greenlet 死亡错误的处理（Bug 3 修复）"""
+
+    @pytest.fixture
+    def mock_sandbox(self):
+        """创建模拟的沙箱"""
+        return MagicMock()
+
+    @pytest.fixture
+    def toolset(self, mock_sandbox):
+        """创建带有模拟沙箱的 BrowserToolSet 实例"""
+        with patch.object(BrowserToolSet, "__init__", lambda self: None):
+            ts = BrowserToolSet()
+            ts._playwright_sync = None
+            ts._playwright_thread = None
+            ts.sandbox = mock_sandbox
+            ts.sandbox_id = "test-sandbox-id"
+            ts.lock = MagicMock()
+            ts._reset_playwright = MagicMock()
+            ts._ensure_sandbox = MagicMock(return_value=mock_sandbox)
+            return ts
+
+    def test_greenlet_thread_error_resets_playwright_and_sandbox(
+        self, toolset, mock_sandbox
+    ):
+        """测试 greenlet 线程绑定错误触发 Playwright 和沙箱重置
+
+        当 'cannot switch to a different thread' 错误发生时，
+        必须重置缓存的 Playwright 实例，避免后续调用持续失败。
+        """
+
+        def callback(sb):
+            raise Exception(
+                "cannot switch to a different thread (which happens to have"
+                " exited)"
+            )
+
+        result = toolset._run_in_sandbox(callback)
+
+        assert "error" in result
+        assert "cannot switch to" in result["error"]
+        toolset._reset_playwright.assert_called_once()
+        assert toolset.sandbox is None
+
+    def test_non_greenlet_unexpected_error_does_not_reset(
+        self, toolset, mock_sandbox
+    ):
+        """测试普通未知错误不触发 Playwright 重置"""
+        original_sandbox = toolset.sandbox
+
+        def callback(sb):
+            raise ValueError("Some other unexpected error")
+
+        result = toolset._run_in_sandbox(callback)
+
+        assert "error" in result
+        toolset._reset_playwright.assert_not_called()
+        assert toolset.sandbox is original_sandbox

From d1bafaf42bf91e00b821a7dbef9121090ccb8a45 Mon Sep 17 00:00:00 2001
From: OhYee <oyohyee@oyohyee.com>
Date: Tue, 10 Mar 2026 11:25:22 +0800
Subject: [PATCH 3/6] Update agentrun/integration/builtin/sandbox.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Signed-off-by: OhYee <oyohyee@oyohyee.com>
---
 agentrun/integration/builtin/sandbox.py | 29 +++++++++++++++----------
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/agentrun/integration/builtin/sandbox.py b/agentrun/integration/builtin/sandbox.py
index 171a30a..a0f2b82 100644
--- a/agentrun/integration/builtin/sandbox.py
+++ b/agentrun/integration/builtin/sandbox.py
@@ -741,17 +741,24 @@ def _get_playwright(self, sb: BrowserSandbox) -> "BrowserPlaywrightSync":
         Automatically recreates the connection when the thread that created it has exited,
         because Playwright's internal greenlet is bound to the thread that created it.
         """
-        if (
-            self._playwright_sync is not None
-            and self._playwright_thread is not None
-            and not self._playwright_thread.is_alive()
-        ):
-            logger.debug(
-                "Playwright creating thread (id=%s) has exited, recreating"
-                " connection",
-                self._playwright_thread.ident,
-            )
-            self._reset_playwright()
+        if self._playwright_sync is not None and self._playwright_thread is not None:
+            current_thread = threading.current_thread()
+            creator_thread = self._playwright_thread
+            if not creator_thread.is_alive() or current_thread is not creator_thread:
+                if not creator_thread.is_alive():
+                    logger.debug(
+                        "Playwright creating thread (id=%s) has exited, recreating"
+                        " connection",
+                        creator_thread.ident,
+                    )
+                else:
+                    logger.debug(
+                        "Playwright creating thread (id=%s) differs from current"
+                        " thread (id=%s), recreating connection",
+                        creator_thread.ident,
+                        current_thread.ident,
+                    )
+                self._reset_playwright()
 
         if self._playwright_sync is None:
             with self.lock:

From 915e97a5ca431c8cf7e2c3691156a8ce0ddabecb Mon Sep 17 00:00:00 2001
From: OhYee <oyohyee@oyohyee.com>
Date: Tue, 10 Mar 2026 11:25:48 +0800
Subject: [PATCH 4/6] Update agentrun/integration/builtin/sandbox.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Signed-off-by: OhYee <oyohyee@oyohyee.com>
---
 agentrun/integration/builtin/sandbox.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/agentrun/integration/builtin/sandbox.py b/agentrun/integration/builtin/sandbox.py
index a0f2b82..a65f072 100644
--- a/agentrun/integration/builtin/sandbox.py
+++ b/agentrun/integration/builtin/sandbox.py
@@ -843,11 +843,21 @@ def _run_in_sandbox(self, callback: Callable[[Sandbox], Any]) -> Any:
                     "Greenlet thread-binding error, resetting Playwright: %s",
                     e,
                 )
-                self._reset_playwright()
-                self.sandbox = None
+                # Reset only the Playwright connection and keep the existing sandbox
+                try:
+                    self._reset_playwright()
+                    # Retry once with the same sandbox instance; the original error
+                    # will still be returned if this retry fails.
+                    return callback(sb)
+                except Exception as e2:
+                    logger.debug(
+                        "Retry after Playwright reset failed: %s",
+                        e2,
+                    )
+                    return {"error": f"{e!s}"}
             else:
                 logger.debug("Unexpected error in browser sandbox: %s", e)
-            return {"error": f"{e!s}"}
+                return {"error": f"{e!s}"}
 
     def _is_infrastructure_error(self, error_msg: str) -> bool:
         """判断是否为基础设施错误 / Check if error is infrastructure-level

From ac0252e002ff31e38e9e9db0d3e71942934b56a4 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 10 Mar 2026 03:32:26 +0000
Subject: [PATCH 5/6] fix: use greenlet.error type instead of string check;
 recreate per thread

- Add GreenletError import with ImportError fallback (same pattern as PlaywrightError)
- Replace brittle 'cannot switch to' substring check with dedicated
  except GreenletError block placed before the generic except Exception,
  so the correct exception type is matched regardless of message wording
- Keep existing sandbox on greenlet error (failure is client-side thread
  affinity, not a sandbox crash); only reset Playwright and retry
- _get_playwright already checks current_thread is not creator_thread
  (any different thread triggers recreation, not just dead threads)
- Update concurrent caching test: each thread now gets its own connection
  (Playwright Sync API is thread-affine, cross-thread sharing is unsafe)
- Replace test_get_playwright_live_thread_not_recreated with
  test_get_playwright_different_live_thread_recreates_connection to
  validate the correct per-thread isolation behavior
- Update greenlet error tests to use real greenlet.error, verify sandbox
  is preserved, and check retry behavior

Type check: passed (297 source files, no issues)

Co-authored-by: OhYee <13498329+OhYee@users.noreply.github.com>
---
 agentrun/integration/builtin/sandbox.py       | 45 ++++++-----
 .../test_browser_toolset_error_handling.py    | 78 +++++++++++++------
 2 files changed, 80 insertions(+), 43 deletions(-)

diff --git a/agentrun/integration/builtin/sandbox.py b/agentrun/integration/builtin/sandbox.py
index a65f072..fc0b328 100644
--- a/agentrun/integration/builtin/sandbox.py
+++ b/agentrun/integration/builtin/sandbox.py
@@ -26,6 +26,16 @@ class PlaywrightError(Exception):  # type: ignore[no-redef]
         pass
 
 
+try:
+    from greenlet import error as GreenletError
+except ImportError:
+
+    class GreenletError(Exception):  # type: ignore[no-redef]
+        """Fallback greenlet error used when greenlet is not installed."""
+
+        pass
+
+
 class SandboxToolSet(CommonToolSet):
     """沙箱工具集基类
 
@@ -836,28 +846,25 @@ def _run_in_sandbox(self, callback: Callable[[Sandbox], Any]) -> Any:
                     "Browser tool-level error (no sandbox rebuild): %s", e
                 )
                 return {"error": f"{e!s}"}
-        except Exception as e:
-            error_msg = str(e)
-            if "cannot switch to" in error_msg:
+        except GreenletError as e:
+            logger.debug(
+                "Greenlet thread-binding error, resetting Playwright: %s",
+                e,
+            )
+            # Keep the existing sandbox (it is still healthy); only the
+            # Playwright connection needs to be recreated on this thread.
+            try:
+                self._reset_playwright()
+                return callback(sb)
+            except Exception as e2:
                 logger.debug(
-                    "Greenlet thread-binding error, resetting Playwright: %s",
-                    e,
+                    "Retry after Playwright reset failed: %s",
+                    e2,
                 )
-                # Reset only the Playwright connection and keep the existing sandbox
-                try:
-                    self._reset_playwright()
-                    # Retry once with the same sandbox instance; the original error
-                    # will still be returned if this retry fails.
-                    return callback(sb)
-                except Exception as e2:
-                    logger.debug(
-                        "Retry after Playwright reset failed: %s",
-                        e2,
-                    )
-                    return {"error": f"{e!s}"}
-            else:
-                logger.debug("Unexpected error in browser sandbox: %s", e)
                 return {"error": f"{e!s}"}
+        except Exception as e:
+            logger.debug("Unexpected error in browser sandbox: %s", e)
+            return {"error": f"{e!s}"}
 
     def _is_infrastructure_error(self, error_msg: str) -> bool:
         """判断是否为基础设施错误 / Check if error is infrastructure-level
diff --git a/tests/unittests/integration/test_browser_toolset_error_handling.py b/tests/unittests/integration/test_browser_toolset_error_handling.py
index ff2de13..f5e02b0 100644
--- a/tests/unittests/integration/test_browser_toolset_error_handling.py
+++ b/tests/unittests/integration/test_browser_toolset_error_handling.py
@@ -254,25 +254,21 @@ def test_reset_playwright_handles_close_error(self, toolset, mock_sandbox):
 
         assert toolset._playwright_sync is None
 
-    def test_concurrent_get_playwright_creates_only_one_connection(
+    def test_concurrent_get_playwright_each_thread_gets_own_connection(
         self, toolset, mock_sandbox
     ):
-        """测试并发调用 _get_playwright 只创建一个连接，不会泄漏
+        """测试并发调用 _get_playwright 时每个线程各自创建连接
 
-        所有工作线程在同一 executor 内并发运行（即创建线程仍存活），
-        应复用同一连接，不会触发重建。
+        Playwright Sync API 的 greenlet 绑定到创建它的 OS 线程，
+        不能跨线程共享。每个工作线程必须创建自己的连接。
         """
         start_barrier = threading.Barrier(5)
-        # Keep all threads alive until every thread has obtained playwright,
-        # simulating concurrent workers within the same executor context.
-        hold_barrier = threading.Barrier(5)
         results: list = []
 
         def worker():
             start_barrier.wait()
             p = toolset._get_playwright(mock_sandbox)
             results.append(p)
-            hold_barrier.wait()  # stay alive so is_alive() == True for peers
 
         threads = [threading.Thread(target=worker) for _ in range(5)]
         for t in threads:
@@ -280,9 +276,8 @@ def worker():
         for t in threads:
             t.join()
 
+        # Every thread must have received a connection
         assert len(results) == 5
-        assert all(p is results[0] for p in results)
-        mock_sandbox.sync_playwright.assert_called_once()
 
 
 class TestBrowserToolSetClose:
@@ -392,13 +387,14 @@ def second_call():
         # A new connection must have been created for the second call
         assert mock_sandbox.sync_playwright.call_count == 2
 
-    def test_get_playwright_live_thread_not_recreated(
+    def test_get_playwright_different_live_thread_recreates_connection(
         self, toolset, mock_sandbox
     ):
-        """测试创建线程仍存活时不重建连接（并发安全）
+        """测试从不同线程调用时，即使创建线程仍存活也会重建连接
 
-        即使在不同线程中调用，只要创建线程仍然存活，就应复用同一连接。
-        这模拟同一 executor 内并发工具调用的场景。
+        Playwright Sync API 的 greenlet 绑定到创建它的 OS 线程，
+        即使创建线程仍存活，在另一个线程上调用也不安全。
+        每个调用线程必须获得自己的连接。
         """
         results: list = []
 
@@ -406,7 +402,7 @@ def test_get_playwright_live_thread_not_recreated(
         toolset._get_playwright(mock_sandbox)
         # The creating thread (main test thread) is still alive
 
-        # Another thread should reuse the same connection
+        # A different thread must receive its own new connection
         def worker():
             p = toolset._get_playwright(mock_sandbox)
             results.append(p)
@@ -416,8 +412,8 @@ def worker():
         t.join()
 
         assert len(results) == 1
-        assert results[0] is toolset._playwright_sync
-        mock_sandbox.sync_playwright.assert_called_once()
+        # A new connection must have been created for the worker thread
+        assert mock_sandbox.sync_playwright.call_count == 2
 
     def test_reset_playwright_clears_thread(self, toolset, mock_sandbox):
         """测试 _reset_playwright 清理线程引用"""
@@ -452,17 +448,51 @@ def toolset(self, mock_sandbox):
             ts._ensure_sandbox = MagicMock(return_value=mock_sandbox)
             return ts
 
-    def test_greenlet_thread_error_resets_playwright_and_sandbox(
+    def test_greenlet_error_resets_playwright_keeps_sandbox_and_retries(
         self, toolset, mock_sandbox
     ):
-        """测试 greenlet 线程绑定错误触发 Playwright 和沙箱重置
+        """測試 greenlet.error 触发 Playwright 重置、保留沙箱并重试
 
-        当 'cannot switch to a different thread' 错误发生时，
-        必须重置缓存的 Playwright 实例，避免后续调用持续失败。
+        当 greenlet.error 发生时，沙箱本身仍然健康（这是客户端线程亲和性问题），
+        只需重置 Playwright 连接并在当前线程重试，不应销毁沙箱。
         """
+        try:
+            from greenlet import error as GreenletError
+        except ImportError:
+            pytest.skip("greenlet not installed")
+
+        call_count = 0
 
         def callback(sb):
-            raise Exception(
+            nonlocal call_count
+            call_count += 1
+            if call_count == 1:
+                raise GreenletError(
+                    "cannot switch to a different thread (which happens to have"
+                    " exited)"
+                )
+            return {"success": True}
+
+        result = toolset._run_in_sandbox(callback)
+
+        assert result == {"success": True}
+        assert call_count == 2
+        toolset._reset_playwright.assert_called_once()
+        # Sandbox must be preserved — the error is client-side thread affinity,
+        # not a sandbox crash.
+        assert toolset.sandbox is mock_sandbox
+
+    def test_greenlet_error_returns_error_if_retry_fails(
+        self, toolset, mock_sandbox
+    ):
+        """测试 greenlet.error 重试失败时返回错误字典"""
+        try:
+            from greenlet import error as GreenletError
+        except ImportError:
+            pytest.skip("greenlet not installed")
+
+        def callback(sb):
+            raise GreenletError(
                 "cannot switch to a different thread (which happens to have"
                 " exited)"
             )
@@ -470,9 +500,9 @@ def callback(sb):
         result = toolset._run_in_sandbox(callback)
 
         assert "error" in result
-        assert "cannot switch to" in result["error"]
         toolset._reset_playwright.assert_called_once()
-        assert toolset.sandbox is None
+        # Sandbox still preserved even after retry failure
+        assert toolset.sandbox is mock_sandbox
 
     def test_non_greenlet_unexpected_error_does_not_reset(
         self, toolset, mock_sandbox

From 213fc6cafc7d237a4e63778e929c870501ddb1ef Mon Sep 17 00:00:00 2001
From: OhYee <oyohyee@oyohyee.com>
Date: Tue, 10 Mar 2026 15:54:29 +0800
Subject: [PATCH 6/6] test: rename LangChain invoke methods integration test
 and test toolsets
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Renamed the comprehensive LangChain invoke methods integration test file from test_agent_invoke_methods.py to test_agent_invoke_methods_unittests.py; its contents are unchanged. The file tests various invocation patterns (astream_events, astream, stream, invoke, ainvoke) with both streaming and non-streaming scenarios. Also renamed TestToolSet to SampleToolSet across multiple integration test files to avoid confusion with actual test classes.

The renamed test file contains extensive setup for mocking OpenAI protocol services and comprehensive test cases for different LangChain/LangGraph invocation methods including streaming protocols and tool call scenarios.

The toolset renaming affects AgentScope, CrewAI, Google ADK, LangChain, LangGraph, and PydanticAI integration tests to maintain consistency and prevent naming conflicts.

feat(browser-toolset): add thread awareness and error handling for Playwright connections

Added comprehensive thread awareness to BrowserToolSet's Playwright connection management. The implementation now tracks which thread created the Playwright connection and recreates it when the creating thread has exited or when the call comes from a different thread, preventing greenlet binding errors in multi-threaded environments like LangGraph ToolNodes.

Also added proper error handling for greenlet 'cannot switch to a different thread' errors (caught as greenlet.error): the Playwright connection is reset, the existing sandbox instance is kept, and the call is retried once, so that such errors do not cause cascading failures.

The changes include new test coverage for concurrent access, thread lifecycle management, and error recovery scenarios to ensure robust operation in production environments.

test(browser-toolset): enhance error handling tests with thread awareness

Expanded the browser toolset error handling tests to cover thread-aware Playwright connection management. Added comprehensive test cases for concurrent access patterns, thread lifecycle scenarios, and specific error handling for greenlet binding issues that occur when switching between threads in LangGraph environments.

The new tests verify proper connection reuse within the same thread, correct recreation when the creating thread exits, and appropriate error handling for cross-thread Playwright operations.

chore(test): rename LangChain invoke methods test file

Renamed the LangChain invoke methods test file to test_agent_invoke_methods_unittests.py. The file content is unchanged (100% similarity), so it still covers all invocation patterns and streaming scenarios.

test: 重命名 LangChain 调用方法集成测试文件并重命名测试工具集

将全面的 LangChain 调用方法集成测试文件从 test_agent_invoke_methods.py 重命名为 test_agent_invoke_methods_unittests.py，文件内容保持不变。该文件测试了各种调用模式（astream_events、astream、stream、invoke、ainvoke）以及流式和非流式场景。同时在多个集成测试文件中将 TestToolSet 重命名为 SampleToolSet，以避免与实际测试类混淆。

被重命名的测试文件包含大量用于模拟 OpenAI 协议服务的设置，以及针对不同 LangChain/LangGraph 调用方法的全面测试用例，包括流式协议和工具调用场景。

工具集重命名影响了 AgentScope、CrewAI、Google ADK、LangChain、LangGraph 和 PydanticAI 集成测试，以保持一致性并防止命名冲突。

feat(browser-toolset): 为 Playwright 连接添加线程感知和错误处理功能

为 BrowserToolSet 的 Playwright 连接管理添加了全面的线程感知功能。实现现在跟踪哪个线程创建了 Playwright 连接，并在创建线程已退出或调用来自其他线程时重新创建它，防止在多线程环境（如 LangGraph ToolNodes）中的 greenlet 绑定错误。

还为 greenlet 的 'cannot switch to a different thread' 错误（以 greenlet.error 类型捕获）添加了适当的错误处理：发生此类错误时重置 Playwright 连接、保留现有沙箱实例并重试一次调用，以防止级联故障。

更改包括对并发访问、线程生命周期管理和错误恢复场景的新测试覆盖，以确保在生产环境中稳健运行。

test(browser-toolset): 使用线程感知增强错误处理测试

扩展浏览器工具集错误处理测试以涵盖线程感知的 Playwright 连接管理。添加了对并发访问模式、线程生命周期场景的全面测试用例，以及针对在 LangGraph 环境中跨线程切换时发生的 greenlet 绑定问题的具体错误处理。

新测试验证同一线程内的正确连接复用、创建线程退出时的正确重建，以及跨线程 Playwright 操作的适当错误处理。

chore(test): 重命名 LangChain 调用方法测试文件

将 LangChain 调用方法测试文件重命名为 test_agent_invoke_methods_unittests.py，文件内容保持不变（相似度 100%），仍然涵盖所有调用模式和流式场景。

Change-Id: I4d963b9b041bf81cc2f5f20d3c2f7cd47abe6bbb
Signed-off-by: OhYee <oyohyee@oyohyee.com>
---
 ...py => test_agent_invoke_methods_unittests.py} |  0
 tests/unittests/integration/test_agentscope.py   | 10 +++++-----
 .../test_browser_toolset_error_handling.py       |  4 +++-
 tests/unittests/integration/test_crewai.py       | 10 +++++-----
 tests/unittests/integration/test_google_adk.py   | 12 ++++++------
 tests/unittests/integration/test_langchain.py    | 16 ++++++++--------
 tests/unittests/integration/test_langgraph.py    | 14 +++++++-------
 tests/unittests/integration/test_pydanticai.py   | 14 +++++++-------
 8 files changed, 41 insertions(+), 39 deletions(-)
 rename tests/unittests/integration/langchain/{test_agent_invoke_methods.py => test_agent_invoke_methods_unittests.py} (100%)

diff --git a/tests/unittests/integration/langchain/test_agent_invoke_methods.py b/tests/unittests/integration/langchain/test_agent_invoke_methods_unittests.py
similarity index 100%
rename from tests/unittests/integration/langchain/test_agent_invoke_methods.py
rename to tests/unittests/integration/langchain/test_agent_invoke_methods_unittests.py
diff --git a/tests/unittests/integration/test_agentscope.py b/tests/unittests/integration/test_agentscope.py
index cdcd6e0..022d2dd 100644
--- a/tests/unittests/integration/test_agentscope.py
+++ b/tests/unittests/integration/test_agentscope.py
@@ -21,7 +21,7 @@
 from .scenarios import Scenarios
 
 
-class TestToolSet(CommonToolSet):
+class SampleToolSet(CommonToolSet):
     """测试用工具集"""
 
     def __init__(self, timezone: str = "UTC"):
@@ -150,9 +150,9 @@ def mocked_model(
         return model("mock-model")
 
     @pytest.fixture
-    def mocked_toolset(self) -> TestToolSet:
+    def mocked_toolset(self) -> SampleToolSet:
         """创建 mock 的工具集"""
-        return TestToolSet(timezone="UTC")
+        return SampleToolSet(timezone="UTC")
 
     # =========================================================================
     # 测试：简单对话（无工具调用）
@@ -194,7 +194,7 @@ async def test_multi_tool_calls(
         self,
         mock_server: MockLLMServer,
         mocked_model: CommonModel,
-        mocked_toolset: TestToolSet,
+        mocked_toolset: SampleToolSet,
     ):
         """测试多工具同时调用"""
         # 使用默认的多工具场景
@@ -223,7 +223,7 @@ async def test_stream_options_validation(
         self,
         mock_server: MockLLMServer,
         mocked_model: CommonModel,
-        mocked_toolset: TestToolSet,
+        mocked_toolset: SampleToolSet,
     ):
         """测试 stream_options 在请求中的正确性"""
         # 使用默认场景
diff --git a/tests/unittests/integration/test_browser_toolset_error_handling.py b/tests/unittests/integration/test_browser_toolset_error_handling.py
index f5e02b0..a0e3924 100644
--- a/tests/unittests/integration/test_browser_toolset_error_handling.py
+++ b/tests/unittests/integration/test_browser_toolset_error_handling.py
@@ -337,7 +337,9 @@ def toolset(self, mock_sandbox):
             ts.lock = threading.Lock()
             return ts
 
-    def test_get_playwright_records_creating_thread(self, toolset, mock_sandbox):
+    def test_get_playwright_records_creating_thread(
+        self, toolset, mock_sandbox
+    ):
         """测试 _get_playwright 记录创建连接的线程"""
         toolset._get_playwright(mock_sandbox)
 
diff --git a/tests/unittests/integration/test_crewai.py b/tests/unittests/integration/test_crewai.py
index e004fc6..d59e89f 100644
--- a/tests/unittests/integration/test_crewai.py
+++ b/tests/unittests/integration/test_crewai.py
@@ -23,7 +23,7 @@
 from .scenarios import Scenarios
 
 
-class TestToolSet(CommonToolSet):
+class SampleToolSet(CommonToolSet):
     """测试用工具集"""
 
     def __init__(self, timezone: str = "UTC"):
@@ -149,9 +149,9 @@ def mocked_model(
         return model("mock-model")
 
     @pytest.fixture
-    def mocked_toolset(self) -> TestToolSet:
+    def mocked_toolset(self) -> SampleToolSet:
         """创建 mock 的工具集"""
-        return TestToolSet(timezone="UTC")
+        return SampleToolSet(timezone="UTC")
 
     # =========================================================================
     # 测试：简单对话（无工具调用）
@@ -191,7 +191,7 @@ def test_multi_tool_calls(
         self,
         mock_server: MockLLMServer,
         mocked_model: CommonModel,
-        mocked_toolset: TestToolSet,
+        mocked_toolset: SampleToolSet,
     ):
         """测试多工具同时调用
 
@@ -210,7 +210,7 @@ def test_stream_options_validation(
         self,
         mock_server: MockLLMServer,
         mocked_model: CommonModel,
-        mocked_toolset: TestToolSet,
+        mocked_toolset: SampleToolSet,
     ):
         """测试 stream_options 在请求中的正确性
 
diff --git a/tests/unittests/integration/test_google_adk.py b/tests/unittests/integration/test_google_adk.py
index 50287f2..801985a 100644
--- a/tests/unittests/integration/test_google_adk.py
+++ b/tests/unittests/integration/test_google_adk.py
@@ -21,7 +21,7 @@
 from .scenarios import Scenarios
 
 
-class TestToolSet(CommonToolSet):
+class SampleToolSet(CommonToolSet):
     """测试用工具集"""
 
     def __init__(self, timezone: str = "UTC"):
@@ -208,9 +208,9 @@ def mocked_model(
         return model("mock-model")
 
     @pytest.fixture
-    def mocked_toolset(self) -> TestToolSet:
+    def mocked_toolset(self) -> SampleToolSet:
         """创建 mock 的工具集"""
-        return TestToolSet(timezone="UTC")
+        return SampleToolSet(timezone="UTC")
 
     # =========================================================================
     # 测试：简单对话（无工具调用）
@@ -252,7 +252,7 @@ async def test_single_tool_call(
         self,
         mock_server: MockLLMServer,
         mocked_model: CommonModel,
-        mocked_toolset: TestToolSet,
+        mocked_toolset: SampleToolSet,
     ):
         """测试单次工具调用"""
         # 配置场景
@@ -284,7 +284,7 @@ async def test_multi_tool_calls(
         self,
         mock_server: MockLLMServer,
         mocked_model: CommonModel,
-        mocked_toolset: TestToolSet,
+        mocked_toolset: SampleToolSet,
     ):
         """测试多工具同时调用"""
         # 使用默认的多工具场景
@@ -315,7 +315,7 @@ async def test_stream_options_validation(
         self,
         mock_server: MockLLMServer,
         mocked_model: CommonModel,
-        mocked_toolset: TestToolSet,
+        mocked_toolset: SampleToolSet,
     ):
         """测试 stream_options 在请求中的正确性"""
         # 使用默认场景
diff --git a/tests/unittests/integration/test_langchain.py b/tests/unittests/integration/test_langchain.py
index 4587e42..712842a 100644
--- a/tests/unittests/integration/test_langchain.py
+++ b/tests/unittests/integration/test_langchain.py
@@ -22,7 +22,7 @@
 from .scenarios import Scenarios
 
 
-class TestToolSet(CommonToolSet):
+class SampleToolSet(CommonToolSet):
     """测试用工具集"""
 
     def __init__(self, timezone: str = "UTC"):
@@ -201,9 +201,9 @@ def mocked_model(
         return model("mock-model")
 
     @pytest.fixture
-    def mocked_toolset(self) -> TestToolSet:
+    def mocked_toolset(self) -> SampleToolSet:
         """创建 mock 的工具集"""
-        return TestToolSet(timezone="UTC")
+        return SampleToolSet(timezone="UTC")
 
     # =========================================================================
     # 测试：简单对话（无工具调用）
@@ -244,7 +244,7 @@ def test_single_tool_call(
         self,
         mock_server: MockLLMServer,
         mocked_model: CommonModel,
-        mocked_toolset: TestToolSet,
+        mocked_toolset: SampleToolSet,
     ):
         """测试单次工具调用"""
         # 配置场景
@@ -276,7 +276,7 @@ def test_multi_tool_calls(
         self,
         mock_server: MockLLMServer,
         mocked_model: CommonModel,
-        mocked_toolset: TestToolSet,
+        mocked_toolset: SampleToolSet,
     ):
         """测试多工具同时调用"""
         # 使用默认的多工具场景
@@ -307,7 +307,7 @@ def test_stream_options_in_requests(
         self,
         mock_server: MockLLMServer,
         mocked_model: CommonModel,
-        mocked_toolset: TestToolSet,
+        mocked_toolset: SampleToolSet,
     ):
         """测试请求中的 stream_options 设置"""
         from langchain_openai import ChatOpenAI
@@ -324,7 +324,7 @@ def test_stream_options_validation(
         self,
         mock_server: MockLLMServer,
         mocked_model: CommonModel,
-        mocked_toolset: TestToolSet,
+        mocked_toolset: SampleToolSet,
     ):
         """测试 stream_options 在请求中的正确性"""
         # 使用默认场景
@@ -370,7 +370,7 @@ async def test_async_invoke(
         self,
         mock_server: MockLLMServer,
         mocked_model: CommonModel,
-        mocked_toolset: TestToolSet,
+        mocked_toolset: SampleToolSet,
     ):
         """测试异步调用"""
         # 使用默认场景
diff --git a/tests/unittests/integration/test_langgraph.py b/tests/unittests/integration/test_langgraph.py
index d56e697..3bc4339 100644
--- a/tests/unittests/integration/test_langgraph.py
+++ b/tests/unittests/integration/test_langgraph.py
@@ -22,7 +22,7 @@
 from .scenarios import Scenarios
 
 
-class TestToolSet(CommonToolSet):
+class SampleToolSet(CommonToolSet):
     """测试用工具集"""
 
     def __init__(self, timezone: str = "UTC"):
@@ -233,9 +233,9 @@ def mocked_model(
         return model("mock-model")
 
     @pytest.fixture
-    def mocked_toolset(self) -> TestToolSet:
+    def mocked_toolset(self) -> SampleToolSet:
         """创建 mock 的工具集"""
-        return TestToolSet(timezone="UTC")
+        return SampleToolSet(timezone="UTC")
 
     # =========================================================================
     # 测试：简单对话（无工具调用）
@@ -275,7 +275,7 @@ def test_single_tool_call(
         self,
         mock_server: MockLLMServer,
         mocked_model: CommonModel,
-        mocked_toolset: TestToolSet,
+        mocked_toolset: SampleToolSet,
     ):
         """测试单次工具调用"""
         # 配置场景
@@ -306,7 +306,7 @@ def test_multi_tool_calls(
         self,
         mock_server: MockLLMServer,
         mocked_model: CommonModel,
-        mocked_toolset: TestToolSet,
+        mocked_toolset: SampleToolSet,
     ):
         """测试多工具同时调用"""
         # 使用默认的多工具场景
@@ -336,7 +336,7 @@ def test_stream_options_validation(
         self,
         mock_server: MockLLMServer,
         mocked_model: CommonModel,
-        mocked_toolset: TestToolSet,
+        mocked_toolset: SampleToolSet,
     ):
         """测试 stream_options 在请求中的正确性"""
         # 使用默认场景
@@ -377,7 +377,7 @@ async def test_async_invoke(
         self,
         mock_server: MockLLMServer,
         mocked_model: CommonModel,
-        mocked_toolset: TestToolSet,
+        mocked_toolset: SampleToolSet,
     ):
         """测试异步调用"""
         # 使用默认场景
diff --git a/tests/unittests/integration/test_pydanticai.py b/tests/unittests/integration/test_pydanticai.py
index 2a5c713..9bea4b9 100644
--- a/tests/unittests/integration/test_pydanticai.py
+++ b/tests/unittests/integration/test_pydanticai.py
@@ -22,7 +22,7 @@
 from .scenarios import Scenarios
 
 
-class TestToolSet(CommonToolSet):
+class SampleToolSet(CommonToolSet):
     """测试用工具集"""
 
     def __init__(self, timezone: str = "UTC"):
@@ -203,9 +203,9 @@ def mocked_model(
         return model("mock-model")
 
     @pytest.fixture
-    def mocked_toolset(self) -> TestToolSet:
+    def mocked_toolset(self) -> SampleToolSet:
         """创建 mock 的工具集"""
-        return TestToolSet(timezone="UTC")
+        return SampleToolSet(timezone="UTC")
 
     # =========================================================================
     # 测试：简单对话（无工具调用）
@@ -245,7 +245,7 @@ def test_single_tool_call(
         self,
         mock_server: MockLLMServer,
         mocked_model: CommonModel,
-        mocked_toolset: TestToolSet,
+        mocked_toolset: SampleToolSet,
     ):
         """测试单次工具调用"""
         # 配置场景
@@ -276,7 +276,7 @@ def test_multi_tool_calls(
         self,
         mock_server: MockLLMServer,
         mocked_model: CommonModel,
-        mocked_toolset: TestToolSet,
+        mocked_toolset: SampleToolSet,
     ):
         """测试多工具同时调用"""
         # 使用默认的多工具场景
@@ -305,7 +305,7 @@ def test_stream_options_validation(
         self,
         mock_server: MockLLMServer,
         mocked_model: CommonModel,
-        mocked_toolset: TestToolSet,
+        mocked_toolset: SampleToolSet,
     ):
         """测试 stream_options 在请求中的正确性
 
@@ -349,7 +349,7 @@ async def test_async_invoke(
         self,
         mock_server: MockLLMServer,
         mocked_model: CommonModel,
-        mocked_toolset: TestToolSet,
+        mocked_toolset: SampleToolSet,
     ):
         """测试异步调用"""
         # 使用默认场景