fixes #5: fail pending shell replies when the reader dies, channels stop, or a stop_on_error request errors

jph00 · jph00 · commit 6aafc0142bbc · 2026-03-05T16:26:18.000+10:00
diff --git a/conkernelclient/_modidx.py b/conkernelclient/_modidx.py
@@ -8,6 +8,10 @@
   'syms': { 'conkernelclient.core': { 'conkernelclient.core.ConKernelClient': ('core.html#conkernelclient', 'conkernelclient/core.py'),
                                       'conkernelclient.core.ConKernelClient._async_recv_reply': ( 'core.html#conkernelclient._async_recv_reply',
                                                                                                   'conkernelclient/core.py'),
+                                      'conkernelclient.core.ConKernelClient._check_alive': ( 'core.html#conkernelclient._check_alive',
+                                                                                             'conkernelclient/core.py'),
+                                      'conkernelclient.core.ConKernelClient._fail_pending': ( 'core.html#conkernelclient._fail_pending',
+                                                                                              'conkernelclient/core.py'),
                                       'conkernelclient.core.ConKernelClient.execute': ( 'core.html#conkernelclient.execute',
                                                                                         'conkernelclient/core.py'),
                                       'conkernelclient.core.ConKernelClient.start_channels': ( 'core.html#conkernelclient.start_channels',
diff --git a/conkernelclient/core.py b/conkernelclient/core.py
@@ -15,7 +15,10 @@
 from jupyter_client.kernelspec import KernelSpec
 from jupyter_client import AsyncKernelManager
 from traitlets import Type
-import asyncio, zmq.asyncio, time
+import asyncio, zmq.asyncio, time, logging
+
+# %% ../nbs/00_core.ipynb #737a0fc1
+_log = logging.getLogger(__name__)
 
 # %% ../nbs/00_core.ipynb #374b75d0
 if not hasattr(Session, '_orig_send'): Session._orig_send = Session.send
@@ -36,6 +39,17 @@ def _send(self, stream, msg_or_type, content=None, parent=None, ident=None,
 
 # %% ../nbs/00_core.ipynb #d6a5fa6a
 class ConKernelClient(AsyncKernelClient):
+    def _fail_pending(self, exc:Exception, skip=None):
+        for k,(q,_) in list(getattr(self, '_pending', {}).items()):
+            if k != skip:
+                try: q.put_nowait(exc)
+                except asyncio.QueueFull: pass
+
+    def _check_alive(self):
+        if not self.channels_running: raise RuntimeError("Channels not running")
+        tk = getattr(self, '_shell_reader_task', None)
+        return tk is not None and not tk.done()
+
     async def start_channels(self, shell:bool=True, iopub:bool=True, stdin:bool=True, hb:bool=True, control:bool=True):
         "Start channels, wait for ready, and launch background shell-reply reader"
         super().start_channels(shell=shell, iopub=iopub, stdin=stdin, hb=hb, control=control)
@@ -47,19 +61,28 @@ async def _reader():
             while True:
                 try: reply = await self.get_shell_msg(timeout=None)
                 except Exception as e:
-                    for q in self._pending.values(): await q.put(e)
-                    if self._pending: logging.warning(f"_reader died with pending - {self._pending}: {e}")
-                    else: logging.warning(f"_reader died with no pending: {e}")
+                    self._fail_pending(e)
+                    _log.warning(f"_reader died, pending={list(self._pending)}: {e}")
                     break
-                q = self._pending.get(reply["parent_header"].get("msg_id"))
-                if q: await q.put(reply)
+                mid = reply["parent_header"].get("msg_id")
+                pend = self._pending.get(mid)
+                if pend:
+                    q, soe = pend
+                    try: q.put_nowait(reply)
+                    except asyncio.QueueFull: pass
+                else: _log.warning(f"Orphan reply for {reply['parent_header'].get('msg_id')}, pending={list(self._pending)}")
+                cts = reply.get("content", {})
+                if cts.get("status") in ("error", "aborted") and pend and soe:
+                    exc = RuntimeError(f"Kernel error aborted: {cts.get('ename')}: {cts.get('evalue')}")
+                    self._fail_pending(exc, skip=mid)
         self._shell_reader_task = asyncio.create_task(_reader())
         await _ready.wait()
         await asyncio.sleep(0.2)
         return self
 
     def stop_channels(self):
         "Stop channels and cancel the background shell-reply reader task"
+        self._fail_pending(RuntimeError("Shell channels stopped before reply"))
         super().stop_channels()
         if (tk := getattr(self, '_shell_reader_task', None)):
             tk.cancel()
@@ -68,26 +91,29 @@ def stop_channels(self):
 
     async def _async_recv_reply(self, msg_id, timeout=None, channel="shell"):
         if channel == "control": return await self._async_get_control_msg(timeout=timeout)
-        q = self._pending[msg_id]
+        q, _ = self._pending[msg_id]
         try:
             res = await asyncio.wait_for(q.get(), timeout)
             if isinstance(res, Exception): raise res
             return res
-        except asyncio.TimeoutError as e: raise TimeoutError("Timeout waiting for reply") from e
+        except asyncio.TimeoutError as e:  # CancelledError is intentionally not caught: cancellation must propagate, not surface as a timeout
+            _log.warning(f"Timeout for {msg_id}, pending={list(self._pending)}")
+            raise TimeoutError("Timeout waiting for reply") from e
         finally: self._pending.pop(msg_id, None)
 
     def execute(self, code, user_expressions=None, allow_stdin=None, reply=False, subsh_id=None,
-                cts_typ='code', timeout=60, msg_id=None, **kw):
+                cts_typ='code', timeout=60, msg_id=None, stop_on_error=True, **kw):
         "Send an execute request, returning a coroutine for the reply if `reply`, else the msg_id"
+        if not self._check_alive(): return asyncio.sleep(0) if reply else None
         if user_expressions is None: user_expressions = {}
         if allow_stdin is None: allow_stdin = self.allow_stdin
-        content = dict(user_expressions=user_expressions, allow_stdin=allow_stdin, subsh_id=subsh_id, **kw)
+        content = dict(user_expressions=user_expressions, allow_stdin=allow_stdin, subsh_id=subsh_id, stop_on_error=stop_on_error, **kw)
         content[cts_typ] = code
         msg = self.session.msg("execute_request", content)
         if msg_id is not None: msg["header"]["msg_id"] = msg_id
         if subsh_id is not None: msg["header"]["subshell_id"] = subsh_id
         msg_id = msg["header"]["msg_id"]
-        if reply: self._pending[msg_id] = asyncio.Queue(maxsize=1)
+        if reply: self._pending[msg_id] = (asyncio.Queue(maxsize=1), stop_on_error)
         self.shell_channel.send(msg)
         if not reply: return msg_id
         return self._async_recv_reply(msg_id, timeout=timeout)
diff --git a/nbs/00_core.ipynb b/nbs/00_core.ipynb
@@ -45,7 +45,7 @@
     "from jupyter_client.kernelspec import KernelSpec\n",
     "from jupyter_client import AsyncKernelManager\n",
     "from traitlets import Type\n",
-    "import asyncio, zmq.asyncio, time"
+    "import asyncio, zmq.asyncio, time, logging"
    ]
   },
   {
@@ -59,6 +59,17 @@
     "from fastcore.utils import patch"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "737a0fc1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#| export\n",
+    "_log = logging.getLogger(__name__)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "665c28bb",
@@ -101,6 +112,17 @@
    "source": [
     "#| export\n",
     "class ConKernelClient(AsyncKernelClient):\n",
+    "    def _fail_pending(self, exc:Exception, skip=None):\n",
+    "        for k,(q,_) in list(getattr(self, '_pending', {}).items()):\n",
+    "            if k != skip:\n",
+    "                try: q.put_nowait(exc)\n",
+    "                except asyncio.QueueFull: pass\n",
+    "\n",
+    "    def _check_alive(self):\n",
+    "        if not self.channels_running: raise RuntimeError(\"Channels not running\")\n",
+    "        tk = getattr(self, '_shell_reader_task', None)\n",
+    "        return tk is not None and not tk.done()\n",
+    "\n",
     "    async def start_channels(self, shell:bool=True, iopub:bool=True, stdin:bool=True, hb:bool=True, control:bool=True):\n",
     "        \"Start channels, wait for ready, and launch background shell-reply reader\"\n",
     "        super().start_channels(shell=shell, iopub=iopub, stdin=stdin, hb=hb, control=control)\n",
@@ -112,19 +134,28 @@
     "            while True:\n",
     "                try: reply = await self.get_shell_msg(timeout=None)\n",
     "                except Exception as e:\n",
-    "                    for q in self._pending.values(): await q.put(e)\n",
-    "                    if self._pending: logging.warning(f\"_reader died with pending - {self._pending}: {e}\")\n",
-    "                    else: logging.warning(f\"_reader died with no pending: {e}\")\n",
+    "                    self._fail_pending(e)\n",
+    "                    _log.warning(f\"_reader died, pending={list(self._pending)}: {e}\")\n",
     "                    break\n",
-    "                q = self._pending.get(reply[\"parent_header\"].get(\"msg_id\"))\n",
-    "                if q: await q.put(reply)\n",
+    "                mid = reply[\"parent_header\"].get(\"msg_id\")\n",
+    "                pend = self._pending.get(mid)\n",
+    "                if pend:\n",
+    "                    q, soe = pend\n",
+    "                    try: q.put_nowait(reply)\n",
+    "                    except asyncio.QueueFull: pass\n",
+    "                else: _log.warning(f\"Orphan reply for {reply['parent_header'].get('msg_id')}, pending={list(self._pending)}\")\n",
+    "                cts = reply.get(\"content\", {})\n",
+    "                if cts.get(\"status\") in (\"error\", \"aborted\") and pend and soe:\n",
+    "                    exc = RuntimeError(f\"Kernel error aborted: {cts.get('ename')}: {cts.get('evalue')}\")\n",
+    "                    self._fail_pending(exc, skip=mid)\n",
     "        self._shell_reader_task = asyncio.create_task(_reader())\n",
     "        await _ready.wait()\n",
     "        await asyncio.sleep(0.2)\n",
     "        return self\n",
     "\n",
     "    def stop_channels(self):\n",
     "        \"Stop channels and cancel the background shell-reply reader task\"\n",
+    "        self._fail_pending(RuntimeError(\"Shell channels stopped before reply\"))\n",
     "        super().stop_channels()\n",
     "        if (tk := getattr(self, '_shell_reader_task', None)):\n",
     "            tk.cancel()\n",
@@ -133,26 +164,29 @@
     "\n",
     "    async def _async_recv_reply(self, msg_id, timeout=None, channel=\"shell\"):\n",
     "        if channel == \"control\": return await self._async_get_control_msg(timeout=timeout)\n",
-    "        q = self._pending[msg_id]\n",
+    "        q, _ = self._pending[msg_id]\n",
     "        try:\n",
     "            res = await asyncio.wait_for(q.get(), timeout)\n",
     "            if isinstance(res, Exception): raise res\n",
     "            return res\n",
-    "        except asyncio.TimeoutError as e: raise TimeoutError(\"Timeout waiting for reply\") from e\n",
+    "        except asyncio.TimeoutError as e:  # CancelledError is intentionally not caught: cancellation must propagate, not surface as a timeout\n",
+    "            _log.warning(f\"Timeout for {msg_id}, pending={list(self._pending)}\")\n",
+    "            raise TimeoutError(\"Timeout waiting for reply\") from e\n",
     "        finally: self._pending.pop(msg_id, None)\n",
     "\n",
     "    def execute(self, code, user_expressions=None, allow_stdin=None, reply=False, subsh_id=None,\n",
-    "                cts_typ='code', timeout=60, msg_id=None, **kw):\n",
+    "                cts_typ='code', timeout=60, msg_id=None, stop_on_error=True, **kw):\n",
     "        \"Send an execute request, returning a coroutine for the reply if `reply`, else the msg_id\"\n",
+    "        if not self._check_alive(): return asyncio.sleep(0) if reply else None\n",
     "        if user_expressions is None: user_expressions = {}\n",
     "        if allow_stdin is None: allow_stdin = self.allow_stdin\n",
-    "        content = dict(user_expressions=user_expressions, allow_stdin=allow_stdin, subsh_id=subsh_id, **kw)\n",
+    "        content = dict(user_expressions=user_expressions, allow_stdin=allow_stdin, subsh_id=subsh_id, stop_on_error=stop_on_error, **kw)\n",
     "        content[cts_typ] = code\n",
     "        msg = self.session.msg(\"execute_request\", content)\n",
     "        if msg_id is not None: msg[\"header\"][\"msg_id\"] = msg_id\n",
     "        if subsh_id is not None: msg[\"header\"][\"subshell_id\"] = subsh_id\n",
     "        msg_id = msg[\"header\"][\"msg_id\"]\n",
-    "        if reply: self._pending[msg_id] = asyncio.Queue(maxsize=1)\n",
+    "        if reply: self._pending[msg_id] = (asyncio.Queue(maxsize=1), stop_on_error)\n",
     "        self.shell_channel.send(msg)\n",
     "        if not reply: return msg_id\n",
     "        return self._async_recv_reply(msg_id, timeout=timeout)\n",
@@ -212,6 +246,13 @@
      "execution_count": null,
      "metadata": {},
      "output_type": "execute_result"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Orphan reply for 30c03d32-82cd57ee2bae596f194f9300_34664_1, pending=[]\n"
+     ]
     }
    ],
    "source": [
@@ -228,7 +269,7 @@
     {
      "data": {
       "text/plain": [
-       "'95164565-1b052c74632b03fbe217b8de_3713_1'"
+       "'30c03d32-82cd57ee2bae596f194f9300_34664_1'"
       ]
      },
      "execution_count": null,
@@ -293,11 +334,11 @@
     {
      "data": {
       "text/plain": [
-       "{'msg_id': '95164565-1b052c74632b03fbe217b8de_3713_1',\n",
+       "{'msg_id': '30c03d32-82cd57ee2bae596f194f9300_34664_1',\n",
        " 'msg_type': 'execute_request',\n",
        " 'username': 'jhoward',\n",
-       " 'session': '95164565-1b052c74632b03fbe217b8de',\n",
-       " 'date': datetime.datetime(2026, 2, 27, 3, 16, 39, 228656, tzinfo=tzutc()),\n",
+       " 'session': '30c03d32-82cd57ee2bae596f194f9300',\n",
+       " 'date': datetime.datetime(2026, 3, 5, 6, 20, 32, 255676, tzinfo=tzutc()),\n",
        " 'version': '5.4'}"
       ]
      },
@@ -325,7 +366,15 @@
    "execution_count": null,
    "id": "afb4f539",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Orphan reply for 30c03d32-82cd57ee2bae596f194f9300_34664_2, pending=[]\n"
+     ]
+    }
+   ],
    "source": [
     "kc = await km.client().start_channels()"
    ]
@@ -339,23 +388,23 @@
     {
      "data": {
       "text/plain": [
-       "{'header': {'msg_id': 'd40943ee-1c9991f3726c7b2c58e4e42c_3719_21',\n",
+       "{'header': {'msg_id': '3560f722-f0607ae043c870acc8743b8c_34684_21',\n",
        "  'msg_type': 'execute_reply',\n",
        "  'username': 'jhoward',\n",
-       "  'session': 'd40943ee-1c9991f3726c7b2c58e4e42c',\n",
-       "  'date': datetime.datetime(2026, 2, 27, 3, 16, 40, 156085, tzinfo=tzutc()),\n",
+       "  'session': '3560f722-f0607ae043c870acc8743b8c',\n",
+       "  'date': datetime.datetime(2026, 3, 5, 6, 20, 33, 234758, tzinfo=tzutc()),\n",
        "  'version': '5.4'},\n",
-       " 'msg_id': 'd40943ee-1c9991f3726c7b2c58e4e42c_3719_21',\n",
+       " 'msg_id': '3560f722-f0607ae043c870acc8743b8c_34684_21',\n",
        " 'msg_type': 'execute_reply',\n",
-       " 'parent_header': {'msg_id': '95164565-1b052c74632b03fbe217b8de_3713_1',\n",
+       " 'parent_header': {'msg_id': '30c03d32-82cd57ee2bae596f194f9300_34664_1',\n",
        "  'msg_type': 'execute_request',\n",
        "  'username': 'jhoward',\n",
-       "  'session': '95164565-1b052c74632b03fbe217b8de',\n",
-       "  'date': datetime.datetime(2026, 2, 27, 3, 16, 40, 151952, tzinfo=tzutc()),\n",
+       "  'session': '30c03d32-82cd57ee2bae596f194f9300',\n",
+       "  'date': datetime.datetime(2026, 3, 5, 6, 20, 33, 227523, tzinfo=tzutc()),\n",
        "  'version': '5.4'},\n",
-       " 'metadata': {'started': '2026-02-27T03:16:40.153095Z',\n",
+       " 'metadata': {'started': '2026-03-05T06:20:33.230196Z',\n",
        "  'dependencies_met': True,\n",
-       "  'engine': '45b26be5-508d-4fa0-8397-a8e45704f6da',\n",
+       "  'engine': 'fce6c486-de7f-4012-87a9-ef36d7d2ec76',\n",
        "  'status': 'ok'},\n",
        " 'content': {'status': 'ok',\n",
        "  'execution_count': 2,\n",
@@ -397,22 +446,11 @@
    "execution_count": null,
    "id": "0ccc7cca",
    "metadata": {},
-   "outputs": [],
-   "source": [
-    "a = kc.execute('x=2', reply=True)\n",
-    "b = kc.execute('y=3', reply=True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "2739f44c",
-   "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "'95164565-1b052c74632b03fbe217b8de_3713_5'"
+       "'30c03d32-82cd57ee2bae596f194f9300_34664_5'"
       ]
      },
      "execution_count": null,
@@ -421,11 +459,27 @@
     }
    ],
    "source": [
+    "a = kc.execute('x=2', reply=True)\n",
+    "b = kc.execute('y=3', reply=True)\n",
+    "\n",
     "r = await asyncio.wait_for(asyncio.gather(a,b), timeout=2)\n",
     "test_eq(len(r), 2)\n",
     "r[0]['parent_header']['msg_id']"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "27a2a287",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "async def g():\n",
+    "    for i in range(10): await kc.execute(f'a{i}={i}; a{i}', reply=True)\n",
+    "\n",
+    "r = await asyncio.wait_for(asyncio.gather(g(),g(),g(),g()), timeout=10)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,