From 33c67d540f69a6e1ff7eea20cce0821493f971cf Mon Sep 17 00:00:00 2001
From: GangGreenTemperTatum
 <104169244+GangGreenTemperTatum@users.noreply.github.com>
Date: Mon, 4 May 2026 14:47:29 -0400
Subject: [PATCH 1/2] Add interrupted tool result recovery hook to web-security

---
 capabilities/web-security/capability.yaml     |   5 +-
 .../hooks/interrupted_tool_result.py          | 156 +++++++++++
 .../test_interrupted_tool_result_hook.py      | 259 ++++++++++++++++++
 3 files changed, 419 insertions(+), 1 deletion(-)
 create mode 100644 capabilities/web-security/hooks/interrupted_tool_result.py
 create mode 100644 capabilities/web-security/tests/test_interrupted_tool_result_hook.py

diff --git a/capabilities/web-security/capability.yaml b/capabilities/web-security/capability.yaml
index aed0448..3601f2f 100644
--- a/capabilities/web-security/capability.yaml
+++ b/capabilities/web-security/capability.yaml
@@ -1,6 +1,6 @@
 schema: 1
 name: web-security
-version: "1.0.3"
+version: "1.0.4"
 description: >
   Web application penetration testing with 30+ attack technique playbooks
   covering request smuggling, cache poisoning, SSRF, SSTI, DOM
@@ -9,6 +9,9 @@ description: >
   integration via MCP, credential management, DNS rebinding, phone
   verification, and vulnerability verification.
 
+hooks:
+  - hooks/interrupted_tool_result.py
+
 mcp:
   servers:
     caido:
diff --git a/capabilities/web-security/hooks/interrupted_tool_result.py b/capabilities/web-security/hooks/interrupted_tool_result.py
new file mode 100644
index 0000000..58eecd8
--- /dev/null
+++ b/capabilities/web-security/hooks/interrupted_tool_result.py
@@ -0,0 +1,156 @@
+"""Recover from provider interruption sentinels after tool execution."""
+
+from __future__ import annotations
+
+import asyncio
+import re
+from dataclasses import dataclass
+
+from dreadnode.agents.events import AgentEnd, GenerationStep, ToolEnd, ToolError
+from dreadnode.agents.reactions import Continue
+from dreadnode.core.hook import hook
+
+_INTERRUPTION_SENTINEL = re.compile(  # the marker text; either square bracket is optional
+    r"^\[?\s*response interrupted by a tool call result\.\s*\]?$",
+    re.IGNORECASE,
+)
+_MAX_RECOVERIES_PER_AGENT = 2  # recoveries allowed before a sentinel turn is left alone
+_MAX_SUMMARY_CHARS = 600  # upper bound on a normalized summary, ellipsis included
+
+
+@dataclass(slots=True)
+class _ToolOutcome:
+    tool_name: str  # name of the tool call this outcome belongs to
+    summary: str  # one-line description built by the *_summary helpers
+
+
+@dataclass(slots=True)
+class _AgentState:
+    last_tool_outcome: _ToolOutcome | None = None  # latest ToolEnd/ToolError seen
+    recoveries: int = 0  # recoveries issued since the last plain-text assistant turn
+
+
+_STATE_LOCK = asyncio.Lock()  # held around every read and write of _AGENT_STATE
+_AGENT_STATE: dict[str, _AgentState] = {}  # keyed by agent_id; popped on AgentEnd
+
+
+def _normalize_text(value: object | None) -> str | None:
+    """Squash arbitrary tool output into one bounded line, or ``None`` if empty.
+
+    Whitespace runs collapse to single spaces; text longer than
+    ``_MAX_SUMMARY_CHARS`` is cut short and suffixed with ``...``.
+    """
+    collapsed = "" if value is None else " ".join(str(value).split())
+    if len(collapsed) > _MAX_SUMMARY_CHARS:
+        # Reserve three characters so the ellipsis fits inside the limit.
+        return collapsed[: _MAX_SUMMARY_CHARS - 3].rstrip() + "..."
+    return collapsed or None
+
+
+def _extract_assistant_text(event: GenerationStep) -> str | None:
+    """Return normalized text of the final message if it is plain assistant prose.
+
+    ``None`` for no messages, a non-assistant last message, or one with tool calls.
+    """
+    final = event.messages[-1] if event.messages else None
+    if final is None:
+        return None
+    from_assistant = getattr(final, "role", None) == "assistant"
+    if not from_assistant or getattr(final, "tool_calls", None):
+        return None
+    return _normalize_text(getattr(final, "content", None))
+
+
+def _is_interruption_sentinel(text: str | None) -> bool:
+    """Tell whether ``text`` is, in its entirety, the provider's interruption marker."""
+    # A full match is required so prose that merely quotes the marker is ignored.
+    matched = text is not None and _INTERRUPTION_SENTINEL.fullmatch(text)
+    return bool(matched)
+
+
+def _tool_end_summary(event: ToolEnd) -> str:
+    """Summarize a finished tool call in one line; a truthy error beats the result."""
+    name = event.tool_call.name
+    if event.error:
+        detail = _normalize_text(event.error)
+        if not detail:
+            return f"{name} returned an error."
+        return f"{name} returned an error: {detail}"
+    detail = _normalize_text(event.result)
+    if not detail:
+        return f"{name} completed without output."
+    return f"{name} returned: {detail}"
+
+
+def _tool_error_summary(event: ToolError) -> str:
+    """Summarize, in one line, an exception that escaped a tool call."""
+    name = event.tool_call.name
+    if detail := _normalize_text(event.error):
+        return f"{name} raised an error: {detail}"
+    return f"{name} raised an error."
+
+
+def _recovery_feedback(state: _AgentState) -> str:
+    """Build the corrective prompt appended after the sentinel turn."""
+    base = (
+        "Your last response was a transport artifact "
+        "(`[Response interrupted by a tool call result.]`), not a valid assistant turn. "
+        "Ignore it."
+    )
+    if state.last_tool_outcome is None:
+        return f"{base} Continue from the current conversation state and take the next best action."
+    return (  # the summary may lack end punctuation, so it gets a line of its own
+        f"{base} The last tool outcome was:\n{state.last_tool_outcome.summary}\n"
+        "Continue from that result and take the next best action."
+    )
+
+
+@hook(ToolEnd)
+async def remember_tool_end(event: ToolEnd) -> None:
+    """Record the latest finished tool call so a later recovery can cite it."""
+    # Only the shared-dict update needs the lock, so build the outcome first.
+    outcome = _ToolOutcome(event.tool_call.name, _tool_end_summary(event))
+    async with _STATE_LOCK:
+        # setdefault creates the per-agent record the first time an agent is seen.
+        state = _AGENT_STATE.setdefault(event.agent_id, _AgentState())
+        state.last_tool_outcome = outcome
+
+
+@hook(ToolError)
+async def remember_tool_error(event: ToolError) -> None:
+    """Record an exception that escaped a tool so a later recovery can cite it."""
+    # Only the shared-dict update needs the lock, so build the outcome first.
+    outcome = _ToolOutcome(event.tool_call.name, _tool_error_summary(event))
+    async with _STATE_LOCK:
+        # setdefault creates the per-agent record the first time an agent is seen.
+        state = _AGENT_STATE.setdefault(event.agent_id, _AgentState())
+        state.last_tool_outcome = outcome
+
+
+@hook(GenerationStep)
+async def recover_interrupted_tool_result(event: GenerationStep) -> Continue | None:
+    """Ask the agent to continue when its latest turn is only the sentinel.
+
+    Returns a ``Continue`` carrying corrective feedback, or ``None`` when the turn
+    is not the sentinel or the recovery budget is already spent.
+    """
+    text = _extract_assistant_text(event)
+    interrupted = _is_interruption_sentinel(text)
+
+    async with _STATE_LOCK:
+        state = _AGENT_STATE.setdefault(event.agent_id, _AgentState())
+        if interrupted and state.recoveries < _MAX_RECOVERIES_PER_AGENT:
+            state.recoveries += 1
+            return Continue(feedback=_recovery_feedback(state))
+
+        if text and not interrupted:
+            # Any other non-empty plain-text turn refills the recovery budget.
+            state.recoveries = 0
+    return None
+
+
+@hook(AgentEnd)
+async def clear_recovery_state(event: AgentEnd) -> None:
+    """Forget the remembered tool outcome and recovery count for a finished agent."""
+    async with _STATE_LOCK:
+        _AGENT_STATE.pop(event.agent_id, None)  # no-op for agents never seen
diff --git a/capabilities/web-security/tests/test_interrupted_tool_result_hook.py b/capabilities/web-security/tests/test_interrupted_tool_result_hook.py
new file mode 100644
index 0000000..d244136
--- /dev/null
+++ b/capabilities/web-security/tests/test_interrupted_tool_result_hook.py
@@ -0,0 +1,259 @@
+from __future__ import annotations
+
+import importlib.util
+import sys
+import types
+from dataclasses import dataclass, field
+from pathlib import Path
+
+import pytest
+import yaml
+
+
+def _install_hook_stubs() -> None:
+    """Install minimal stand-ins for the ``dreadnode`` modules the hook imports."""
+    dreadnode = types.ModuleType("dreadnode")
+    agents = types.ModuleType("dreadnode.agents")
+    events = types.ModuleType("dreadnode.agents.events")
+    reactions = types.ModuleType("dreadnode.agents.reactions")
+    core = types.ModuleType("dreadnode.core")
+    hook_module = types.ModuleType("dreadnode.core.hook")
+
+    @dataclass
+    class FunctionCall:
+        name: str
+        arguments: str = "{}"
+
+    @dataclass
+    class ToolCall:
+        id: str
+        name: str
+        function: FunctionCall = field(init=False)  # filled in by __post_init__
+
+        def __post_init__(self) -> None:
+            self.function = FunctionCall(name=self.name)
+
+    @dataclass
+    class Message:
+        role: str
+        content: str | None = None
+        tool_calls: list[object] | None = None
+
+    @dataclass
+    class AgentEnd:
+        agent_id: str
+
+    @dataclass
+    class ToolEnd:
+        agent_id: str
+        tool_call: ToolCall
+        result: str | None = None
+        error: str | None = None
+        error_type: str | None = None
+
+    @dataclass
+    class ToolError:
+        agent_id: str
+        tool_call: ToolCall
+        error: Exception | str
+
+    @dataclass
+    class GenerationStep:
+        agent_id: str
+        messages: list[Message]
+        step: int = 1
+
+    @dataclass
+    class Continue(Exception):
+        feedback: str | None = None
+
+    class Hook:
+        def __init__(self, func, event_type) -> None:
+            self.func = func
+            self.event_type = event_type
+            self.__name__ = getattr(func, "__name__", "hook")
+
+        def __call__(self, event):
+            if not isinstance(event, self.event_type):  # ignore other event types
+                return None
+            return self.func(event)
+
+    def hook(event_type):
+        def decorator(fn):
+            return Hook(fn, event_type)
+
+        return decorator
+
+    events.AgentEnd = AgentEnd
+    events.GenerationStep = GenerationStep
+    events.ToolCall = ToolCall
+    events.ToolEnd = ToolEnd
+    events.ToolError = ToolError
+    reactions.Continue = Continue
+    hook_module.Hook = Hook
+    hook_module.hook = hook
+    dreadnode.agents = agents
+    dreadnode.core = core
+    agents.events = events
+    reactions.Message = Message  # tests fetch Message from the reactions stub
+    core.hook = hook_module
+    # Register the stubs; this overwrites any entries already present under these names.
+    sys.modules["dreadnode"] = dreadnode
+    sys.modules["dreadnode.agents"] = agents
+    sys.modules["dreadnode.agents.events"] = events
+    sys.modules["dreadnode.agents.reactions"] = reactions
+    sys.modules["dreadnode.core"] = core
+    sys.modules["dreadnode.core.hook"] = hook_module
+
+
+@pytest.fixture
+def hook_module():
+    """Load the hook against stub ``dreadnode`` modules and unstub afterwards."""
+    from unittest.mock import patch
+
+    path = Path(__file__).resolve().parents[1] / "hooks" / "interrupted_tool_result.py"
+    name = "test_web_security_interrupted_tool_result"
+    # patch.dict restores sys.modules on exit, so the stubs cannot leak into other tests.
+    with patch.dict(sys.modules):
+        _install_hook_stubs()
+        spec = importlib.util.spec_from_file_location(name, path)
+        assert spec is not None and spec.loader is not None
+        module = sys.modules[name] = importlib.util.module_from_spec(spec)
+        spec.loader.exec_module(module)
+        yield module
+
+
+def test_manifest_wires_hook_file() -> None:
+    """The capability manifest must register the recovery hook (no event loop needed)."""
+    manifest_path = Path(__file__).resolve().parents[1] / "capability.yaml"
+    manifest = yaml.safe_load(manifest_path.read_text(encoding="utf-8"))
+
+    assert manifest["version"] == "1.0.4"
+    assert "hooks/interrupted_tool_result.py" in manifest["hooks"]
+
+
+@pytest.mark.asyncio
+async def test_recovers_from_interruption_marker_after_tool_end(hook_module) -> None:
+    """A bracketed sentinel after a failed ToolEnd yields feedback citing the failure."""
+    events = sys.modules["dreadnode.agents.events"]
+    message_cls = sys.modules["dreadnode.agents.reactions"].Message
+
+    # Remember a failed tool call, then feed the hook a sentinel-only turn.
+    await hook_module.remember_tool_end(
+        events.ToolEnd(
+            agent_id="agent-1",
+            tool_call=events.ToolCall("tc-1", "bash"),
+            error="Command failed (1): nmap target",
+        )
+    )
+
+    marker = message_cls(
+        role="assistant", content="[Response interrupted by a tool call result.]"
+    )
+    sentinel_turn = events.GenerationStep(agent_id="agent-1", messages=[marker], step=2)
+
+    reaction = await hook_module.recover_interrupted_tool_result(sentinel_turn)
+
+    assert reaction is not None
+    assert "transport artifact" in reaction.feedback
+    assert (
+        "bash returned an error: Command failed (1): nmap target" in reaction.feedback
+    )
+
+
+@pytest.mark.asyncio
+async def test_recovers_from_interruption_marker_after_tool_error(hook_module) -> None:
+    """An unbracketed sentinel after a ToolError yields feedback citing the exception."""
+    events = sys.modules["dreadnode.agents.events"]
+    message_cls = sys.modules["dreadnode.agents.reactions"].Message
+
+    await hook_module.remember_tool_error(
+        events.ToolError(
+            agent_id="agent-2",
+            tool_call=events.ToolCall("tc-2", "bash"),
+            error=RuntimeError("socket hangup"),
+        )
+    )
+
+    # The unbracketed spelling of the marker must be recognized as well.
+    marker = message_cls(
+        role="assistant", content="Response interrupted by a tool call result."
+    )
+    sentinel_turn = events.GenerationStep(agent_id="agent-2", messages=[marker], step=3)
+
+    reaction = await hook_module.recover_interrupted_tool_result(sentinel_turn)
+
+    assert reaction is not None
+    assert "bash raised an error: socket hangup" in reaction.feedback
+
+
+@pytest.mark.asyncio
+async def test_does_not_fire_on_normal_text_or_embedded_phrase(hook_module) -> None:
+    """Ordinary prose never triggers a recovery.
+
+    That includes prose that merely quotes the marker inside a longer sentence.
+    """
+    events = sys.modules["dreadnode.agents.events"]
+    message_cls = sys.modules["dreadnode.agents.reactions"].Message
+
+    contents = (
+        # A normal assistant turn.
+        "I found a login form and will test password reset next.",
+        # The marker embedded inside a longer sentence.
+        "The UI literally showed [Response interrupted by a tool call result.] once.",
+    )
+
+    for step, content in enumerate(contents, start=1):
+        turn = events.GenerationStep(
+            agent_id="agent-3",
+            messages=[
+                message_cls(role="assistant", content=content),
+            ],
+            step=step,
+        )
+        assert await hook_module.recover_interrupted_tool_result(turn) is None
+
+
+@pytest.mark.asyncio
+async def test_retry_budget_resets_after_valid_turn_and_state_cleans_up(
+    hook_module,
+) -> None:
+    """Walk the recovery budget through its whole lifecycle.
+
+    The budget is exhausted, refilled by a normal turn, and dropped on AgentEnd.
+    """
+    events = sys.modules["dreadnode.agents.events"]
+    message_cls = sys.modules["dreadnode.agents.reactions"].Message
+    recover = hook_module.recover_interrupted_tool_result
+
+    await hook_module.remember_tool_end(
+        events.ToolEnd(
+            agent_id="agent-4",
+            tool_call=events.ToolCall("tc-4", "bash"),
+            result="80/tcp open http",
+        )
+    )
+
+    marker = message_cls(
+        role="assistant", content="[Response interrupted by a tool call result.]"
+    )
+    sentinel = events.GenerationStep(agent_id="agent-4", messages=[marker], step=1)
+
+    # Two recoveries are granted, then the budget is exhausted.
+    assert await recover(sentinel) is not None
+    assert await recover(sentinel) is not None
+    assert await recover(sentinel) is None
+
+    # A normal text turn is left alone and refills the budget.
+    prose = message_cls(
+        role="assistant", content="Port 80 is open. I will fetch the homepage next."
+    )
+    valid_turn = events.GenerationStep(agent_id="agent-4", messages=[prose], step=2)
+    assert await recover(valid_turn) is None
+    assert await recover(sentinel) is not None
+
+    # AgentEnd drops the per-agent state entirely.
+    await hook_module.clear_recovery_state(
+        events.AgentEnd(agent_id="agent-4")
+    )
+    assert "agent-4" not in hook_module._AGENT_STATE

From 6dddd1b07488a5f515eb9b5b96b15ed4051716cb Mon Sep 17 00:00:00 2001
From: GangGreenTemperTatum
 <104169244+GangGreenTemperTatum@users.noreply.github.com>
Date: Mon, 4 May 2026 14:51:42 -0400
Subject: [PATCH 2/2] Fix web-security test imports and asyncio marks

---
 .../web-security/tests/test_bbscope.py        | 31 ++++++++--
 .../web-security/tests/test_credence.py       | 59 +++++++++++++++----
 2 files changed, 73 insertions(+), 17 deletions(-)

diff --git a/capabilities/web-security/tests/test_bbscope.py b/capabilities/web-security/tests/test_bbscope.py
index 14a34b1..9132249 100644
--- a/capabilities/web-security/tests/test_bbscope.py
+++ b/capabilities/web-security/tests/test_bbscope.py
@@ -10,15 +10,13 @@
 import httpx
 import pytest
 
-pytestmark = pytest.mark.asyncio
-
 # Add tools directory to path for import
 _REPO_ROOT = Path(__file__).resolve()
 while _REPO_ROOT != _REPO_ROOT.parent:
-    if (_REPO_ROOT / "dreadnode" / "web-security" / "tools").is_dir():
+    if (_REPO_ROOT / "capabilities" / "web-security" / "tools").is_dir():
         break
     _REPO_ROOT = _REPO_ROOT.parent
-sys.path.insert(0, str(_REPO_ROOT / "dreadnode" / "web-security" / "tools"))
+sys.path.insert(0, str(_REPO_ROOT / "capabilities" / "web-security" / "tools"))
 
 from bbscope import BBScope
 
@@ -56,12 +54,21 @@ def test_all_tools_have_catch(self, toolset: BBScope) -> None:
 
 
 class TestFind:
+    @pytest.mark.asyncio
     async def test_find_with_results(self, toolset: BBScope) -> None:
         mock_data = {
             "query": "example.com",
             "programs": [
-                {"platform": "h1", "handle": "example", "url": "https://hackerone.com/example"},
-                {"platform": "bc", "handle": "example-bc", "url": "https://bugcrowd.com/example-bc"},
+                {
+                    "platform": "h1",
+                    "handle": "example",
+                    "url": "https://hackerone.com/example",
+                },
+                {
+                    "platform": "bc",
+                    "handle": "example-bc",
+                    "url": "https://bugcrowd.com/example-bc",
+                },
             ],
             "total_count": 2,
         }
@@ -76,6 +83,7 @@ async def test_find_with_results(self, toolset: BBScope) -> None:
             assert "example" in result
             assert "BC" in result
 
+    @pytest.mark.asyncio
     async def test_find_no_results(self, toolset: BBScope) -> None:
         mock_data = {"query": "nonexistent.invalid", "programs": [], "total_count": 0}
         with patch.object(toolset, "_get_client") as mock_client:
@@ -86,6 +94,7 @@ async def test_find_no_results(self, toolset: BBScope) -> None:
             result = await toolset.find(query="nonexistent.invalid")
             assert "No bug bounty programs found" in result
 
+    @pytest.mark.asyncio
     async def test_find_api_error(self, toolset: BBScope) -> None:
         with patch.object(toolset, "_get_client") as mock_client:
             client = AsyncMock()
@@ -98,6 +107,7 @@ async def test_find_api_error(self, toolset: BBScope) -> None:
 
 
 class TestProgram:
+    @pytest.mark.asyncio
     async def test_program_details(self, toolset: BBScope) -> None:
         mock_data = {
             "platform": "h1",
@@ -119,6 +129,7 @@ async def test_program_details(self, toolset: BBScope) -> None:
             assert "*.example.com" in result
             assert "In-scope targets: 5" in result
 
+    @pytest.mark.asyncio
     async def test_program_vdp(self, toolset: BBScope) -> None:
         mock_data = {
             "platform": "bc",
@@ -138,11 +149,13 @@ async def test_program_vdp(self, toolset: BBScope) -> None:
             result = await toolset.program(platform="bc", handle="test")
             assert "VDP" in result
 
+    @pytest.mark.asyncio
     async def test_program_invalid_platform(self, toolset: BBScope) -> None:
         result = await toolset.program(platform="invalid", handle="test")
         assert "Error" in result
         assert "Invalid platform" in result
 
+    @pytest.mark.asyncio
     async def test_program_not_found(self, toolset: BBScope) -> None:
         with patch.object(toolset, "_get_client") as mock_client:
             client = AsyncMock()
@@ -154,6 +167,7 @@ async def test_program_not_found(self, toolset: BBScope) -> None:
 
 
 class TestTargets:
+    @pytest.mark.asyncio
     async def test_targets_wildcards(self, toolset: BBScope) -> None:
         mock_data = ["*.example.com", "*.test.org"]
         with patch.object(toolset, "_get_client") as mock_client:
@@ -165,16 +179,19 @@ async def test_targets_wildcards(self, toolset: BBScope) -> None:
             assert "*.example.com" in result
             assert "2 wildcards" in result
 
+    @pytest.mark.asyncio
     async def test_targets_invalid_type(self, toolset: BBScope) -> None:
         result = await toolset.targets(target_type="invalid")
         assert "Error" in result
         assert "Invalid target_type" in result
 
+    @pytest.mark.asyncio
     async def test_targets_invalid_platform(self, toolset: BBScope) -> None:
         result = await toolset.targets(target_type="domains", platform="invalid")
         assert "Error" in result
         assert "Invalid platform" in result
 
+    @pytest.mark.asyncio
     async def test_targets_with_limit(self, toolset: BBScope) -> None:
         mock_data = [f"target{i}.com" for i in range(200)]
         with patch.object(toolset, "_get_client") as mock_client:
@@ -188,6 +205,7 @@ async def test_targets_with_limit(self, toolset: BBScope) -> None:
 
 
 class TestUpdates:
+    @pytest.mark.asyncio
     async def test_updates_today(self, toolset: BBScope) -> None:
         mock_data = {
             "updates": [
@@ -215,6 +233,7 @@ async def test_updates_today(self, toolset: BBScope) -> None:
             assert "new.example.com" in result
             assert "added" in result
 
+    @pytest.mark.asyncio
     async def test_updates_no_results(self, toolset: BBScope) -> None:
         mock_data = {"updates": [], "total_count": 0}
         with patch.object(toolset, "_get_client") as mock_client:
diff --git a/capabilities/web-security/tests/test_credence.py b/capabilities/web-security/tests/test_credence.py
index 0d43a46..1d89769 100644
--- a/capabilities/web-security/tests/test_credence.py
+++ b/capabilities/web-security/tests/test_credence.py
@@ -7,8 +7,6 @@
 
 import pytest
 
-pytestmark = pytest.mark.asyncio
-
 # Add tools directory to path for import
 _REPO_ROOT = Path(__file__).resolve()
 while _REPO_ROOT != _REPO_ROOT.parent:
@@ -49,6 +47,7 @@ def test_schema_has_required_params(self, toolset: CredenceTool) -> None:
 
 
 class TestHighConfidence:
+    @pytest.mark.asyncio
     async def test_high_with_poc_confirmed(self, toolset: CredenceTool) -> None:
         result = await toolset.assess_confidence(
             claim="SQLi in /api/users?id=1' OR 1=1--",
@@ -57,6 +56,7 @@ async def test_high_with_poc_confirmed(self, toolset: CredenceTool) -> None:
         )
         assert "CONFIRMED" in result
 
+    @pytest.mark.asyncio
     async def test_high_with_response_verified(self, toolset: CredenceTool) -> None:
         result = await toolset.assess_confidence(
             claim="XSS reflected unencoded in search param",
@@ -65,6 +65,7 @@ async def test_high_with_response_verified(self, toolset: CredenceTool) -> None:
         )
         assert "CONFIRMED" in result
 
+    @pytest.mark.asyncio
     async def test_high_with_data_flow_traced(self, toolset: CredenceTool) -> None:
         result = await toolset.assess_confidence(
             claim="user input reaches innerHTML in app.js:456",
@@ -73,7 +74,10 @@ async def test_high_with_data_flow_traced(self, toolset: CredenceTool) -> None:
         )
         assert "CONFIRMED" in result
 
-    async def test_high_with_pattern_only_is_overconfident(self, toolset: CredenceTool) -> None:
+    @pytest.mark.asyncio
+    async def test_high_with_pattern_only_is_overconfident(
+        self, toolset: CredenceTool
+    ) -> None:
         result = await toolset.assess_confidence(
             claim="innerHTML usage found in dashboard.js",
             confidence="high",
@@ -82,7 +86,10 @@ async def test_high_with_pattern_only_is_overconfident(self, toolset: CredenceTo
         assert "OVERCONFIDENT" in result
         assert "lead/gadget" in result.lower()
 
-    async def test_high_with_scanner_output_is_overconfident(self, toolset: CredenceTool) -> None:
+    @pytest.mark.asyncio
+    async def test_high_with_scanner_output_is_overconfident(
+        self, toolset: CredenceTool
+    ) -> None:
         result = await toolset.assess_confidence(
             claim="nuclei flagged potential SSRF",
             confidence="high",
@@ -90,7 +97,10 @@ async def test_high_with_scanner_output_is_overconfident(self, toolset: Credence
         )
         assert "OVERCONFIDENT" in result
 
-    async def test_high_with_assumed_is_overconfident(self, toolset: CredenceTool) -> None:
+    @pytest.mark.asyncio
+    async def test_high_with_assumed_is_overconfident(
+        self, toolset: CredenceTool
+    ) -> None:
         result = await toolset.assess_confidence(
             claim="probably using MySQL based on error page",
             confidence="high",
@@ -98,7 +108,10 @@ async def test_high_with_assumed_is_overconfident(self, toolset: CredenceTool) -
         )
         assert "OVERCONFIDENT" in result
 
-    async def test_high_with_behavior_observed_is_overconfident(self, toolset: CredenceTool) -> None:
+    @pytest.mark.asyncio
+    async def test_high_with_behavior_observed_is_overconfident(
+        self, toolset: CredenceTool
+    ) -> None:
         result = await toolset.assess_confidence(
             claim="timing difference suggests blind SQLi",
             confidence="high",
@@ -106,7 +119,10 @@ async def test_high_with_behavior_observed_is_overconfident(self, toolset: Crede
         )
         assert "OVERCONFIDENT" in result
 
-    async def test_high_with_code_pattern_is_overconfident(self, toolset: CredenceTool) -> None:
+    @pytest.mark.asyncio
+    async def test_high_with_code_pattern_is_overconfident(
+        self, toolset: CredenceTool
+    ) -> None:
         result = await toolset.assess_confidence(
             claim="eval() called with user input nearby",
             confidence="high",
@@ -116,6 +132,7 @@ async def test_high_with_code_pattern_is_overconfident(self, toolset: CredenceTo
 
 
 class TestMediumConfidence:
+    @pytest.mark.asyncio
     async def test_medium_with_weak_evidence(self, toolset: CredenceTool) -> None:
         result = await toolset.assess_confidence(
             claim="possible IDOR on /api/orders/{id}",
@@ -125,6 +142,7 @@ async def test_medium_with_weak_evidence(self, toolset: CredenceTool) -> None:
         assert "UNCONFIRMED LEAD" in result
         assert "report" not in result.lower() or "do not" in result.lower()
 
+    @pytest.mark.asyncio
     async def test_medium_with_behavior_observed(self, toolset: CredenceTool) -> None:
         result = await toolset.assess_confidence(
             claim="different response length for admin vs user",
@@ -133,7 +151,10 @@ async def test_medium_with_behavior_observed(self, toolset: CredenceTool) -> Non
         )
         assert "UNCONFIRMED LEAD" in result
 
-    async def test_medium_with_strong_evidence_suggests_upgrade(self, toolset: CredenceTool) -> None:
+    @pytest.mark.asyncio
+    async def test_medium_with_strong_evidence_suggests_upgrade(
+        self, toolset: CredenceTool
+    ) -> None:
         result = await toolset.assess_confidence(
             claim="BOLA confirmed with cross-user data",
             confidence="medium",
@@ -141,7 +162,10 @@ async def test_medium_with_strong_evidence_suggests_upgrade(self, toolset: Crede
         )
         assert "UPGRADE" in result
 
-    async def test_medium_with_response_verified_suggests_upgrade(self, toolset: CredenceTool) -> None:
+    @pytest.mark.asyncio
+    async def test_medium_with_response_verified_suggests_upgrade(
+        self, toolset: CredenceTool
+    ) -> None:
         result = await toolset.assess_confidence(
             claim="path traversal returns /etc/passwd",
             confidence="medium",
@@ -151,6 +175,7 @@ async def test_medium_with_response_verified_suggests_upgrade(self, toolset: Cre
 
 
 class TestLowConfidence:
+    @pytest.mark.asyncio
     async def test_low_confidence(self, toolset: CredenceTool) -> None:
         result = await toolset.assess_confidence(
             claim="might have command injection somewhere",
@@ -160,6 +185,7 @@ async def test_low_confidence(self, toolset: CredenceTool) -> None:
         assert "INSUFFICIENT" in result
         assert "gadget" in result.lower()
 
+    @pytest.mark.asyncio
     async def test_uncertain_confidence(self, toolset: CredenceTool) -> None:
         result = await toolset.assess_confidence(
             claim="not sure what this endpoint does",
@@ -168,7 +194,10 @@ async def test_uncertain_confidence(self, toolset: CredenceTool) -> None:
         )
         assert "INSUFFICIENT" in result
 
-    async def test_low_with_strong_evidence_still_insufficient(self, toolset: CredenceTool) -> None:
+    @pytest.mark.asyncio
+    async def test_low_with_strong_evidence_still_insufficient(
+        self, toolset: CredenceTool
+    ) -> None:
         """Even strong evidence with low confidence = don't assert."""
         result = await toolset.assess_confidence(
             claim="got a 500 but not sure it's exploitable",
@@ -179,6 +208,7 @@ async def test_low_with_strong_evidence_still_insufficient(self, toolset: Creden
 
 
 class TestAgentString:
+    @pytest.mark.asyncio
     async def test_agent_string_in_output(self, toolset: CredenceTool) -> None:
         result = await toolset.assess_confidence(
             claim="XSS confirmed",
@@ -189,6 +219,7 @@ async def test_agent_string_in_output(self, toolset: CredenceTool) -> None:
         assert result.startswith("[agent-opus] ")
         assert "CONFIRMED" in result
 
+    @pytest.mark.asyncio
     async def test_different_agent_strings(self, toolset: CredenceTool) -> None:
         for agent in ("dn-agent-kimi", "agent-codex", "agent-opus"):
             result = await toolset.assess_confidence(
@@ -199,6 +230,7 @@ async def test_different_agent_strings(self, toolset: CredenceTool) -> None:
             )
             assert result.startswith(f"[{agent}] ")
 
+    @pytest.mark.asyncio
     async def test_default_agent_string(self, toolset: CredenceTool) -> None:
         result = await toolset.assess_confidence(
             claim="test claim",
@@ -207,6 +239,7 @@ async def test_default_agent_string(self, toolset: CredenceTool) -> None:
         )
         assert result.startswith("[unknown] ")
 
+    @pytest.mark.asyncio
     async def test_agent_string_in_schema(self, toolset: CredenceTool) -> None:
         tool = toolset.get_tools()[0]
         props = tool.parameters_schema.get("properties", {})
@@ -214,6 +247,7 @@ async def test_agent_string_in_schema(self, toolset: CredenceTool) -> None:
 
 
 class TestHandleToolCall:
+    @pytest.mark.asyncio
     async def test_via_handle_tool_call(self, toolset: CredenceTool) -> None:
         from dreadnode.agents.tools import FunctionCall, ToolCall
 
@@ -229,7 +263,10 @@ async def test_via_handle_tool_call(self, toolset: CredenceTool) -> None:
         assert stop is False
         assert "CONFIRMED" in message.content
 
-    async def test_overconfident_via_handle_tool_call(self, toolset: CredenceTool) -> None:
+    @pytest.mark.asyncio
+    async def test_overconfident_via_handle_tool_call(
+        self, toolset: CredenceTool
+    ) -> None:
         from dreadnode.agents.tools import FunctionCall, ToolCall
 
         tools = {t.name: t for t in toolset.get_tools()}