From 0aa45347be0ef672f707432648a77b5251359bd1 Mon Sep 17 00:00:00 2001
From: Cooper Miller <kcoopermiller9@gmail.com>
Date: Mon, 18 May 2026 23:36:20 -0700
Subject: [PATCH 1/5] Add LangSmith tracing config to Deep Agents Wikispeedia env

---
 .../README.md                                 |  12 ++
 .../langchain_deep_agents_wikispeedia.py      |  35 +++++-
 .../test_langchain_deep_agents_wikispeedia.py | 106 +++++++++++++++++-
 3 files changed, 149 insertions(+), 4 deletions(-)

diff --git a/environments/langchain_deep_agents_wikispeedia/README.md b/environments/langchain_deep_agents_wikispeedia/README.md
index 7fbec5fce..aa51515ab 100644
--- a/environments/langchain_deep_agents_wikispeedia/README.md
+++ b/environments/langchain_deep_agents_wikispeedia/README.md
@@ -49,6 +49,18 @@ Notes:
 - The first run downloads ~5MB of SNAP data into `~/.cache/wikispeedia` (override with `cache_dir`).
 - Set `OPENAI_API_KEY` (or whatever the policy endpoint expects) for the agent.
 
+### LangSmith tracing
+
+Deep Agents relies on the LangSmith tracing built into LangGraph/LangChain. Enable
+it with the standard LangSmith environment variables before running the eval:
+
+```bash
+export LANGSMITH_TRACING=true
+export LANGSMITH_API_KEY=...
+export LANGSMITH_PROJECT=verifiers-wikispeedia
+prime eval run langchain-deep-agents-wikispeedia
+```
+
 ### Taskset Config
 
 | Field | Type | Default | Description |
diff --git a/environments/langchain_deep_agents_wikispeedia/langchain_deep_agents_wikispeedia.py b/environments/langchain_deep_agents_wikispeedia/langchain_deep_agents_wikispeedia.py
index 7da17fef9..d31888e78 100644
--- a/environments/langchain_deep_agents_wikispeedia/langchain_deep_agents_wikispeedia.py
+++ b/environments/langchain_deep_agents_wikispeedia/langchain_deep_agents_wikispeedia.py
@@ -1,5 +1,7 @@
 import asyncio
 import json
+import os
+import uuid
 from collections.abc import Awaitable, Callable, Iterator, Mapping, Sequence
 from typing import Protocol, cast
 
@@ -46,6 +48,8 @@ def system_prompt(allow_go_back: bool = True) -> str:
 
 
 SYSTEM_PROMPT = system_prompt()
+ENV_ID = "langchain-deep-agents-wikispeedia"
+AGENT_NAME = "wikispeedia-navigator"
 
 
 class WikispeediaTasksetConfig(vf.TasksetConfig):
@@ -438,12 +442,35 @@ async def run_langchain_deep_agents_wikispeedia_program(
             model=model,
             tools=nav_tools,
             system_prompt=state_system_prompt or SYSTEM_PROMPT,
+            name=AGENT_NAME,
         )
         prompt = str(cast(list[vf.ConfigData], state["prompt"])[-1]["content"])
         recursion_limit = state.get_max_turns(max_turns)
-        invoke_config = (
-            {"recursion_limit": recursion_limit} if recursion_limit > 0 else None
-        )
+        runtime = state.get("runtime", {})
+        runtime = runtime if isinstance(runtime, Mapping) else {}
+        source = str(state["info"]["source"])
+        target = str(state["info"]["target"])
+        trajectory_id = str(state["trajectory_id"])
+        run_id = uuid.UUID(hex=trajectory_id)
+        state["langsmith_run_id"] = str(run_id)
+        invoke_metadata = {
+            "vf_env": ENV_ID,
+            "vf_task_id": str(task.get("task_id", "")),
+            "vf_trajectory_id": trajectory_id,
+            "vf_group_key": str(runtime.get("group_key", "")),
+            "source": source,
+            "target": target,
+            "shortest_path": int(state["info"]["shortest_path"]),
+        }
+        invoke_config: dict[str, object] = {
+            "run_name": f"wikispeedia:{source}->{target}",
+            "run_id": run_id,
+            "configurable": {"thread_id": trajectory_id},
+            "metadata": invoke_metadata,
+            "tags": ["verifiers", "vf-v1", ENV_ID],
+        }
+        if recursion_limit > 0:
+            invoke_config["recursion_limit"] = recursion_limit
         invoke = agent.ainvoke(
             {"messages": [{"role": "user", "content": prompt}]},
             config=invoke_config,
@@ -560,6 +587,8 @@ def load_harness(config: WikispeediaHarnessConfig) -> WikispeediaHarness:
 
 def load_environment(config: WikispeediaEnvConfig) -> vf.Env:
     """Load the v1 Wikispeedia taskset with a LangChain Deep Agents harness."""
+    if os.environ.get("LANGSMITH_TRACING") == "true":
+        vf.ensure_keys(["LANGSMITH_API_KEY"])
 
     return vf.Env(
         taskset=load_taskset(config=config.taskset),
diff --git a/tests/test_langchain_deep_agents_wikispeedia.py b/tests/test_langchain_deep_agents_wikispeedia.py
index 8ca55bea4..c0b85b244 100644
--- a/tests/test_langchain_deep_agents_wikispeedia.py
+++ b/tests/test_langchain_deep_agents_wikispeedia.py
@@ -2,6 +2,7 @@
 import inspect
 import sys
 import types
+import uuid
 from pathlib import Path
 
 import pytest
@@ -336,7 +337,8 @@ def fake_create_deep_agent(**kwargs):
     )
     state = FakeState(
         {
-            "info": {"source": "A"},
+            "trajectory_id": "0123456789abcdef0123456789abcdef",
+            "info": {"source": "A", "target": "B", "shortest_path": 1},
             "prompt": [{"role": "user", "content": "start"}],
             "system_prompt": [
                 {"role": "user", "content": "first prompt chunk"},
@@ -353,6 +355,108 @@ def fake_create_deep_agent(**kwargs):
     assert result["agent_completion"] == []
 
 
+@pytest.mark.asyncio
+async def test_wikispeedia_deep_agents_program_passes_langsmith_config(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    module = load_module(monkeypatch)
+
+    class GraphRecursionError(Exception):
+        pass
+
+    class FakeState(dict):
+        def get_endpoint_config(self, api: str):
+            return {
+                "model": "model",
+                "api_base": "https://example.invalid/v1",
+                "api_key": "key",
+            }
+
+        def get_tools(self):
+            return {}
+
+        def get_max_turns(self, default: int):
+            return default
+
+        def stop(self, reason: str):
+            self["stop_reason"] = reason
+
+    class FakeChatOpenAI:
+        def __init__(self, **kwargs):
+            self.kwargs = kwargs
+
+    class FakeAgent:
+        async def ainvoke(self, payload, config=None):
+            captured["payload"] = payload
+            captured["config"] = config
+            return {"messages": [{"role": "assistant", "content": "done"}]}
+
+    captured: dict[str, object] = {}
+    created: dict[str, object] = {}
+
+    def fake_create_deep_agent(**kwargs):
+        created.update(kwargs)
+        return FakeAgent()
+
+    fake_deepagents = types.ModuleType("deepagents")
+    fake_langchain_openai = types.ModuleType("langchain_openai")
+    fake_langgraph = types.ModuleType("langgraph")
+    fake_langgraph_errors = types.ModuleType("langgraph.errors")
+    fake_langchain_core = types.ModuleType("langchain_core")
+    fake_tools_module = types.ModuleType("langchain_core.tools")
+
+    fake_deepagents.create_deep_agent = fake_create_deep_agent
+    fake_langchain_openai.ChatOpenAI = FakeChatOpenAI
+    fake_langgraph_errors.GraphRecursionError = GraphRecursionError
+    fake_langgraph.errors = fake_langgraph_errors
+    fake_tools_module.tool = lambda func: func
+    fake_langchain_core.tools = fake_tools_module
+    monkeypatch.setitem(sys.modules, "deepagents", fake_deepagents)
+    monkeypatch.setitem(sys.modules, "langchain_openai", fake_langchain_openai)
+    monkeypatch.setitem(sys.modules, "langgraph", fake_langgraph)
+    monkeypatch.setitem(sys.modules, "langgraph.errors", fake_langgraph_errors)
+    monkeypatch.setitem(sys.modules, "langchain_core", fake_langchain_core)
+    monkeypatch.setitem(sys.modules, "langchain_core.tools", fake_tools_module)
+
+    trajectory_id = "0123456789abcdef0123456789abcdef"
+    run_id = uuid.UUID(hex=trajectory_id)
+    program = module.make_langchain_deep_agents_program(
+        max_turns=12,
+        timeout_seconds=30,
+    )
+    state = FakeState(
+        {
+            "trajectory_id": trajectory_id,
+            "runtime": {"group_key": "group-1"},
+            "info": {"source": "A", "target": "B", "shortest_path": 2},
+            "prompt": [{"role": "user", "content": "start"}],
+        }
+    )
+
+    result = await program({"task_id": "A->B"}, state)
+
+    assert created["name"] == "wikispeedia-navigator"
+    assert captured["payload"] == {"messages": [{"role": "user", "content": "start"}]}
+    assert captured["config"] == {
+        "run_name": "wikispeedia:A->B",
+        "run_id": run_id,
+        "configurable": {"thread_id": trajectory_id},
+        "metadata": {
+            "vf_env": "langchain-deep-agents-wikispeedia",
+            "vf_task_id": "A->B",
+            "vf_trajectory_id": trajectory_id,
+            "vf_group_key": "group-1",
+            "source": "A",
+            "target": "B",
+            "shortest_path": 2,
+        },
+        "tags": ["verifiers", "vf-v1", "langchain-deep-agents-wikispeedia"],
+        "recursion_limit": 12,
+    }
+    assert result["langsmith_run_id"] == str(run_id)
+    assert result["completion"] == [{"role": "assistant", "content": "done"}]
+
+
 @pytest.mark.asyncio
 async def test_wikispeedia_tool_metrics_use_agent_completion(
     monkeypatch: pytest.MonkeyPatch,

From 64e86af626441878e57b14a848a84df2b928f687 Mon Sep 17 00:00:00 2001
From: Cooper Miller <kcoopermiller9@gmail.com>
Date: Mon, 18 May 2026 23:44:48 -0700
Subject: [PATCH 2/5] Annotate invoke_config as vf.ConfigData

---
 .../langchain_deep_agents_wikispeedia.py                        | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/environments/langchain_deep_agents_wikispeedia/langchain_deep_agents_wikispeedia.py b/environments/langchain_deep_agents_wikispeedia/langchain_deep_agents_wikispeedia.py
index d31888e78..826ed8071 100644
--- a/environments/langchain_deep_agents_wikispeedia/langchain_deep_agents_wikispeedia.py
+++ b/environments/langchain_deep_agents_wikispeedia/langchain_deep_agents_wikispeedia.py
@@ -462,7 +462,7 @@ async def run_langchain_deep_agents_wikispeedia_program(
             "target": target,
             "shortest_path": int(state["info"]["shortest_path"]),
         }
-        invoke_config: dict[str, object] = {
+        invoke_config: vf.ConfigData = {
             "run_name": f"wikispeedia:{source}->{target}",
             "run_id": run_id,
             "configurable": {"thread_id": trajectory_id},

From e919d273cc5d6f8f210fd01c759893a7263ecafd Mon Sep 17 00:00:00 2001
From: Cooper Miller <kcoopermiller9@gmail.com>
Date: Tue, 19 May 2026 12:15:40 -0700
Subject: [PATCH 3/5] Bump verifiers requirement to >=0.1.15.dev7

---
 environments/langchain_deep_agents_wikispeedia/pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/environments/langchain_deep_agents_wikispeedia/pyproject.toml b/environments/langchain_deep_agents_wikispeedia/pyproject.toml
index 5818a214a..9b880415d 100644
--- a/environments/langchain_deep_agents_wikispeedia/pyproject.toml
+++ b/environments/langchain_deep_agents_wikispeedia/pyproject.toml
@@ -5,7 +5,7 @@ tags = ["v1", "taskset", "harness", "multi-turn", "tool-use", "langchain", "deep
 version = "0.1.4"
 requires-python = ">=3.11,<3.13"
 dependencies = [
-    "verifiers>=0.1.14",
+    "verifiers>=0.1.15.dev7",
     "datasets",
     "deepagents>=0.5.5",
     "langgraph",

From 99ee9da05bf139de243dc582ac4f06b9bc4d80f4 Mon Sep 17 00:00:00 2001
From: Cooper Miller <kcoopermiller9@gmail.com>
Date: Tue, 19 May 2026 15:34:26 -0700
Subject: [PATCH 4/5] Fix navigation tool metrics via state log; clarify max_turns docs

---
 .../README.md                                 |   5 +-
 .../langchain_deep_agents_wikispeedia.py      |  59 ++++++-
 ..._lab2_langchain_deep_agents_wikispeedia.py | 144 ++++++++++++++++++
 .../test_langchain_deep_agents_wikispeedia.py |  24 +++
 4 files changed, 227 insertions(+), 5 deletions(-)
 create mode 100644 tests/test_lab2_langchain_deep_agents_wikispeedia.py

diff --git a/environments/langchain_deep_agents_wikispeedia/README.md b/environments/langchain_deep_agents_wikispeedia/README.md
index aa51515ab..861f19a2c 100644
--- a/environments/langchain_deep_agents_wikispeedia/README.md
+++ b/environments/langchain_deep_agents_wikispeedia/README.md
@@ -74,7 +74,7 @@ prime eval run langchain-deep-agents-wikispeedia
 | `split_seed` | int | `0` | Seed for deterministic train/eval split. |
 | `links_only` | bool | `False` | Render articles as just the link menu (ablation: tests whether the agent navigates from semantic content or link names alone). |
 | `allow_go_back` | bool | `True` | Expose the `go_back` tool. |
-| `max_turns` | int | `50` | Per-rollout turn cap. |
+| `max_turns` | int | `50` | Per-rollout LangGraph recursion limit stored on each task row. This is not a literal model-turn count; Deep Agents may spend multiple graph steps per model/tool cycle. |
 | `efficiency_weight` | float | `0.0` | If `> 0`, mix `path_efficiency` into the reward at this weight (a near-optimal route earns up to `1 + efficiency_weight`; a wanderer that reaches the target still earns `1`). Default `0.0` keeps reward as pure binary reachability. |
 | `stratify_path_length` | bool | `True` | Take equal counts at each shortest-path bucket inside `[min_path_length, max_path_length]`, capped at the smallest non-empty bucket. The SNAP graph's natural distribution heavily skews toward the lower end of any band (4-6 → 83% sp=4); without stratification the policy over-trains on the trivial floor. Set `False` to recover the natural distribution. |
 
@@ -82,7 +82,7 @@ prime eval run langchain-deep-agents-wikispeedia
 
 | Field | Type | Default | Description |
 | --- | ---- | ------- | ----------- |
-| `max_turns` | int | `50` | LangChain recursion limit fallback when runtime config does not provide one. |
+| `max_turns` | int | `50` | Fallback LangGraph recursion limit used when the runtime config does not provide one. It caps graph steps, so it does not map one-to-one to model turns. |
 | `timeout_seconds` | float | `1200.0` | Per-rollout wall-clock cap. |
 
 ### Metrics
@@ -103,3 +103,4 @@ prime eval run langchain-deep-agents-wikispeedia
 - Reward is `reached_target` only — exact, deterministic, no judge required. The deep-agent structural metrics are zero-weight so they show up in eval tables without shaping the policy.
 - `min_path_length=4, max_path_length=6` is the calibrated RL difficulty band for Nemotron-30B-A3B-BF16 — predicted ~0.3-0.4 reach rate, the useful-gradient zone. The 3-5 band landed at 0.61 mean reach (dominated by the trivial sp=3 floor where the deep-agent scaffolding is decorative); the 5-7 band landed at 0.13 with 27% timeouts.
 - This is the primary LangChain Deep Agents example because tool use is load-bearing: the model cannot reach the target without invoking `click_link`.
+- `max_turns` is passed through to LangGraph as `recursion_limit` when the resolved value is greater than 0. It caps graph execution steps, not model calls, so the observed number of model/tool cycles can be lower than the configured value.
diff --git a/environments/langchain_deep_agents_wikispeedia/langchain_deep_agents_wikispeedia.py b/environments/langchain_deep_agents_wikispeedia/langchain_deep_agents_wikispeedia.py
index 826ed8071..8c801e556 100644
--- a/environments/langchain_deep_agents_wikispeedia/langchain_deep_agents_wikispeedia.py
+++ b/environments/langchain_deep_agents_wikispeedia/langchain_deep_agents_wikispeedia.py
@@ -50,6 +50,7 @@ def system_prompt(allow_go_back: bool = True) -> str:
 SYSTEM_PROMPT = system_prompt()
 ENV_ID = "langchain-deep-agents-wikispeedia"
 AGENT_NAME = "wikispeedia-navigator"
+NAVIGATION_TOOL_CALLS_KEY = "navigation_tool_calls"
 
 
 class WikispeediaTasksetConfig(vf.TasksetConfig):
@@ -94,6 +95,14 @@ def format_article(wiki: WikiGraph, article: str, links_only: bool = False) -> s
     return f"# {article}\n\n{text}\n\n---\nAvailable links: {links_str}"
 
 
+def record_navigation_tool_call(state: vf.State, name: str, valid: bool) -> None:
+    calls = state.get(NAVIGATION_TOOL_CALLS_KEY)
+    if not isinstance(calls, list):
+        calls = []
+        state[NAVIGATION_TOOL_CALLS_KEY] = calls
+    calls.append({"name": name, "valid": valid})
+
+
 async def click_link(article: str, wiki: WikiGraph, state: vf.State) -> str:
     """Navigate to a linked Wikipedia article."""
     links_only = bool(state.get("links_only", False))
@@ -101,11 +110,13 @@ async def click_link(article: str, wiki: WikiGraph, state: vf.State) -> str:
     available = wiki.get_links(current)
     normalized = wiki.normalize_name(article)
     if normalized is None or normalized not in available:
+        record_navigation_tool_call(state, "click_link", valid=False)
         avail_str = ", ".join(available) if available else "(none)"
         return (
             f"'{article}' is not a valid link from '{current}'.\n"
             f"Available links: {avail_str}"
         )
+    record_navigation_tool_call(state, "click_link", valid=True)
     state["current_article"] = normalized
     state["path"].append(normalized)
     if normalized == state["info"]["target"]:
@@ -123,7 +134,9 @@ async def go_back(wiki: WikiGraph, state: vf.State) -> str:
     """Undo the last click_link and return to the previous article."""
     path = state["path"]
     if len(path) <= 1:
+        record_navigation_tool_call(state, "go_back", valid=False)
         return "You are already at the starting article. Cannot go back."
+    record_navigation_tool_call(state, "go_back", valid=True)
     path.pop()
     state["current_article"] = path[-1]
     return format_article(
@@ -167,7 +180,20 @@ async def agent_timeout(task: vf.Task, state: vf.State) -> float:
     return 1.0 if state.get("agent_timeout", False) else 0.0
 
 
-def iter_tool_calls(state: vf.State) -> Iterator[str]:
+def has_navigation_tool_log(state: vf.State) -> bool:
+    return isinstance(state.get(NAVIGATION_TOOL_CALLS_KEY), list)
+
+
+def iter_navigation_tool_calls(state: vf.State) -> Iterator[vf.ConfigMap]:
+    calls = state.get(NAVIGATION_TOOL_CALLS_KEY)
+    if not isinstance(calls, list):
+        return
+    for call in calls:
+        if isinstance(call, Mapping):
+            yield call
+
+
+def iter_completion_tool_calls(state: vf.State) -> Iterator[str]:
     completion = state.get("completion") or []
     messages = (
         vf.get_messages(completion, role="assistant")
@@ -183,9 +209,26 @@ def iter_tool_calls(state: vf.State) -> Iterator[str]:
 
 
 def count_tool_calls(state: vf.State, name: str | None = None) -> int:
+    if has_navigation_tool_log(state):
+        nav_count = sum(
+            1
+            for call in iter_navigation_tool_calls(state)
+            if name is None or call.get("name") == name
+        )
+        if name in WIKISPEEDIA_TOOLS:
+            return nav_count
+        completion_count = sum(
+            1
+            for tool_name in iter_completion_tool_calls(state)
+            if tool_name not in WIKISPEEDIA_TOOLS
+            and (name is None or tool_name == name)
+        )
+        return nav_count + completion_count
     if name is None:
-        return sum(1 for _ in iter_tool_calls(state))
-    return sum(1 for tool_name in iter_tool_calls(state) if tool_name == name)
+        return sum(1 for _ in iter_completion_tool_calls(state))
+    return sum(
+        1 for tool_name in iter_completion_tool_calls(state) if tool_name == name
+    )
 
 
 def make_tool_count_metric(
@@ -236,6 +279,15 @@ async def assistant_turns(task: vf.Task, state: vf.State) -> float:
 
 
 async def invalid_link_rate(task: vf.Task, state: vf.State) -> float:
+    if has_navigation_tool_log(state):
+        click_calls = [
+            call
+            for call in iter_navigation_tool_calls(state)
+            if call.get("name") == "click_link"
+        ]
+        invalid = sum(1 for call in click_calls if call.get("valid") is False)
+        return float(invalid / len(click_calls)) if click_calls else 0.0
+
     clicks = 0
     invalid = 0
     completion = state.get("completion") or []
@@ -422,6 +474,7 @@ async def run_langchain_deep_agents_wikispeedia_program(
         state["reached_target"] = False
         state["agent_timeout"] = False
         state["links_only"] = bool(task.get("links_only", False))
+        state[NAVIGATION_TOOL_CALLS_KEY] = []
 
         endpoint_config = state.get_endpoint_config(api="chat")
         model = ChatOpenAI(
diff --git a/tests/test_lab2_langchain_deep_agents_wikispeedia.py b/tests/test_lab2_langchain_deep_agents_wikispeedia.py
new file mode 100644
index 000000000..20a7fcd2e
--- /dev/null
+++ b/tests/test_lab2_langchain_deep_agents_wikispeedia.py
@@ -0,0 +1,144 @@
+import importlib
+import sys
+import types
+import uuid
+from pathlib import Path
+
+import pytest
+
+import verifiers as vf
+
+
+def load_lab2_module(monkeypatch: pytest.MonkeyPatch):
+    env_dir = (
+        Path(__file__).parents[1]
+        / "lab2"
+        / "environments"
+        / "langchain_deep_agents_wikispeedia"
+    )
+    monkeypatch.syspath_prepend(str(env_dir))
+    sys.modules.pop("langchain_deep_agents_wikispeedia", None)
+    sys.modules.pop("wiki_graph", None)
+    return importlib.import_module("langchain_deep_agents_wikispeedia")
+
+
+@pytest.mark.asyncio
+async def test_lab2_deep_agents_program_passes_langsmith_config(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    module = load_lab2_module(monkeypatch)
+
+    class GraphRecursionError(Exception):
+        pass
+
+    class FakeState(dict):
+        def get_endpoint_config(self, api: str):
+            return {
+                "model": "model",
+                "api_base": "https://example.invalid/v1",
+                "api_key": "key",
+            }
+
+        def get_tools(self):
+            return {}
+
+        def get_max_turns(self, default: int):
+            return default
+
+        def stop(self, reason: str):
+            self["stop_reason"] = reason
+
+    class FakeChatOpenAI:
+        def __init__(self, **kwargs):
+            self.kwargs = kwargs
+
+    class FakeAgent:
+        async def ainvoke(self, payload, config=None):
+            captured["payload"] = payload
+            captured["config"] = config
+            return {"messages": [{"role": "assistant", "content": "done"}]}
+
+    captured: dict[str, object] = {}
+    created: dict[str, object] = {}
+
+    def fake_create_deep_agent(**kwargs):
+        created.update(kwargs)
+        return FakeAgent()
+
+    fake_deepagents = types.ModuleType("deepagents")
+    fake_langchain_openai = types.ModuleType("langchain_openai")
+    fake_langgraph = types.ModuleType("langgraph")
+    fake_langgraph_errors = types.ModuleType("langgraph.errors")
+    fake_langchain_core = types.ModuleType("langchain_core")
+    fake_tools_module = types.ModuleType("langchain_core.tools")
+
+    fake_deepagents.create_deep_agent = fake_create_deep_agent
+    fake_langchain_openai.ChatOpenAI = FakeChatOpenAI
+    fake_langgraph_errors.GraphRecursionError = GraphRecursionError
+    fake_langgraph.errors = fake_langgraph_errors
+    fake_tools_module.tool = lambda func: func
+    fake_langchain_core.tools = fake_tools_module
+    monkeypatch.setitem(sys.modules, "deepagents", fake_deepagents)
+    monkeypatch.setitem(sys.modules, "langchain_openai", fake_langchain_openai)
+    monkeypatch.setitem(sys.modules, "langgraph", fake_langgraph)
+    monkeypatch.setitem(sys.modules, "langgraph.errors", fake_langgraph_errors)
+    monkeypatch.setitem(sys.modules, "langchain_core", fake_langchain_core)
+    monkeypatch.setitem(sys.modules, "langchain_core.tools", fake_tools_module)
+
+    trajectory_id = "0123456789abcdef0123456789abcdef"
+    run_id = uuid.UUID(hex=trajectory_id)
+    program = module.make_langchain_deep_agents_program(
+        max_turns=12,
+        timeout_seconds=30,
+    )
+    state = FakeState(
+        {
+            "trajectory_id": trajectory_id,
+            "runtime": {"group_key": "group-1"},
+            "info": {"source": "A", "target": "B", "shortest_path": 2},
+            "prompt": [{"role": "user", "content": "start"}],
+        }
+    )
+
+    result = await program({"task_id": "A->B"}, state)
+
+    assert created["name"] == "wikispeedia-navigator"
+    assert captured["payload"] == {"messages": [{"role": "user", "content": "start"}]}
+    assert captured["config"] == {
+        "run_name": "wikispeedia:A->B",
+        "run_id": run_id,
+        "configurable": {"thread_id": trajectory_id},
+        "metadata": {
+            "vf_env": "langchain-deep-agents-wikispeedia",
+            "vf_task_id": "A->B",
+            "vf_trajectory_id": trajectory_id,
+            "vf_group_key": "group-1",
+            "source": "A",
+            "target": "B",
+            "shortest_path": 2,
+        },
+        "tags": ["verifiers", "vf-v1", "langchain-deep-agents-wikispeedia"],
+        "recursion_limit": 12,
+    }
+    assert result["langsmith_run_id"] == str(run_id)
+    assert result["completion"] == [{"role": "assistant", "content": "done"}]
+
+
+@pytest.mark.asyncio
+async def test_lab2_navigation_metrics_use_state_log_when_completion_empty(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    module = load_lab2_module(monkeypatch)
+    task = vf.Task({"prompt": [], "info": {"shortest_path": 1}}).freeze()
+    state = vf.State.for_task(task)
+    state["completion"] = []
+    state[module.NAVIGATION_TOOL_CALLS_KEY] = [
+        {"name": "click_link", "valid": False},
+        {"name": "click_link", "valid": True},
+        {"name": "go_back", "valid": True},
+    ]
+
+    assert await module.total_tool_calls(task, state) == 3.0
+    assert await module.make_tool_count_metric("click_link")(task, state) == 2.0
+    assert await module.make_tool_count_metric("go_back")(task, state) == 1.0
+    assert await module.invalid_link_rate(task, state) == 0.5
diff --git a/tests/test_langchain_deep_agents_wikispeedia.py b/tests/test_langchain_deep_agents_wikispeedia.py
index c0b85b244..a0abba978 100644
--- a/tests/test_langchain_deep_agents_wikispeedia.py
+++ b/tests/test_langchain_deep_agents_wikispeedia.py
@@ -234,6 +234,10 @@ async def test_wikispeedia_tools_resolve_through_v1_runtime(
     assert sorted(tools) == ["click_link", "go_back"]
     assert result.startswith("TARGET REACHED")
     assert state["reached_target"] is True
+    assert state[module.NAVIGATION_TOOL_CALLS_KEY] == [
+        {"name": "click_link", "valid": True}
+    ]
+    assert await module.total_tool_calls(task, state) == 1.0
 
 
 @pytest.mark.asyncio
@@ -479,3 +483,23 @@ async def test_wikispeedia_tool_metrics_use_agent_completion(
 
     assert await module.total_tool_calls(task, state) == 1.0
     assert await module.invalid_link_rate(task, state) == 1.0
+
+
+@pytest.mark.asyncio
+async def test_wikispeedia_navigation_metrics_use_state_log_when_completion_empty(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    module = load_module(monkeypatch)
+    task = vf.Task({"prompt": [], "info": {"shortest_path": 1}}).freeze()
+    state = vf.State.for_task(task)
+    state["completion"] = []
+    state[module.NAVIGATION_TOOL_CALLS_KEY] = [
+        {"name": "click_link", "valid": False},
+        {"name": "click_link", "valid": True},
+        {"name": "go_back", "valid": True},
+    ]
+
+    assert await module.total_tool_calls(task, state) == 3.0
+    assert await module.make_tool_count_metric("click_link")(task, state) == 2.0
+    assert await module.make_tool_count_metric("go_back")(task, state) == 1.0
+    assert await module.invalid_link_rate(task, state) == 0.5

From 0f5801197c6259fa197eb164b8db336b58459b26 Mon Sep 17 00:00:00 2001
From: Cooper Miller <kcoopermiller9@gmail.com>
Date: Tue, 19 May 2026 15:35:17 -0700
Subject: [PATCH 5/5] Remove lab2 Wikispeedia test file

---
 ..._lab2_langchain_deep_agents_wikispeedia.py | 144 ------------------
 1 file changed, 144 deletions(-)
 delete mode 100644 tests/test_lab2_langchain_deep_agents_wikispeedia.py

diff --git a/tests/test_lab2_langchain_deep_agents_wikispeedia.py b/tests/test_lab2_langchain_deep_agents_wikispeedia.py
deleted file mode 100644
index 20a7fcd2e..000000000
--- a/tests/test_lab2_langchain_deep_agents_wikispeedia.py
+++ /dev/null
@@ -1,144 +0,0 @@
-import importlib
-import sys
-import types
-import uuid
-from pathlib import Path
-
-import pytest
-
-import verifiers as vf
-
-
-def load_lab2_module(monkeypatch: pytest.MonkeyPatch):
-    env_dir = (
-        Path(__file__).parents[1]
-        / "lab2"
-        / "environments"
-        / "langchain_deep_agents_wikispeedia"
-    )
-    monkeypatch.syspath_prepend(str(env_dir))
-    sys.modules.pop("langchain_deep_agents_wikispeedia", None)
-    sys.modules.pop("wiki_graph", None)
-    return importlib.import_module("langchain_deep_agents_wikispeedia")
-
-
-@pytest.mark.asyncio
-async def test_lab2_deep_agents_program_passes_langsmith_config(
-    monkeypatch: pytest.MonkeyPatch,
-) -> None:
-    module = load_lab2_module(monkeypatch)
-
-    class GraphRecursionError(Exception):
-        pass
-
-    class FakeState(dict):
-        def get_endpoint_config(self, api: str):
-            return {
-                "model": "model",
-                "api_base": "https://example.invalid/v1",
-                "api_key": "key",
-            }
-
-        def get_tools(self):
-            return {}
-
-        def get_max_turns(self, default: int):
-            return default
-
-        def stop(self, reason: str):
-            self["stop_reason"] = reason
-
-    class FakeChatOpenAI:
-        def __init__(self, **kwargs):
-            self.kwargs = kwargs
-
-    class FakeAgent:
-        async def ainvoke(self, payload, config=None):
-            captured["payload"] = payload
-            captured["config"] = config
-            return {"messages": [{"role": "assistant", "content": "done"}]}
-
-    captured: dict[str, object] = {}
-    created: dict[str, object] = {}
-
-    def fake_create_deep_agent(**kwargs):
-        created.update(kwargs)
-        return FakeAgent()
-
-    fake_deepagents = types.ModuleType("deepagents")
-    fake_langchain_openai = types.ModuleType("langchain_openai")
-    fake_langgraph = types.ModuleType("langgraph")
-    fake_langgraph_errors = types.ModuleType("langgraph.errors")
-    fake_langchain_core = types.ModuleType("langchain_core")
-    fake_tools_module = types.ModuleType("langchain_core.tools")
-
-    fake_deepagents.create_deep_agent = fake_create_deep_agent
-    fake_langchain_openai.ChatOpenAI = FakeChatOpenAI
-    fake_langgraph_errors.GraphRecursionError = GraphRecursionError
-    fake_langgraph.errors = fake_langgraph_errors
-    fake_tools_module.tool = lambda func: func
-    fake_langchain_core.tools = fake_tools_module
-    monkeypatch.setitem(sys.modules, "deepagents", fake_deepagents)
-    monkeypatch.setitem(sys.modules, "langchain_openai", fake_langchain_openai)
-    monkeypatch.setitem(sys.modules, "langgraph", fake_langgraph)
-    monkeypatch.setitem(sys.modules, "langgraph.errors", fake_langgraph_errors)
-    monkeypatch.setitem(sys.modules, "langchain_core", fake_langchain_core)
-    monkeypatch.setitem(sys.modules, "langchain_core.tools", fake_tools_module)
-
-    trajectory_id = "0123456789abcdef0123456789abcdef"
-    run_id = uuid.UUID(hex=trajectory_id)
-    program = module.make_langchain_deep_agents_program(
-        max_turns=12,
-        timeout_seconds=30,
-    )
-    state = FakeState(
-        {
-            "trajectory_id": trajectory_id,
-            "runtime": {"group_key": "group-1"},
-            "info": {"source": "A", "target": "B", "shortest_path": 2},
-            "prompt": [{"role": "user", "content": "start"}],
-        }
-    )
-
-    result = await program({"task_id": "A->B"}, state)
-
-    assert created["name"] == "wikispeedia-navigator"
-    assert captured["payload"] == {"messages": [{"role": "user", "content": "start"}]}
-    assert captured["config"] == {
-        "run_name": "wikispeedia:A->B",
-        "run_id": run_id,
-        "configurable": {"thread_id": trajectory_id},
-        "metadata": {
-            "vf_env": "langchain-deep-agents-wikispeedia",
-            "vf_task_id": "A->B",
-            "vf_trajectory_id": trajectory_id,
-            "vf_group_key": "group-1",
-            "source": "A",
-            "target": "B",
-            "shortest_path": 2,
-        },
-        "tags": ["verifiers", "vf-v1", "langchain-deep-agents-wikispeedia"],
-        "recursion_limit": 12,
-    }
-    assert result["langsmith_run_id"] == str(run_id)
-    assert result["completion"] == [{"role": "assistant", "content": "done"}]
-
-
-@pytest.mark.asyncio
-async def test_lab2_navigation_metrics_use_state_log_when_completion_empty(
-    monkeypatch: pytest.MonkeyPatch,
-) -> None:
-    module = load_lab2_module(monkeypatch)
-    task = vf.Task({"prompt": [], "info": {"shortest_path": 1}}).freeze()
-    state = vf.State.for_task(task)
-    state["completion"] = []
-    state[module.NAVIGATION_TOOL_CALLS_KEY] = [
-        {"name": "click_link", "valid": False},
-        {"name": "click_link", "valid": True},
-        {"name": "go_back", "valid": True},
-    ]
-
-    assert await module.total_tool_calls(task, state) == 3.0
-    assert await module.make_tool_count_metric("click_link")(task, state) == 2.0
-    assert await module.make_tool_count_metric("go_back")(task, state) == 1.0
-    assert await module.invalid_link_rate(task, state) == 0.5