PrimeIntellect-ai · kcoopermiller · May 19, 2026 · May 19, 2026 · May 19, 2026 · May 19, 2026
diff --git a/environments/langchain_deep_agents_wikispeedia/README.md b/environments/langchain_deep_agents_wikispeedia/README.md
@@ -49,6 +49,18 @@ Notes:
 - The first run downloads ~5MB of SNAP data into `~/.cache/wikispeedia` (override with `cache_dir`).
 - Set `OPENAI_API_KEY` (or whatever the policy endpoint expects) for the agent.
 
+### LangSmith tracing
+
+Deep Agents runs on LangGraph/LangChain, which trace to LangSmith natively. Enable
+tracing with the standard LangSmith environment variables before running the eval:
+
+```bash
+export LANGSMITH_TRACING=true
+export LANGSMITH_API_KEY=...
+export LANGSMITH_PROJECT=verifiers-wikispeedia
+prime eval run langchain-deep-agents-wikispeedia
+```
+
 ### Taskset Config
 
 | Field | Type | Default | Description |
@@ -62,15 +74,15 @@ Notes:
 | `split_seed` | int | `0` | Seed for deterministic train/eval split. |
 | `links_only` | bool | `False` | Render articles as just the link menu (ablation: tests whether the agent navigates from semantic content or link names alone). |
 | `allow_go_back` | bool | `True` | Expose the `go_back` tool. |
-| `max_turns` | int | `50` | Per-rollout turn cap. |
+| `max_turns` | int | `50` | Per-rollout LangGraph recursion limit stored on each task row. This is not a literal model-turn count; Deep Agents may spend multiple graph steps per model/tool cycle. |
 | `efficiency_weight` | float | `0.0` | If `> 0`, mix `path_efficiency` into the reward at this weight (a near-optimal route earns up to `1 + efficiency_weight`; a wanderer that reaches the target still earns `1`). Default `0.0` keeps reward as pure binary reachability. |
 | `stratify_path_length` | bool | `True` | Take equal counts at each shortest-path bucket inside `[min_path_length, max_path_length]`, capped at the smallest non-empty bucket. The SNAP graph's natural distribution heavily skews toward the lower end of any band (4-6 → 83% sp=4); without stratification the policy over-trains on the trivial floor. Set `False` to recover the natural distribution. |
 
 ### Harness Config
 
 | Field | Type | Default | Description |
 | --- | ---- | ------- | ----------- |
-| `max_turns` | int | `50` | LangChain recursion limit fallback when runtime config does not provide one. |
+| `max_turns` | int | `50` | LangGraph recursion limit fallback when runtime config does not provide one. This is not a model-turn count; one model/tool cycle can consume several graph steps. |
 | `timeout_seconds` | float | `1200.0` | Per-rollout wall-clock cap. |
 
 ### Metrics
@@ -91,3 +103,4 @@ Notes:
 - Reward is `reached_target` only — exact, deterministic, no judge required. The deep-agent structural metrics are zero-weight so they show up in eval tables without shaping the policy.
 - `min_path_length=4, max_path_length=6` is the calibrated RL difficulty band for Nemotron-30B-A3B-BF16 — predicted ~0.3-0.4 reach rate, the useful-gradient zone. The 3-5 band landed at 0.61 mean reach (dominated by the trivial sp=3 floor where the deep-agent scaffolding is decorative); the 5-7 band landed at 0.13 with 27% timeouts.
 - This is the primary LangChain Deep Agents example because tool use is load-bearing: the model cannot reach the target without invoking `click_link`.
+- `max_turns` is passed through to LangGraph as `recursion_limit`. It caps graph execution steps, not model calls, so the observed number of model/tool cycles can be lower than the configured value.
diff --git a/environments/langchain_deep_agents_wikispeedia/langchain_deep_agents_wikispeedia.py b/environments/langchain_deep_agents_wikispeedia/langchain_deep_agents_wikispeedia.py
@@ -1,5 +1,7 @@
 import asyncio
 import json
+import os
+import uuid
 from collections.abc import Awaitable, Callable, Iterator, Mapping, Sequence
 from typing import Protocol, cast
 
@@ -46,6 +48,9 @@ def system_prompt(allow_go_back: bool = True) -> str:
 
 
 SYSTEM_PROMPT = system_prompt()
+ENV_ID = "langchain-deep-agents-wikispeedia"
+AGENT_NAME = "wikispeedia-navigator"
+NAVIGATION_TOOL_CALLS_KEY = "navigation_tool_calls"
 
 
 class WikispeediaTasksetConfig(vf.TasksetConfig):
@@ -90,18 +95,28 @@ def format_article(wiki: WikiGraph, article: str, links_only: bool = False) -> s
     return f"# {article}\n\n{text}\n\n---\nAvailable links: {links_str}"
 
 
+def record_navigation_tool_call(state: vf.State, name: str, valid: bool) -> None:
+    """Append a {"name", "valid"} entry to the navigation log kept on state."""
+    log = state.get(NAVIGATION_TOOL_CALLS_KEY)
+    if not isinstance(log, list):
+        log = state[NAVIGATION_TOOL_CALLS_KEY] = []
+    log.append({"name": name, "valid": valid})
+
+
 async def click_link(article: str, wiki: WikiGraph, state: vf.State) -> str:
     """Navigate to a linked Wikipedia article."""
     links_only = bool(state.get("links_only", False))
     current = state["current_article"]
     available = wiki.get_links(current)
     normalized = wiki.normalize_name(article)
     if normalized is None or normalized not in available:
+        record_navigation_tool_call(state, "click_link", valid=False)
         avail_str = ", ".join(available) if available else "(none)"
         return (
             f"'{article}' is not a valid link from '{current}'.\n"
             f"Available links: {avail_str}"
         )
+    record_navigation_tool_call(state, "click_link", valid=True)
     state["current_article"] = normalized
     state["path"].append(normalized)
     if normalized == state["info"]["target"]:
@@ -119,7 +134,9 @@ async def go_back(wiki: WikiGraph, state: vf.State) -> str:
     """Undo the last click_link and return to the previous article."""
     path = state["path"]
     if len(path) <= 1:
+        record_navigation_tool_call(state, "go_back", valid=False)
         return "You are already at the starting article. Cannot go back."
+    record_navigation_tool_call(state, "go_back", valid=True)
     path.pop()
     state["current_article"] = path[-1]
     return format_article(
@@ -163,7 +180,20 @@ async def agent_timeout(task: vf.Task, state: vf.State) -> float:
     return 1.0 if state.get("agent_timeout", False) else 0.0
 
 
-def iter_tool_calls(state: vf.State) -> Iterator[str]:
+def has_navigation_tool_log(state: vf.State) -> bool:
+    return isinstance(state.get(NAVIGATION_TOOL_CALLS_KEY), list)  # an empty list still counts as a log
+
+
+def iter_navigation_tool_calls(state: vf.State) -> Iterator[vf.ConfigMap]:
+    """Yield the Mapping entries of the navigation log; yield nothing without one."""
+    log = state.get(NAVIGATION_TOOL_CALLS_KEY)
+    entries = log if isinstance(log, list) else []
+    yield from (
+        entry for entry in entries if isinstance(entry, Mapping)
+    )
+
+
+def iter_completion_tool_calls(state: vf.State) -> Iterator[str]:
     completion = state.get("completion") or []
     messages = (
         vf.get_messages(completion, role="assistant")
@@ -179,9 +209,26 @@ def iter_tool_calls(state: vf.State) -> Iterator[str]:
 
 
 def count_tool_calls(state: vf.State, name: str | None = None) -> int:
+    if has_navigation_tool_log(state):  # state-side log takes precedence over the completion transcript
+        nav_count = sum(
+            1
+            for call in iter_navigation_tool_calls(state)
+            if name is None or call.get("name") == name
+        )
+        if name in WIKISPEEDIA_TOOLS:  # names in WIKISPEEDIA_TOOLS are counted from the log only
+            return nav_count
+        completion_count = sum(  # WIKISPEEDIA_TOOLS names are skipped here: already counted from the log
+            1
+            for tool_name in iter_completion_tool_calls(state)
+            if tool_name not in WIKISPEEDIA_TOOLS
+            and (name is None or tool_name == name)
+        )
+        return nav_count + completion_count
     if name is None:
-        return sum(1 for _ in iter_tool_calls(state))
-    return sum(1 for tool_name in iter_tool_calls(state) if tool_name == name)
+        return sum(1 for _ in iter_completion_tool_calls(state))
+    return sum(
+        1 for tool_name in iter_completion_tool_calls(state) if tool_name == name
+    )
 
 
 def make_tool_count_metric(
@@ -232,6 +279,15 @@ async def assistant_turns(task: vf.Task, state: vf.State) -> float:
 
 
 async def invalid_link_rate(task: vf.Task, state: vf.State) -> float:
+    if has_navigation_tool_log(state):
+        click_calls = [
+            call
+            for call in iter_navigation_tool_calls(state)
+            if call.get("name") == "click_link"
+        ]
+        invalid = sum(1 for call in click_calls if call.get("valid") is False)
+        return float(invalid / len(click_calls)) if click_calls else 0.0
+
     clicks = 0
     invalid = 0
     completion = state.get("completion") or []
@@ -418,6 +474,7 @@ async def run_langchain_deep_agents_wikispeedia_program(
         state["reached_target"] = False
         state["agent_timeout"] = False
         state["links_only"] = bool(task.get("links_only", False))
+        state[NAVIGATION_TOOL_CALLS_KEY] = []
 
         endpoint_config = state.get_endpoint_config(api="chat")
         model = ChatOpenAI(
@@ -438,12 +495,35 @@ async def run_langchain_deep_agents_wikispeedia_program(
             model=model,
             tools=nav_tools,
             system_prompt=state_system_prompt or SYSTEM_PROMPT,
+            name=AGENT_NAME,
         )
         prompt = str(cast(list[vf.ConfigData], state["prompt"])[-1]["content"])
         recursion_limit = state.get_max_turns(max_turns)
-        invoke_config = (
-            {"recursion_limit": recursion_limit} if recursion_limit > 0 else None
-        )
+        runtime = state.get("runtime", {})
+        runtime = runtime if isinstance(runtime, Mapping) else {}
+        source = str(state["info"]["source"])
+        target = str(state["info"]["target"])
+        trajectory_id = str(state["trajectory_id"])
+        run_id = uuid.UUID(hex=trajectory_id)
+        state["langsmith_run_id"] = str(run_id)
+        invoke_metadata = {
+            "vf_env": ENV_ID,
+            "vf_task_id": str(task.get("task_id", "")),
+            "vf_trajectory_id": trajectory_id,
+            "vf_group_key": str(runtime.get("group_key", "")),
+            "source": source,
+            "target": target,
+            "shortest_path": int(state["info"]["shortest_path"]),
+        }
+        invoke_config: vf.ConfigData = {
+            "run_name": f"wikispeedia:{source}->{target}",
+            "run_id": run_id,
+            "configurable": {"thread_id": trajectory_id},
+            "metadata": invoke_metadata,
+            "tags": ["verifiers", "vf-v1", ENV_ID],
+        }
+        if recursion_limit > 0:
+            invoke_config["recursion_limit"] = recursion_limit
         invoke = agent.ainvoke(
             {"messages": [{"role": "user", "content": prompt}]},
             config=invoke_config,
@@ -560,6 +640,8 @@ def load_harness(config: WikispeediaHarnessConfig) -> WikispeediaHarness:
 
 def load_environment(config: WikispeediaEnvConfig) -> vf.Env:
     """Load the v1 Wikispeedia taskset with a LangChain Deep Agents harness."""
+    if os.environ.get("LANGSMITH_TRACING") == "true":
+        vf.ensure_keys(["LANGSMITH_API_KEY"])
 
     return vf.Env(
         taskset=load_taskset(config=config.taskset),

diff --git a/environments/langchain_deep_agents_wikispeedia/pyproject.toml b/environments/langchain_deep_agents_wikispeedia/pyproject.toml
@@ -5,7 +5,7 @@ tags = ["v1", "taskset", "harness", "multi-turn", "tool-use", "langchain", "deep
 version = "0.1.4"
 requires-python = ">=3.11,<3.13"
 dependencies = [
-    "verifiers>=0.1.14",
+    "verifiers>=0.1.15.dev7",
     "datasets",
     "deepagents>=0.5.5",
     "langgraph",

diff --git a/tests/test_langchain_deep_agents_wikispeedia.py b/tests/test_langchain_deep_agents_wikispeedia.py
@@ -2,6 +2,7 @@
 import inspect
 import sys
 import types
+import uuid
 from pathlib import Path
 
 import pytest
@@ -233,6 +234,10 @@ async def test_wikispeedia_tools_resolve_through_v1_runtime(
     assert sorted(tools) == ["click_link", "go_back"]
     assert result.startswith("TARGET REACHED")
     assert state["reached_target"] is True
+    assert state[module.NAVIGATION_TOOL_CALLS_KEY] == [
+        {"name": "click_link", "valid": True}
+    ]
+    assert await module.total_tool_calls(task, state) == 1.0
 
 
 @pytest.mark.asyncio
@@ -336,7 +341,8 @@ def fake_create_deep_agent(**kwargs):
     )
     state = FakeState(
         {
-            "info": {"source": "A"},
+            "trajectory_id": "0123456789abcdef0123456789abcdef",
+            "info": {"source": "A", "target": "B", "shortest_path": 1},
             "prompt": [{"role": "user", "content": "start"}],
             "system_prompt": [
                 {"role": "user", "content": "first prompt chunk"},
@@ -353,6 +359,108 @@ def fake_create_deep_agent(**kwargs):
     assert result["agent_completion"] == []
 
 
+@pytest.mark.asyncio
+async def test_wikispeedia_deep_agents_program_passes_langsmith_config(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    module = load_module(monkeypatch)
+
+    class GraphRecursionError(Exception):  # stand-in exported by the fake langgraph.errors module below
+        pass
+
+    class FakeState(dict):
+        def get_endpoint_config(self, api: str):
+            return {
+                "model": "model",
+                "api_base": "https://example.invalid/v1",
+                "api_key": "key",
+            }
+
+        def get_tools(self):
+            return {}
+
+        def get_max_turns(self, default: int):
+            return default  # no runtime override: the caller-supplied default is used
+
+        def stop(self, reason: str):
+            self["stop_reason"] = reason
+
+    class FakeChatOpenAI:
+        def __init__(self, **kwargs):
+            self.kwargs = kwargs
+
+    class FakeAgent:
+        async def ainvoke(self, payload, config=None):
+            captured["payload"] = payload  # recorded so the assertions below can inspect the call
+            captured["config"] = config
+            return {"messages": [{"role": "assistant", "content": "done"}]}
+
+    captured: dict[str, object] = {}
+    created: dict[str, object] = {}
+
+    def fake_create_deep_agent(**kwargs):
+        created.update(kwargs)
+        return FakeAgent()
+
+    fake_deepagents = types.ModuleType("deepagents")
+    fake_langchain_openai = types.ModuleType("langchain_openai")
+    fake_langgraph = types.ModuleType("langgraph")
+    fake_langgraph_errors = types.ModuleType("langgraph.errors")
+    fake_langchain_core = types.ModuleType("langchain_core")
+    fake_tools_module = types.ModuleType("langchain_core.tools")
+
+    fake_deepagents.create_deep_agent = fake_create_deep_agent
+    fake_langchain_openai.ChatOpenAI = FakeChatOpenAI
+    fake_langgraph_errors.GraphRecursionError = GraphRecursionError
+    fake_langgraph.errors = fake_langgraph_errors
+    fake_tools_module.tool = lambda func: func  # identity stand-in for langchain_core.tools.tool
+    fake_langchain_core.tools = fake_tools_module
+    monkeypatch.setitem(sys.modules, "deepagents", fake_deepagents)
+    monkeypatch.setitem(sys.modules, "langchain_openai", fake_langchain_openai)
+    monkeypatch.setitem(sys.modules, "langgraph", fake_langgraph)
+    monkeypatch.setitem(sys.modules, "langgraph.errors", fake_langgraph_errors)
+    monkeypatch.setitem(sys.modules, "langchain_core", fake_langchain_core)
+    monkeypatch.setitem(sys.modules, "langchain_core.tools", fake_tools_module)
+
+    trajectory_id = "0123456789abcdef0123456789abcdef"
+    run_id = uuid.UUID(hex=trajectory_id)  # expected run id: the trajectory id parsed as UUID hex
+    program = module.make_langchain_deep_agents_program(
+        max_turns=12,
+        timeout_seconds=30,
+    )
+    state = FakeState(
+        {
+            "trajectory_id": trajectory_id,
+            "runtime": {"group_key": "group-1"},
+            "info": {"source": "A", "target": "B", "shortest_path": 2},
+            "prompt": [{"role": "user", "content": "start"}],
+        }
+    )
+
+    result = await program({"task_id": "A->B"}, state)
+
+    assert created["name"] == "wikispeedia-navigator"
+    assert captured["payload"] == {"messages": [{"role": "user", "content": "start"}]}
+    assert captured["config"] == {
+        "run_name": "wikispeedia:A->B",
+        "run_id": run_id,
+        "configurable": {"thread_id": trajectory_id},
+        "metadata": {
+            "vf_env": "langchain-deep-agents-wikispeedia",
+            "vf_task_id": "A->B",
+            "vf_trajectory_id": trajectory_id,
+            "vf_group_key": "group-1",
+            "source": "A",
+            "target": "B",
+            "shortest_path": 2,
+        },
+        "tags": ["verifiers", "vf-v1", "langchain-deep-agents-wikispeedia"],
+        "recursion_limit": 12,  # equals max_turns given to the program factory above
+    }
+    assert result["langsmith_run_id"] == str(run_id)
+    assert result["completion"] == [{"role": "assistant", "content": "done"}]
+
+
 @pytest.mark.asyncio
 async def test_wikispeedia_tool_metrics_use_agent_completion(
     monkeypatch: pytest.MonkeyPatch,
@@ -375,3 +483,23 @@ async def test_wikispeedia_tool_metrics_use_agent_completion(
 
     assert await module.total_tool_calls(task, state) == 1.0
     assert await module.invalid_link_rate(task, state) == 1.0
+
+
+@pytest.mark.asyncio
+async def test_wikispeedia_navigation_metrics_use_state_log_when_completion_empty(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    module = load_module(monkeypatch)
+    task = vf.Task({"prompt": [], "info": {"shortest_path": 1}}).freeze()
+    state = vf.State.for_task(task)
+    state["completion"] = []  # empty transcript: counts must come from the state log
+    state[module.NAVIGATION_TOOL_CALLS_KEY] = [
+        {"name": "click_link", "valid": False},
+        {"name": "click_link", "valid": True},
+        {"name": "go_back", "valid": True},
+    ]
+
+    assert await module.total_tool_calls(task, state) == 3.0
+    assert await module.make_tool_count_metric("click_link")(task, state) == 2.0
+    assert await module.make_tool_count_metric("go_back")(task, state) == 1.0
+    assert await module.invalid_link_rate(task, state) == 0.5  # 1 invalid out of 2 click_link entries