Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 15 additions & 2 deletions environments/langchain_deep_agents_wikispeedia/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,18 @@ Notes:
- The first run downloads ~5MB of SNAP data into `~/.cache/wikispeedia` (override with `cache_dir`).
- Set `OPENAI_API_KEY` (or whatever the policy endpoint expects) for the agent.

### LangSmith tracing

Deep Agents uses LangGraph/LangChain native LangSmith tracing. Enable it with
the standard LangSmith environment variables before running the eval:

```bash
export LANGSMITH_TRACING=true
export LANGSMITH_API_KEY=...
export LANGSMITH_PROJECT=verifiers-wikispeedia
prime eval run langchain-deep-agents-wikispeedia
```

### Taskset Config

| Field | Type | Default | Description |
Expand All @@ -62,15 +74,15 @@ Notes:
| `split_seed` | int | `0` | Seed for deterministic train/eval split. |
| `links_only` | bool | `False` | Render articles as just the link menu (ablation: tests whether the agent navigates from semantic content or link names alone). |
| `allow_go_back` | bool | `True` | Expose the `go_back` tool. |
| `max_turns` | int | `50` | Per-rollout turn cap. |
| `max_turns` | int | `50` | Per-rollout LangGraph recursion limit stored on each task row. This is not a literal model-turn count; Deep Agents may spend multiple graph steps per model/tool cycle. |
| `efficiency_weight` | float | `0.0` | If `> 0`, mix `path_efficiency` into the reward at this weight (a near-optimal route earns up to `1 + efficiency_weight`; a wanderer that reaches the target still earns `1`). Default `0.0` keeps reward as pure binary reachability. |
| `stratify_path_length` | bool | `True` | Take equal counts at each shortest-path bucket inside `[min_path_length, max_path_length]`, capped at the smallest non-empty bucket. The SNAP graph's natural distribution heavily skews toward the lower end of any band (4-6 → 83% sp=4); without stratification the policy over-trains on the trivial floor. Set `False` to recover the natural distribution. |

### Harness Config

| Field | Type | Default | Description |
| --- | ---- | ------- | ----------- |
| `max_turns` | int | `50` | LangChain recursion limit fallback when runtime config does not provide one. |
| `max_turns` | int | `50` | LangGraph recursion limit fallback when runtime config does not provide one. It caps graph execution steps rather than model calls, so it does not map one-to-one to model turns. |
| `timeout_seconds` | float | `1200.0` | Per-rollout wall-clock cap. |

### Metrics
Expand All @@ -91,3 +103,4 @@ Notes:
- By default (`efficiency_weight = 0.0`), reward is `reached_target` only — exact, deterministic, no judge required. The deep-agent structural metrics are zero-weight so they show up in eval tables without shaping the policy.
- `min_path_length=4, max_path_length=6` is the calibrated RL difficulty band for Nemotron-30B-A3B-BF16 — predicted ~0.3-0.4 reach rate, the useful-gradient zone. The 3-5 band landed at 0.61 mean reach (dominated by the trivial sp=3 floor where the deep-agent scaffolding is decorative); the 5-7 band landed at 0.13 with 27% timeouts.
- This is the primary LangChain Deep Agents example because tool use is load-bearing: the model cannot reach the target without invoking `click_link`.
- `max_turns` is passed through to LangGraph as `recursion_limit`. It caps graph execution steps, not model calls, so the observed number of model/tool cycles can be lower than the configured value.
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import asyncio
import json
import os
import uuid
from collections.abc import Awaitable, Callable, Iterator, Mapping, Sequence
from typing import Protocol, cast

Expand Down Expand Up @@ -46,6 +48,9 @@ def system_prompt(allow_go_back: bool = True) -> str:


# Default system prompt, built with system_prompt()'s default arguments. The
# agent program falls back to it when no state-specific prompt is available.
SYSTEM_PROMPT = system_prompt()
# Environment identifier; attached to each agent run as the "vf_env" metadata
# value and as one of the run tags.
ENV_ID = "langchain-deep-agents-wikispeedia"
# Name given to the deep agent when it is created.
AGENT_NAME = "wikispeedia-navigator"
# State key holding the per-rollout list of {"name": ..., "valid": ...} entries
# recorded by the navigation tools.
NAVIGATION_TOOL_CALLS_KEY = "navigation_tool_calls"


class WikispeediaTasksetConfig(vf.TasksetConfig):
Expand Down Expand Up @@ -90,18 +95,28 @@ def format_article(wiki: WikiGraph, article: str, links_only: bool = False) -> s
return f"# {article}\n\n{text}\n\n---\nAvailable links: {links_str}"


def record_navigation_tool_call(state: vf.State, name: str, valid: bool) -> None:
    """Append one navigation tool invocation to the log kept in ``state``.

    The log lives under ``NAVIGATION_TOOL_CALLS_KEY``. If that slot is missing
    or holds something other than a list, a fresh list is installed first.

    Args:
        state: Rollout state; read with ``.get`` and written by item assignment.
        name: Tool name to record (callers pass "click_link" or "go_back").
        valid: Whether the invocation was accepted as a legal move.
    """
    log = state.get(NAVIGATION_TOOL_CALLS_KEY)
    if not isinstance(log, list):
        # Bind the same new list locally and in state so the append below is
        # visible through the state entry.
        log = state[NAVIGATION_TOOL_CALLS_KEY] = []
    log.append(dict(name=name, valid=valid))


async def click_link(article: str, wiki: WikiGraph, state: vf.State) -> str:
"""Navigate to a linked Wikipedia article."""
links_only = bool(state.get("links_only", False))
current = state["current_article"]
available = wiki.get_links(current)
normalized = wiki.normalize_name(article)
if normalized is None or normalized not in available:
record_navigation_tool_call(state, "click_link", valid=False)
avail_str = ", ".join(available) if available else "(none)"
return (
f"'{article}' is not a valid link from '{current}'.\n"
f"Available links: {avail_str}"
)
record_navigation_tool_call(state, "click_link", valid=True)
state["current_article"] = normalized
state["path"].append(normalized)
if normalized == state["info"]["target"]:
Expand All @@ -119,7 +134,9 @@ async def go_back(wiki: WikiGraph, state: vf.State) -> str:
"""Undo the last click_link and return to the previous article."""
path = state["path"]
if len(path) <= 1:
record_navigation_tool_call(state, "go_back", valid=False)
return "You are already at the starting article. Cannot go back."
record_navigation_tool_call(state, "go_back", valid=True)
path.pop()
state["current_article"] = path[-1]
return format_article(
Expand Down Expand Up @@ -163,7 +180,20 @@ async def agent_timeout(task: vf.Task, state: vf.State) -> float:
return 1.0 if state.get("agent_timeout", False) else 0.0


def iter_tool_calls(state: vf.State) -> Iterator[str]:
def has_navigation_tool_log(state: vf.State) -> bool:
    """Return True when ``state`` holds a list under the navigation-log key.

    An empty list still counts as a log being present; a missing key or any
    non-list value does not.
    """
    log = state.get(NAVIGATION_TOOL_CALLS_KEY)
    return isinstance(log, list)


def iter_navigation_tool_calls(state: vf.State) -> Iterator[vf.ConfigMap]:
    """Yield the mapping entries of the navigation tool log stored in ``state``.

    Yields nothing when the log slot is missing or is not a list. Entries that
    are not ``Mapping`` instances are skipped rather than raising.
    """
    log = state.get(NAVIGATION_TOOL_CALLS_KEY)
    if isinstance(log, list):
        yield from (entry for entry in log if isinstance(entry, Mapping))


def iter_completion_tool_calls(state: vf.State) -> Iterator[str]:
completion = state.get("completion") or []
messages = (
vf.get_messages(completion, role="assistant")
Expand All @@ -179,9 +209,26 @@ def iter_tool_calls(state: vf.State) -> Iterator[str]:


def count_tool_calls(state: vf.State, name: str | None = None) -> int:
    """Count tool calls made during a rollout, optionally for one tool name.

    When ``state`` carries a navigation tool log, navigation calls are counted
    from that log, and only tools outside ``WIKISPEEDIA_TOOLS`` are counted
    from the completion messages, so a navigation call is never counted twice.
    Without a log, every count comes from the completion messages.

    Args:
        state: Rollout state holding the navigation log and/or completion.
        name: Tool name to count; ``None`` counts every tool call.

    Returns:
        The number of matching tool calls.
    """
    if not has_navigation_tool_log(state):
        # No state-side log: the assistant messages are the only source.
        return sum(
            1
            for tool_name in iter_completion_tool_calls(state)
            if name is None or tool_name == name
        )

    nav_count = sum(
        1
        for call in iter_navigation_tool_calls(state)
        if name is None or call.get("name") == name
    )
    if name in WIKISPEEDIA_TOOLS:
        # Navigation tools are fully accounted for by the state log.
        return nav_count

    # Add calls to any other tools, which only appear in the completion.
    completion_count = sum(
        1
        for tool_name in iter_completion_tool_calls(state)
        if tool_name not in WIKISPEEDIA_TOOLS
        and (name is None or tool_name == name)
    )
    return nav_count + completion_count


def make_tool_count_metric(
Expand Down Expand Up @@ -232,6 +279,15 @@ async def assistant_turns(task: vf.Task, state: vf.State) -> float:


async def invalid_link_rate(task: vf.Task, state: vf.State) -> float:
if has_navigation_tool_log(state):
click_calls = [
call
for call in iter_navigation_tool_calls(state)
if call.get("name") == "click_link"
]
invalid = sum(1 for call in click_calls if call.get("valid") is False)
return float(invalid / len(click_calls)) if click_calls else 0.0

clicks = 0
invalid = 0
completion = state.get("completion") or []
Expand Down Expand Up @@ -418,6 +474,7 @@ async def run_langchain_deep_agents_wikispeedia_program(
state["reached_target"] = False
state["agent_timeout"] = False
state["links_only"] = bool(task.get("links_only", False))
state[NAVIGATION_TOOL_CALLS_KEY] = []

endpoint_config = state.get_endpoint_config(api="chat")
model = ChatOpenAI(
Expand All @@ -438,12 +495,35 @@ async def run_langchain_deep_agents_wikispeedia_program(
model=model,
tools=nav_tools,
system_prompt=state_system_prompt or SYSTEM_PROMPT,
name=AGENT_NAME,
)
prompt = str(cast(list[vf.ConfigData], state["prompt"])[-1]["content"])
recursion_limit = state.get_max_turns(max_turns)
invoke_config = (
{"recursion_limit": recursion_limit} if recursion_limit > 0 else None
)
runtime = state.get("runtime", {})
runtime = runtime if isinstance(runtime, Mapping) else {}
source = str(state["info"]["source"])
target = str(state["info"]["target"])
trajectory_id = str(state["trajectory_id"])
run_id = uuid.UUID(hex=trajectory_id)
state["langsmith_run_id"] = str(run_id)
invoke_metadata = {
"vf_env": ENV_ID,
"vf_task_id": str(task.get("task_id", "")),
"vf_trajectory_id": trajectory_id,
"vf_group_key": str(runtime.get("group_key", "")),
"source": source,
"target": target,
"shortest_path": int(state["info"]["shortest_path"]),
}
invoke_config: vf.ConfigData = {
"run_name": f"wikispeedia:{source}->{target}",
"run_id": run_id,
"configurable": {"thread_id": trajectory_id},
"metadata": invoke_metadata,
"tags": ["verifiers", "vf-v1", ENV_ID],
}
if recursion_limit > 0:
invoke_config["recursion_limit"] = recursion_limit
invoke = agent.ainvoke(
{"messages": [{"role": "user", "content": prompt}]},
config=invoke_config,
Expand Down Expand Up @@ -560,6 +640,8 @@ def load_harness(config: WikispeediaHarnessConfig) -> WikispeediaHarness:

def load_environment(config: WikispeediaEnvConfig) -> vf.Env:
"""Load the v1 Wikispeedia taskset with a LangChain Deep Agents harness."""
if os.environ.get("LANGSMITH_TRACING") == "true":
vf.ensure_keys(["LANGSMITH_API_KEY"])

return vf.Env(
taskset=load_taskset(config=config.taskset),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ tags = ["v1", "taskset", "harness", "multi-turn", "tool-use", "langchain", "deep
version = "0.1.4"
requires-python = ">=3.11,<3.13"
dependencies = [
"verifiers>=0.1.14",
"verifiers>=0.1.15.dev7",
"datasets",
"deepagents>=0.5.5",
"langgraph",
Expand Down
130 changes: 129 additions & 1 deletion tests/test_langchain_deep_agents_wikispeedia.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import inspect
import sys
import types
import uuid
from pathlib import Path

import pytest
Expand Down Expand Up @@ -233,6 +234,10 @@ async def test_wikispeedia_tools_resolve_through_v1_runtime(
assert sorted(tools) == ["click_link", "go_back"]
assert result.startswith("TARGET REACHED")
assert state["reached_target"] is True
assert state[module.NAVIGATION_TOOL_CALLS_KEY] == [
{"name": "click_link", "valid": True}
]
assert await module.total_tool_calls(task, state) == 1.0


@pytest.mark.asyncio
Expand Down Expand Up @@ -336,7 +341,8 @@ def fake_create_deep_agent(**kwargs):
)
state = FakeState(
{
"info": {"source": "A"},
"trajectory_id": "0123456789abcdef0123456789abcdef",
"info": {"source": "A", "target": "B", "shortest_path": 1},
"prompt": [{"role": "user", "content": "start"}],
"system_prompt": [
{"role": "user", "content": "first prompt chunk"},
Expand All @@ -353,6 +359,108 @@ def fake_create_deep_agent(**kwargs):
assert result["agent_completion"] == []


@pytest.mark.asyncio
async def test_wikispeedia_deep_agents_program_passes_langsmith_config(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Check the run config the program hands to the agent's ``ainvoke``.

    Every LangChain/LangGraph/Deep Agents import is replaced with a stub module,
    so the test only observes what the program passes to ``create_deep_agent``
    and ``ainvoke``: agent name, run name, run id, thread id, metadata, tags and
    recursion limit.
    """
    module = load_module(monkeypatch)

    # Stand-in for langgraph.errors.GraphRecursionError, which the program imports.
    class GraphRecursionError(Exception):
        pass

    # Minimal dict-backed state exposing the accessor methods the program calls.
    class FakeState(dict):
        def get_endpoint_config(self, api: str):
            return {
                "model": "model",
                "api_base": "https://example.invalid/v1",
                "api_key": "key",
            }

        def get_tools(self):
            return {}

        def get_max_turns(self, default: int):
            # Echo the default so the program-level max_turns (12 below)
            # becomes the recursion limit asserted at the end.
            return default

        def stop(self, reason: str):
            self["stop_reason"] = reason

    class FakeChatOpenAI:
        def __init__(self, **kwargs):
            self.kwargs = kwargs

    class FakeAgent:
        async def ainvoke(self, payload, config=None):
            # Record exactly what the program sent so it can be asserted on.
            captured["payload"] = payload
            captured["config"] = config
            return {"messages": [{"role": "assistant", "content": "done"}]}

    # Filled in by FakeAgent.ainvoke and fake_create_deep_agent respectively.
    captured: dict[str, object] = {}
    created: dict[str, object] = {}

    def fake_create_deep_agent(**kwargs):
        created.update(kwargs)
        return FakeAgent()

    # Build stub modules for every third-party import the program performs.
    fake_deepagents = types.ModuleType("deepagents")
    fake_langchain_openai = types.ModuleType("langchain_openai")
    fake_langgraph = types.ModuleType("langgraph")
    fake_langgraph_errors = types.ModuleType("langgraph.errors")
    fake_langchain_core = types.ModuleType("langchain_core")
    fake_tools_module = types.ModuleType("langchain_core.tools")

    fake_deepagents.create_deep_agent = fake_create_deep_agent
    fake_langchain_openai.ChatOpenAI = FakeChatOpenAI
    fake_langgraph_errors.GraphRecursionError = GraphRecursionError
    fake_langgraph.errors = fake_langgraph_errors
    # The stub @tool decorator is the identity: tools stay plain functions.
    fake_tools_module.tool = lambda func: func
    fake_langchain_core.tools = fake_tools_module
    # monkeypatch.setitem restores sys.modules when the test finishes.
    monkeypatch.setitem(sys.modules, "deepagents", fake_deepagents)
    monkeypatch.setitem(sys.modules, "langchain_openai", fake_langchain_openai)
    monkeypatch.setitem(sys.modules, "langgraph", fake_langgraph)
    monkeypatch.setitem(sys.modules, "langgraph.errors", fake_langgraph_errors)
    monkeypatch.setitem(sys.modules, "langchain_core", fake_langchain_core)
    monkeypatch.setitem(sys.modules, "langchain_core.tools", fake_tools_module)

    # A 32-character hex string, so uuid.UUID(hex=...) can parse it.
    trajectory_id = "0123456789abcdef0123456789abcdef"
    run_id = uuid.UUID(hex=trajectory_id)
    program = module.make_langchain_deep_agents_program(
        max_turns=12,
        timeout_seconds=30,
    )
    state = FakeState(
        {
            "trajectory_id": trajectory_id,
            "runtime": {"group_key": "group-1"},
            "info": {"source": "A", "target": "B", "shortest_path": 2},
            "prompt": [{"role": "user", "content": "start"}],
        }
    )

    result = await program({"task_id": "A->B"}, state)

    assert created["name"] == "wikispeedia-navigator"
    assert captured["payload"] == {"messages": [{"role": "user", "content": "start"}]}
    assert captured["config"] == {
        "run_name": "wikispeedia:A->B",
        "run_id": run_id,
        "configurable": {"thread_id": trajectory_id},
        "metadata": {
            "vf_env": "langchain-deep-agents-wikispeedia",
            "vf_task_id": "A->B",
            "vf_trajectory_id": trajectory_id,
            "vf_group_key": "group-1",
            "source": "A",
            "target": "B",
            "shortest_path": 2,
        },
        "tags": ["verifiers", "vf-v1", "langchain-deep-agents-wikispeedia"],
        "recursion_limit": 12,
    }
    # The run id is also written to state in string form.
    assert result["langsmith_run_id"] == str(run_id)
    assert result["completion"] == [{"role": "assistant", "content": "done"}]


@pytest.mark.asyncio
async def test_wikispeedia_tool_metrics_use_agent_completion(
monkeypatch: pytest.MonkeyPatch,
Expand All @@ -375,3 +483,23 @@ async def test_wikispeedia_tool_metrics_use_agent_completion(

assert await module.total_tool_calls(task, state) == 1.0
assert await module.invalid_link_rate(task, state) == 1.0


@pytest.mark.asyncio
async def test_wikispeedia_navigation_metrics_use_state_log_when_completion_empty(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Metrics must come from the state-side navigation log alone.

    The completion is empty, so every count below can only be produced from
    the log entries: two click_link calls (one invalid) and one go_back.
    """
    module = load_module(monkeypatch)
    task = vf.Task({"prompt": [], "info": {"shortest_path": 1}}).freeze()
    state = vf.State.for_task(task)
    state["completion"] = []
    recorded_calls = (
        ("click_link", False),
        ("click_link", True),
        ("go_back", True),
    )
    state[module.NAVIGATION_TOOL_CALLS_KEY] = [
        {"name": tool_name, "valid": was_valid}
        for tool_name, was_valid in recorded_calls
    ]

    total = await module.total_tool_calls(task, state)
    assert total == 3.0

    click_total = await module.make_tool_count_metric("click_link")(task, state)
    assert click_total == 2.0

    go_back_total = await module.make_tool_count_metric("go_back")(task, state)
    assert go_back_total == 1.0

    invalid_rate = await module.invalid_link_rate(task, state)
    assert invalid_rate == 0.5
Loading