From 0aa45347be0ef672f707432648a77b5251359bd1 Mon Sep 17 00:00:00 2001 From: Cooper Miller Date: Mon, 18 May 2026 23:36:20 -0700 Subject: [PATCH 1/5] langsmith --- .../README.md | 12 ++ .../langchain_deep_agents_wikispeedia.py | 35 +++++- .../test_langchain_deep_agents_wikispeedia.py | 106 +++++++++++++++++- 3 files changed, 149 insertions(+), 4 deletions(-) diff --git a/environments/langchain_deep_agents_wikispeedia/README.md b/environments/langchain_deep_agents_wikispeedia/README.md index 7fbec5fce..aa51515ab 100644 --- a/environments/langchain_deep_agents_wikispeedia/README.md +++ b/environments/langchain_deep_agents_wikispeedia/README.md @@ -49,6 +49,18 @@ Notes: - The first run downloads ~5MB of SNAP data into `~/.cache/wikispeedia` (override with `cache_dir`). - Set `OPENAI_API_KEY` (or whatever the policy endpoint expects) for the agent. +### LangSmith tracing + +Deep Agents uses LangGraph/LangChain native LangSmith tracing. Enable it with +the standard LangSmith environment variables before running the eval: + +```bash +export LANGSMITH_TRACING=true +export LANGSMITH_API_KEY=... +export LANGSMITH_PROJECT=verifiers-wikispeedia +prime eval run langchain-deep-agents-wikispeedia +``` + ### Taskset Config | Field | Type | Default | Description | diff --git a/environments/langchain_deep_agents_wikispeedia/langchain_deep_agents_wikispeedia.py b/environments/langchain_deep_agents_wikispeedia/langchain_deep_agents_wikispeedia.py index 7da17fef9..d31888e78 100644 --- a/environments/langchain_deep_agents_wikispeedia/langchain_deep_agents_wikispeedia.py +++ b/environments/langchain_deep_agents_wikispeedia/langchain_deep_agents_wikispeedia.py @@ -1,5 +1,7 @@ import asyncio import json +import os +import uuid from collections.abc import Awaitable, Callable, Iterator, Mapping, Sequence from typing import Protocol, cast @@ -46,6 +48,8 @@ def system_prompt(allow_go_back: bool = True) -> str: SYSTEM_PROMPT = system_prompt() +ENV_ID = "langchain-deep-agents-wikispeedia" +AGENT_NAME = "wikispeedia-navigator" class WikispeediaTasksetConfig(vf.TasksetConfig): @@ -438,12 +442,35 @@ async def run_langchain_deep_agents_wikispeedia_program( model=model, tools=nav_tools, system_prompt=state_system_prompt or SYSTEM_PROMPT, + name=AGENT_NAME, ) prompt = str(cast(list[vf.ConfigData], state["prompt"])[-1]["content"]) recursion_limit = state.get_max_turns(max_turns) - invoke_config = ( - {"recursion_limit": recursion_limit} if recursion_limit > 0 else None - ) + runtime = state.get("runtime", {}) + runtime = runtime if isinstance(runtime, Mapping) else {} + source = str(state["info"]["source"]) + target = str(state["info"]["target"]) + trajectory_id = str(state["trajectory_id"]) + run_id = uuid.UUID(hex=trajectory_id) + state["langsmith_run_id"] = str(run_id) + invoke_metadata = { + "vf_env": ENV_ID, + "vf_task_id": str(task.get("task_id", "")), + "vf_trajectory_id": trajectory_id, + "vf_group_key": str(runtime.get("group_key", "")), + "source": source, + "target": target, + "shortest_path": int(state["info"]["shortest_path"]), + } + invoke_config: dict[str, object] = { + "run_name": f"wikispeedia:{source}->{target}", + "run_id": run_id, + "configurable": {"thread_id": trajectory_id}, + "metadata": invoke_metadata, + "tags": ["verifiers", "vf-v1", ENV_ID], + } + if recursion_limit > 0: + invoke_config["recursion_limit"] = recursion_limit invoke = agent.ainvoke( {"messages": [{"role": "user", "content": prompt}]}, config=invoke_config, @@ -560,6 +587,8 @@ def load_harness(config: WikispeediaHarnessConfig) -> WikispeediaHarness: def load_environment(config: WikispeediaEnvConfig) -> vf.Env: """Load the v1 Wikispeedia taskset with a LangChain Deep Agents harness.""" + if os.environ.get("LANGSMITH_TRACING") == "true": + vf.ensure_keys(["LANGSMITH_API_KEY"]) return vf.Env( taskset=load_taskset(config=config.taskset), diff --git a/tests/test_langchain_deep_agents_wikispeedia.py b/tests/test_langchain_deep_agents_wikispeedia.py index 8ca55bea4..c0b85b244 100644 --- a/tests/test_langchain_deep_agents_wikispeedia.py +++ b/tests/test_langchain_deep_agents_wikispeedia.py @@ -2,6 +2,7 @@ import inspect import sys import types +import uuid from pathlib import Path import pytest @@ -336,7 +337,8 @@ def fake_create_deep_agent(**kwargs): ) state = FakeState( { - "info": {"source": "A"}, + "trajectory_id": "0123456789abcdef0123456789abcdef", + "info": {"source": "A", "target": "B", "shortest_path": 1}, "prompt": [{"role": "user", "content": "start"}], "system_prompt": [ {"role": "user", "content": "first prompt chunk"}, @@ -353,6 +355,108 @@ def fake_create_deep_agent(**kwargs): assert result["agent_completion"] == [] +@pytest.mark.asyncio +async def test_wikispeedia_deep_agents_program_passes_langsmith_config( + monkeypatch: pytest.MonkeyPatch, +) -> None: + module = load_module(monkeypatch) + + class GraphRecursionError(Exception): + pass + + class FakeState(dict): + def get_endpoint_config(self, api: str): + return { + "model": "model", + "api_base": "https://example.invalid/v1", + "api_key": "key", + } + + def get_tools(self): + return {} + + def get_max_turns(self, default: int): + return default + + def stop(self, reason: str): + self["stop_reason"] = reason + + class FakeChatOpenAI: + def __init__(self, **kwargs): + self.kwargs = kwargs + + class FakeAgent: + async def ainvoke(self, payload, config=None): + captured["payload"] = payload + captured["config"] = config + return {"messages": [{"role": "assistant", "content": "done"}]} + + captured: dict[str, object] = {} + created: dict[str, object] = {} + + def fake_create_deep_agent(**kwargs): + created.update(kwargs) + return FakeAgent() + + fake_deepagents = types.ModuleType("deepagents") + fake_langchain_openai = types.ModuleType("langchain_openai") + fake_langgraph = types.ModuleType("langgraph") + fake_langgraph_errors = types.ModuleType("langgraph.errors") + fake_langchain_core = types.ModuleType("langchain_core") + fake_tools_module = types.ModuleType("langchain_core.tools") + + fake_deepagents.create_deep_agent = fake_create_deep_agent + fake_langchain_openai.ChatOpenAI = FakeChatOpenAI + fake_langgraph_errors.GraphRecursionError = GraphRecursionError + fake_langgraph.errors = fake_langgraph_errors + fake_tools_module.tool = lambda func: func + fake_langchain_core.tools = fake_tools_module + monkeypatch.setitem(sys.modules, "deepagents", fake_deepagents) + monkeypatch.setitem(sys.modules, "langchain_openai", fake_langchain_openai) + monkeypatch.setitem(sys.modules, "langgraph", fake_langgraph) + monkeypatch.setitem(sys.modules, "langgraph.errors", fake_langgraph_errors) + monkeypatch.setitem(sys.modules, "langchain_core", fake_langchain_core) + monkeypatch.setitem(sys.modules, "langchain_core.tools", fake_tools_module) + + trajectory_id = "0123456789abcdef0123456789abcdef" + run_id = uuid.UUID(hex=trajectory_id) + program = module.make_langchain_deep_agents_program( + max_turns=12, + timeout_seconds=30, + ) + state = FakeState( + { + "trajectory_id": trajectory_id, + "runtime": {"group_key": "group-1"}, + "info": {"source": "A", "target": "B", "shortest_path": 2}, + "prompt": [{"role": "user", "content": "start"}], + } + ) + + result = await program({"task_id": "A->B"}, state) + + assert created["name"] == "wikispeedia-navigator" + assert captured["payload"] == {"messages": [{"role": "user", "content": "start"}]} + assert captured["config"] == { + "run_name": "wikispeedia:A->B", + "run_id": run_id, + "configurable": {"thread_id": trajectory_id}, + "metadata": { + "vf_env": "langchain-deep-agents-wikispeedia", + "vf_task_id": "A->B", + "vf_trajectory_id": trajectory_id, + "vf_group_key": "group-1", + "source": "A", + "target": "B", + "shortest_path": 2, + }, + "tags": ["verifiers", "vf-v1", "langchain-deep-agents-wikispeedia"], + "recursion_limit": 12, + } + assert result["langsmith_run_id"] == str(run_id) + assert result["completion"] == [{"role": "assistant", "content": "done"}] + + @pytest.mark.asyncio async def test_wikispeedia_tool_metrics_use_agent_completion( monkeypatch: pytest.MonkeyPatch, From 64e86af626441878e57b14a848a84df2b928f687 Mon Sep 17 00:00:00 2001 From: Cooper Miller Date: Mon, 18 May 2026 23:44:48 -0700 Subject: [PATCH 2/5] invoke_config type --- .../langchain_deep_agents_wikispeedia.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/langchain_deep_agents_wikispeedia/langchain_deep_agents_wikispeedia.py b/environments/langchain_deep_agents_wikispeedia/langchain_deep_agents_wikispeedia.py index d31888e78..826ed8071 100644 --- a/environments/langchain_deep_agents_wikispeedia/langchain_deep_agents_wikispeedia.py +++ b/environments/langchain_deep_agents_wikispeedia/langchain_deep_agents_wikispeedia.py @@ -462,7 +462,7 @@ async def run_langchain_deep_agents_wikispeedia_program( "target": target, "shortest_path": int(state["info"]["shortest_path"]), } - invoke_config: dict[str, object] = { + invoke_config: vf.ConfigData = { "run_name": f"wikispeedia:{source}->{target}", "run_id": run_id, "configurable": {"thread_id": trajectory_id}, From e919d273cc5d6f8f210fd01c759893a7263ecafd Mon Sep 17 00:00:00 2001 From: Cooper Miller Date: Tue, 19 May 2026 12:15:40 -0700 Subject: [PATCH 3/5] bump verifiers --- environments/langchain_deep_agents_wikispeedia/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/langchain_deep_agents_wikispeedia/pyproject.toml b/environments/langchain_deep_agents_wikispeedia/pyproject.toml index 5818a214a..9b880415d 100644 --- a/environments/langchain_deep_agents_wikispeedia/pyproject.toml +++ b/environments/langchain_deep_agents_wikispeedia/pyproject.toml @@ -5,7 +5,7 @@ tags = ["v1", "taskset", "harness", "multi-turn", "tool-use", "langchain", "deep version = "0.1.4" requires-python = ">=3.11,<3.13" dependencies = [ - "verifiers>=0.1.14", + "verifiers>=0.1.15.dev7", "datasets", "deepagents>=0.5.5", "langgraph", From 99ee9da05bf139de243dc582ac4f06b9bc4d80f4 Mon Sep 17 00:00:00 2001 From: Cooper Miller Date: Tue, 19 May 2026 15:34:26 -0700 Subject: [PATCH 4/5] fix metrics --- .../README.md | 5 +- .../langchain_deep_agents_wikispeedia.py | 59 ++++++- ..._lab2_langchain_deep_agents_wikispeedia.py | 144 ++++++++++++++++++ .../test_langchain_deep_agents_wikispeedia.py | 24 +++ 4 files changed, 227 insertions(+), 5 deletions(-) create mode 100644 tests/test_lab2_langchain_deep_agents_wikispeedia.py diff --git a/environments/langchain_deep_agents_wikispeedia/README.md b/environments/langchain_deep_agents_wikispeedia/README.md index aa51515ab..861f19a2c 100644 --- a/environments/langchain_deep_agents_wikispeedia/README.md +++ b/environments/langchain_deep_agents_wikispeedia/README.md @@ -74,7 +74,7 @@ prime eval run langchain-deep-agents-wikispeedia | `split_seed` | int | `0` | Seed for deterministic train/eval split. | | `links_only` | bool | `False` | Render articles as just the link menu (ablation: tests whether the agent navigates from semantic content or link names alone). | | `allow_go_back` | bool | `True` | Expose the `go_back` tool. | -| `max_turns` | int | `50` | Per-rollout turn cap. | +| `max_turns` | int | `50` | Per-rollout LangGraph recursion limit stored on each task row. This is not a literal model-turn count; Deep Agents may spend multiple graph steps per model/tool cycle. | | `efficiency_weight` | float | `0.0` | If `> 0`, mix `path_efficiency` into the reward at this weight (a near-optimal route earns up to `1 + efficiency_weight`; a wanderer that reaches the target still earns `1`). Default `0.0` keeps reward as pure binary reachability. | | `stratify_path_length` | bool | `True` | Take equal counts at each shortest-path bucket inside `[min_path_length, max_path_length]`, capped at the smallest non-empty bucket. The SNAP graph's natural distribution heavily skews toward the lower end of any band (4-6 → 83% sp=4); without stratification the policy over-trains on the trivial floor. Set `False` to recover the natural distribution. | @@ -82,7 +82,7 @@ prime eval run langchain-deep-agents-wikispeedia | Field | Type | Default | Description | | --- | ---- | ------- | ----------- | -| `max_turns` | int | `50` | LangChain recursion limit fallback when runtime config does not provide one. | +| `max_turns` | int | `50` | LangGraph recursion limit fallback when runtime config does not provide one. This is not directly correlated with model turns. | | `timeout_seconds` | float | `1200.0` | Per-rollout wall-clock cap. | ### Metrics @@ -103,3 +103,4 @@ prime eval run langchain-deep-agents-wikispeedia - Reward is `reached_target` only — exact, deterministic, no judge required. The deep-agent structural metrics are zero-weight so they show up in eval tables without shaping the policy. - `min_path_length=4, max_path_length=6` is the calibrated RL difficulty band for Nemotron-30B-A3B-BF16 — predicted ~0.3-0.4 reach rate, the useful-gradient zone. The 3-5 band landed at 0.61 mean reach (dominated by the trivial sp=3 floor where the deep-agent scaffolding is decorative); the 5-7 band landed at 0.13 with 27% timeouts. - This is the primary LangChain Deep Agents example because tool use is load-bearing: the model cannot reach the target without invoking `click_link`. +- `max_turns` is passed through to LangGraph as `recursion_limit`. It caps graph execution steps, not model calls, so the observed number of model/tool cycles can be lower than the configured value. diff --git a/environments/langchain_deep_agents_wikispeedia/langchain_deep_agents_wikispeedia.py b/environments/langchain_deep_agents_wikispeedia/langchain_deep_agents_wikispeedia.py index 826ed8071..8c801e556 100644 --- a/environments/langchain_deep_agents_wikispeedia/langchain_deep_agents_wikispeedia.py +++ b/environments/langchain_deep_agents_wikispeedia/langchain_deep_agents_wikispeedia.py @@ -50,6 +50,7 @@ def system_prompt(allow_go_back: bool = True) -> str: SYSTEM_PROMPT = system_prompt() ENV_ID = "langchain-deep-agents-wikispeedia" AGENT_NAME = "wikispeedia-navigator" +NAVIGATION_TOOL_CALLS_KEY = "navigation_tool_calls" class WikispeediaTasksetConfig(vf.TasksetConfig): @@ -94,6 +95,14 @@ def format_article(wiki: WikiGraph, article: str, links_only: bool = False) -> s return f"# {article}\n\n{text}\n\n---\nAvailable links: {links_str}" +def record_navigation_tool_call(state: vf.State, name: str, valid: bool) -> None: + calls = state.get(NAVIGATION_TOOL_CALLS_KEY) + if not isinstance(calls, list): + calls = [] + state[NAVIGATION_TOOL_CALLS_KEY] = calls + calls.append({"name": name, "valid": valid}) + + async def click_link(article: str, wiki: WikiGraph, state: vf.State) -> str: """Navigate to a linked Wikipedia article.""" links_only = bool(state.get("links_only", False)) @@ -101,11 +110,13 @@ async def click_link(article: str, wiki: WikiGraph, state: vf.State) -> str: available = wiki.get_links(current) normalized = wiki.normalize_name(article) if normalized is None or normalized not in available: + record_navigation_tool_call(state, "click_link", valid=False) avail_str = ", ".join(available) if available else "(none)" return ( f"'{article}' is not a valid link from '{current}'.\n" f"Available links: {avail_str}" ) + record_navigation_tool_call(state, "click_link", valid=True) state["current_article"] = normalized state["path"].append(normalized) if normalized == state["info"]["target"]: @@ -123,7 +134,9 @@ async def go_back(wiki: WikiGraph, state: vf.State) -> str: """Undo the last click_link and return to the previous article.""" path = state["path"] if len(path) <= 1: + record_navigation_tool_call(state, "go_back", valid=False) return "You are already at the starting article. Cannot go back." + record_navigation_tool_call(state, "go_back", valid=True) path.pop() state["current_article"] = path[-1] return format_article( @@ -167,7 +180,20 @@ async def agent_timeout(task: vf.Task, state: vf.State) -> float: return 1.0 if state.get("agent_timeout", False) else 0.0 -def iter_tool_calls(state: vf.State) -> Iterator[str]: +def has_navigation_tool_log(state: vf.State) -> bool: + return isinstance(state.get(NAVIGATION_TOOL_CALLS_KEY), list) + + +def iter_navigation_tool_calls(state: vf.State) -> Iterator[vf.ConfigMap]: + calls = state.get(NAVIGATION_TOOL_CALLS_KEY) + if not isinstance(calls, list): + return + for call in calls: + if isinstance(call, Mapping): + yield call + + +def iter_completion_tool_calls(state: vf.State) -> Iterator[str]: completion = state.get("completion") or [] messages = ( vf.get_messages(completion, role="assistant") @@ -183,9 +209,26 @@ def iter_tool_calls(state: vf.State) -> Iterator[str]: def count_tool_calls(state: vf.State, name: str | None = None) -> int: + if has_navigation_tool_log(state): + nav_count = sum( + 1 + for call in iter_navigation_tool_calls(state) + if name is None or call.get("name") == name + ) + if name in WIKISPEEDIA_TOOLS: + return nav_count + completion_count = sum( + 1 + for tool_name in iter_completion_tool_calls(state) + if tool_name not in WIKISPEEDIA_TOOLS + and (name is None or tool_name == name) + ) + return nav_count + completion_count if name is None: - return sum(1 for _ in iter_tool_calls(state)) - return sum(1 for tool_name in iter_tool_calls(state) if tool_name == name) + return sum(1 for _ in iter_completion_tool_calls(state)) + return sum( + 1 for tool_name in iter_completion_tool_calls(state) if tool_name == name + ) def make_tool_count_metric( @@ -236,6 +279,15 @@ async def assistant_turns(task: vf.Task, state: vf.State) -> float: async def invalid_link_rate(task: vf.Task, state: vf.State) -> float: + if has_navigation_tool_log(state): + click_calls = [ + call + for call in iter_navigation_tool_calls(state) + if call.get("name") == "click_link" + ] + invalid = sum(1 for call in click_calls if call.get("valid") is False) + return float(invalid / len(click_calls)) if click_calls else 0.0 + clicks = 0 invalid = 0 completion = state.get("completion") or [] @@ -422,6 +474,7 @@ async def run_langchain_deep_agents_wikispeedia_program( state["reached_target"] = False state["agent_timeout"] = False state["links_only"] = bool(task.get("links_only", False)) + state[NAVIGATION_TOOL_CALLS_KEY] = [] endpoint_config = state.get_endpoint_config(api="chat") model = ChatOpenAI( diff --git a/tests/test_lab2_langchain_deep_agents_wikispeedia.py b/tests/test_lab2_langchain_deep_agents_wikispeedia.py new file mode 100644 index 000000000..20a7fcd2e --- /dev/null +++ b/tests/test_lab2_langchain_deep_agents_wikispeedia.py @@ -0,0 +1,144 @@ +import importlib +import sys +import types +import uuid +from pathlib import Path + +import pytest + +import verifiers as vf + + +def load_lab2_module(monkeypatch: pytest.MonkeyPatch): + env_dir = ( + Path(__file__).parents[1] + / "lab2" + / "environments" + / "langchain_deep_agents_wikispeedia" + ) + monkeypatch.syspath_prepend(str(env_dir)) + sys.modules.pop("langchain_deep_agents_wikispeedia", None) + sys.modules.pop("wiki_graph", None) + return importlib.import_module("langchain_deep_agents_wikispeedia") + + +@pytest.mark.asyncio +async def test_lab2_deep_agents_program_passes_langsmith_config( + monkeypatch: pytest.MonkeyPatch, +) -> None: + module = load_lab2_module(monkeypatch) + + class GraphRecursionError(Exception): + pass + + class FakeState(dict): + def get_endpoint_config(self, api: str): + return { + "model": "model", + "api_base": "https://example.invalid/v1", + "api_key": "key", + } + + def get_tools(self): + return {} + + def get_max_turns(self, default: int): + return default + + def stop(self, reason: str): + self["stop_reason"] = reason + + class FakeChatOpenAI: + def __init__(self, **kwargs): + self.kwargs = kwargs + + class FakeAgent: + async def ainvoke(self, payload, config=None): + captured["payload"] = payload + captured["config"] = config + return {"messages": [{"role": "assistant", "content": "done"}]} + + captured: dict[str, object] = {} + created: dict[str, object] = {} + + def fake_create_deep_agent(**kwargs): + created.update(kwargs) + return FakeAgent() + + fake_deepagents = types.ModuleType("deepagents") + fake_langchain_openai = types.ModuleType("langchain_openai") + fake_langgraph = types.ModuleType("langgraph") + fake_langgraph_errors = types.ModuleType("langgraph.errors") + fake_langchain_core = types.ModuleType("langchain_core") + fake_tools_module = types.ModuleType("langchain_core.tools") + + fake_deepagents.create_deep_agent = fake_create_deep_agent + fake_langchain_openai.ChatOpenAI = FakeChatOpenAI + fake_langgraph_errors.GraphRecursionError = GraphRecursionError + fake_langgraph.errors = fake_langgraph_errors + fake_tools_module.tool = lambda func: func + fake_langchain_core.tools = fake_tools_module + monkeypatch.setitem(sys.modules, "deepagents", fake_deepagents) + monkeypatch.setitem(sys.modules, "langchain_openai", fake_langchain_openai) + monkeypatch.setitem(sys.modules, "langgraph", fake_langgraph) + monkeypatch.setitem(sys.modules, "langgraph.errors", fake_langgraph_errors) + monkeypatch.setitem(sys.modules, "langchain_core", fake_langchain_core) + monkeypatch.setitem(sys.modules, "langchain_core.tools", fake_tools_module) + + trajectory_id = "0123456789abcdef0123456789abcdef" + run_id = uuid.UUID(hex=trajectory_id) + program = module.make_langchain_deep_agents_program( + max_turns=12, + timeout_seconds=30, + ) + state = FakeState( + { + "trajectory_id": trajectory_id, + "runtime": {"group_key": "group-1"}, + "info": {"source": "A", "target": "B", "shortest_path": 2}, + "prompt": [{"role": "user", "content": "start"}], + } + ) + + result = await program({"task_id": "A->B"}, state) + + assert created["name"] == "wikispeedia-navigator" + assert captured["payload"] == {"messages": [{"role": "user", "content": "start"}]} + assert captured["config"] == { + "run_name": "wikispeedia:A->B", + "run_id": run_id, + "configurable": {"thread_id": trajectory_id}, + "metadata": { + "vf_env": "langchain-deep-agents-wikispeedia", + "vf_task_id": "A->B", + "vf_trajectory_id": trajectory_id, + "vf_group_key": "group-1", + "source": "A", + "target": "B", + "shortest_path": 2, + }, + "tags": ["verifiers", "vf-v1", "langchain-deep-agents-wikispeedia"], + "recursion_limit": 12, + } + assert result["langsmith_run_id"] == str(run_id) + assert result["completion"] == [{"role": "assistant", "content": "done"}] + + +@pytest.mark.asyncio +async def test_lab2_navigation_metrics_use_state_log_when_completion_empty( + monkeypatch: pytest.MonkeyPatch, +) -> None: + module = load_lab2_module(monkeypatch) + task = vf.Task({"prompt": [], "info": {"shortest_path": 1}}).freeze() + state = vf.State.for_task(task) + state["completion"] = [] + state[module.NAVIGATION_TOOL_CALLS_KEY] = [ + {"name": "click_link", "valid": False}, + {"name": "click_link", "valid": True}, + {"name": "go_back", "valid": True}, + ] + + assert await module.total_tool_calls(task, state) == 3.0 + assert await module.make_tool_count_metric("click_link")(task, state) == 2.0 + assert await module.make_tool_count_metric("go_back")(task, state) == 1.0 + assert await module.invalid_link_rate(task, state) == 0.5 diff --git a/tests/test_langchain_deep_agents_wikispeedia.py b/tests/test_langchain_deep_agents_wikispeedia.py index c0b85b244..a0abba978 100644 --- a/tests/test_langchain_deep_agents_wikispeedia.py +++ b/tests/test_langchain_deep_agents_wikispeedia.py @@ -234,6 +234,10 @@ async def test_wikispeedia_tools_resolve_through_v1_runtime( assert sorted(tools) == ["click_link", "go_back"] assert result.startswith("TARGET REACHED") assert state["reached_target"] is True + assert state[module.NAVIGATION_TOOL_CALLS_KEY] == [ + {"name": "click_link", "valid": True} + ] + assert await module.total_tool_calls(task, state) == 1.0 @pytest.mark.asyncio @@ -479,3 +483,23 @@ async def test_wikispeedia_tool_metrics_use_agent_completion( assert await module.total_tool_calls(task, state) == 1.0 assert await module.invalid_link_rate(task, state) == 1.0 + + +@pytest.mark.asyncio +async def test_wikispeedia_navigation_metrics_use_state_log_when_completion_empty( + monkeypatch: pytest.MonkeyPatch, +) -> None: + module = load_module(monkeypatch) + task = vf.Task({"prompt": [], "info": {"shortest_path": 1}}).freeze() + state = vf.State.for_task(task) + state["completion"] = [] + state[module.NAVIGATION_TOOL_CALLS_KEY] = [ + {"name": "click_link", "valid": False}, + {"name": "click_link", "valid": True}, + {"name": "go_back", "valid": True}, + ] + + assert await module.total_tool_calls(task, state) == 3.0 + assert await module.make_tool_count_metric("click_link")(task, state) == 2.0 + assert await module.make_tool_count_metric("go_back")(task, state) == 1.0 + assert await module.invalid_link_rate(task, state) == 0.5 From 0f5801197c6259fa197eb164b8db336b58459b26 Mon Sep 17 00:00:00 2001 From: Cooper Miller Date: Tue, 19 May 2026 15:35:17 -0700 Subject: [PATCH 5/5] rm test --- ..._lab2_langchain_deep_agents_wikispeedia.py | 144 ------------------ 1 file changed, 144 deletions(-) delete mode 100644 tests/test_lab2_langchain_deep_agents_wikispeedia.py diff --git a/tests/test_lab2_langchain_deep_agents_wikispeedia.py b/tests/test_lab2_langchain_deep_agents_wikispeedia.py deleted file mode 100644 index 20a7fcd2e..000000000 --- a/tests/test_lab2_langchain_deep_agents_wikispeedia.py +++ /dev/null @@ -1,144 +0,0 @@ -import importlib -import sys -import types -import uuid -from pathlib import Path - -import pytest - -import verifiers as vf - - -def load_lab2_module(monkeypatch: pytest.MonkeyPatch): - env_dir = ( - Path(__file__).parents[1] - / "lab2" - / "environments" - / "langchain_deep_agents_wikispeedia" - ) - monkeypatch.syspath_prepend(str(env_dir)) - sys.modules.pop("langchain_deep_agents_wikispeedia", None) - sys.modules.pop("wiki_graph", None) - return importlib.import_module("langchain_deep_agents_wikispeedia") - - -@pytest.mark.asyncio -async def test_lab2_deep_agents_program_passes_langsmith_config( - monkeypatch: pytest.MonkeyPatch, -) -> None: - module = load_lab2_module(monkeypatch) - - class GraphRecursionError(Exception): - pass - - class FakeState(dict): - def get_endpoint_config(self, api: str): - return { - "model": "model", - "api_base": "https://example.invalid/v1", - "api_key": "key", - } - - def get_tools(self): - return {} - - def get_max_turns(self, default: int): - return default - - def stop(self, reason: str): - self["stop_reason"] = reason - - class FakeChatOpenAI: - def __init__(self, **kwargs): - self.kwargs = kwargs - - class FakeAgent: - async def ainvoke(self, payload, config=None): - captured["payload"] = payload - captured["config"] = config - return {"messages": [{"role": "assistant", "content": "done"}]} - - captured: dict[str, object] = {} - created: dict[str, object] = {} - - def fake_create_deep_agent(**kwargs): - created.update(kwargs) - return FakeAgent() - - fake_deepagents = types.ModuleType("deepagents") - fake_langchain_openai = types.ModuleType("langchain_openai") - fake_langgraph = types.ModuleType("langgraph") - fake_langgraph_errors = types.ModuleType("langgraph.errors") - fake_langchain_core = types.ModuleType("langchain_core") - fake_tools_module = types.ModuleType("langchain_core.tools") - - fake_deepagents.create_deep_agent = fake_create_deep_agent - fake_langchain_openai.ChatOpenAI = FakeChatOpenAI - fake_langgraph_errors.GraphRecursionError = GraphRecursionError - fake_langgraph.errors = fake_langgraph_errors - fake_tools_module.tool = lambda func: func - fake_langchain_core.tools = fake_tools_module - monkeypatch.setitem(sys.modules, "deepagents", fake_deepagents) - monkeypatch.setitem(sys.modules, "langchain_openai", fake_langchain_openai) - monkeypatch.setitem(sys.modules, "langgraph", fake_langgraph) - monkeypatch.setitem(sys.modules, "langgraph.errors", fake_langgraph_errors) - monkeypatch.setitem(sys.modules, "langchain_core", fake_langchain_core) - monkeypatch.setitem(sys.modules, "langchain_core.tools", fake_tools_module) - - trajectory_id = "0123456789abcdef0123456789abcdef" - run_id = uuid.UUID(hex=trajectory_id) - program = module.make_langchain_deep_agents_program( - max_turns=12, - timeout_seconds=30, - ) - state = FakeState( - { - "trajectory_id": trajectory_id, - "runtime": {"group_key": "group-1"}, - "info": {"source": "A", "target": "B", "shortest_path": 2}, - "prompt": [{"role": "user", "content": "start"}], - } - ) - - result = await program({"task_id": "A->B"}, state) - - assert created["name"] == "wikispeedia-navigator" - assert captured["payload"] == {"messages": [{"role": "user", "content": "start"}]} - assert captured["config"] == { - "run_name": "wikispeedia:A->B", - "run_id": run_id, - "configurable": {"thread_id": trajectory_id}, - "metadata": { - "vf_env": "langchain-deep-agents-wikispeedia", - "vf_task_id": "A->B", - "vf_trajectory_id": trajectory_id, - "vf_group_key": "group-1", - "source": "A", - "target": "B", - "shortest_path": 2, - }, - "tags": ["verifiers", "vf-v1", "langchain-deep-agents-wikispeedia"], - "recursion_limit": 12, - } - assert result["langsmith_run_id"] == str(run_id) - assert result["completion"] == [{"role": "assistant", "content": "done"}] - - -@pytest.mark.asyncio -async def test_lab2_navigation_metrics_use_state_log_when_completion_empty( - monkeypatch: pytest.MonkeyPatch, -) -> None: - module = load_lab2_module(monkeypatch) - task = vf.Task({"prompt": [], "info": {"shortest_path": 1}}).freeze() - state = vf.State.for_task(task) - state["completion"] = [] - state[module.NAVIGATION_TOOL_CALLS_KEY] = [ - {"name": "click_link", "valid": False}, - {"name": "click_link", "valid": True}, - {"name": "go_back", "valid": True}, - ] - - assert await module.total_tool_calls(task, state) == 3.0 - assert await module.make_tool_count_metric("click_link")(task, state) == 2.0 - assert await module.make_tool_count_metric("go_back")(task, state) == 1.0 - assert await module.invalid_link_rate(task, state) == 0.5