From 6f27af2528dd4f229d7625aed790ef584b0171aa Mon Sep 17 00:00:00 2001 From: miguel Date: Fri, 15 May 2026 13:44:16 -0700 Subject: [PATCH 01/27] feat(verifier): record agent trajectories --- .../core/lib/v3/agent/AnthropicCUAClient.ts | 4 + .../v3/agent/utils/captureAriaTreeProbe.ts | 75 +++ .../core/lib/v3/handlers/v3AgentHandler.ts | 109 ++++ .../core/lib/v3/handlers/v3CuaAgentHandler.ts | 160 +++++- .../core/lib/v3/types/public/busEvents.ts | 108 ++++ packages/core/lib/v3/types/public/index.ts | 1 + .../evals/framework/trajectoryRecorder.ts | 507 ++++++++++++++++++ .../scripts/verify-trajectory-recorder.ts | 230 ++++++++ 8 files changed, 1192 insertions(+), 2 deletions(-) create mode 100644 packages/core/lib/v3/agent/utils/captureAriaTreeProbe.ts create mode 100644 packages/core/lib/v3/types/public/busEvents.ts create mode 100644 packages/evals/framework/trajectoryRecorder.ts create mode 100644 packages/evals/scripts/verify-trajectory-recorder.ts diff --git a/packages/core/lib/v3/agent/AnthropicCUAClient.ts b/packages/core/lib/v3/agent/AnthropicCUAClient.ts index 752d208e22..54d64f15d0 100644 --- a/packages/core/lib/v3/agent/AnthropicCUAClient.ts +++ b/packages/core/lib/v3/agent/AnthropicCUAClient.ts @@ -902,6 +902,10 @@ export class AnthropicCUAClient extends AgentClient { ...input, }; } else if (action === "triple_click" || action === "tripleClick") { + // Anthropic's computer_20250124 tool emits `triple_click` with + // `coordinate: [x, y]`. Without this branch the snake_case name + + // raw coordinate array fall through to the generic `else` and + // executeAction logs "Unknown action type: triple_click". 
return { type: "tripleClick", x: diff --git a/packages/core/lib/v3/agent/utils/captureAriaTreeProbe.ts b/packages/core/lib/v3/agent/utils/captureAriaTreeProbe.ts new file mode 100644 index 0000000000..8e3fcc050b --- /dev/null +++ b/packages/core/lib/v3/agent/utils/captureAriaTreeProbe.ts @@ -0,0 +1,75 @@ +/** + * captureAriaTreeProbe — capture a truncated accessibility tree of the active + * page for use as tier-2 evidence in the trajectory recorder. + * + * Shared by v3AgentHandler and v3CuaAgentHandler. Listener-gated by the + * callers so ordinary agent runs (no TrajectoryRecorder attached) don't pay + * the cost. + * + * The a11y tree is the same payload the agent's `ariaTree` tool sees, but + * captured by the harness (not the agent) so the verifier has independent + * textual ground truth for grounding non-visual claims — prices, names, + * dates, list contents — without OCR'ing screenshots. + * + * Budget: defaults to ~8000 tokens (32k chars). Per-step a11y captures + * across a ~30-step trajectory at that cap sum to ~240k tokens total, + * which the verifier handles via per-criterion top-K selection. The cap + * is configurable via VERIFIER_ARIATREE_TOKEN_BUDGET so consumers can + * trade RAM/disk for fidelity. Truncated content is marked explicitly so + * the verifier knows it was clipped. + */ +import type { V3 } from "../../v3.js"; + +const APPROX_CHARS_PER_TOKEN = 4; +const DEFAULT_TOKEN_BUDGET = 8_000; +const DEFAULT_TIMEOUT_MS = 5_000; + +interface CaptureAriaTreeOptions { + /** Soft cap on token count (chars/4 approximation). Default 8000. */ + tokenBudget?: number; + /** Hard timeout on the capture. Default 5s. */ + timeoutMs?: number; +} + +/** + * Returns the truncated a11y tree as a plain string, or undefined when + * capture fails. Never throws — a11y capture is best-effort tier-2 evidence, + * not a hard requirement, so failures are silently absorbed (the verifier + * surfaces this via evidence_insufficient). 
+ */ +export async function captureAriaTreeProbe( + v3: V3, + opts: CaptureAriaTreeOptions = {}, +): Promise { + const envBudget = parseInt( + process.env.VERIFIER_ARIATREE_TOKEN_BUDGET ?? "", + 10, + ); + const tokenBudget = + opts.tokenBudget ?? + (Number.isFinite(envBudget) && envBudget > 0 + ? envBudget + : DEFAULT_TOKEN_BUDGET); + const timeoutMs = opts.timeoutMs ?? DEFAULT_TIMEOUT_MS; + const maxChars = tokenBudget * APPROX_CHARS_PER_TOKEN; + + try { + // v3.extract() without a schema returns { pageText } where pageText is the + // rendered accessibility tree — same path the agent's ariaTree tool uses. + const result = (await v3.extract({ timeout: timeoutMs })) as { + pageText?: string; + }; + const pageText = result?.pageText; + if (typeof pageText !== "string" || pageText.length === 0) return undefined; + + if (pageText.length > maxChars) { + return ( + pageText.slice(0, maxChars) + + `\n\n[CONTENT TRUNCATED at ~${tokenBudget} tokens — set VERIFIER_ARIATREE_TOKEN_BUDGET to raise]` + ); + } + return pageText; + } catch { + return undefined; + } +} diff --git a/packages/core/lib/v3/handlers/v3AgentHandler.ts b/packages/core/lib/v3/handlers/v3AgentHandler.ts index cff08c8a28..d0308bdd8a 100644 --- a/packages/core/lib/v3/handlers/v3AgentHandler.ts +++ b/packages/core/lib/v3/handlers/v3AgentHandler.ts @@ -41,6 +41,7 @@ import { AgentAbortError, } from "../types/public/sdkErrors.js"; import { handleDoneToolCall } from "../agent/utils/handleDoneToolCall.js"; +import { captureAriaTreeProbe } from "../agent/utils/captureAriaTreeProbe.js"; import { CaptchaSolver, CAPTCHA_SOLVED_MSG, @@ -248,6 +249,10 @@ export class V3AgentHandler { | GenerateTextOnStepFinishCallback | StreamTextOnStepFinishCallback, ) { + // Monotonic step counter scoped to this execute() call. Each tool call in + // the agent loop becomes one trajectory step. The counter feeds stepIndex + // on the bus events the TrajectoryRecorder subscribes to. 
+ let stepCounter = 0; return async (event: StepResult) => { this.logger({ category: "agent", @@ -255,6 +260,11 @@ export class V3AgentHandler { level: 2, }); + const stepIndicesInTurn: number[] = []; + let lastFinalAnswer: + | { message: string; output?: Record } + | undefined; + if (event.toolCalls && event.toolCalls.length > 0) { for (let i = 0; i < event.toolCalls.length; i++) { const toolCall = event.toolCalls[i]; @@ -279,6 +289,13 @@ export class V3AgentHandler { ? `${allReasoning} ${doneReasoning}`.trim() : allReasoning || "Task completed successfully"; } + lastFinalAnswer = { + message: state.finalMessage, + output: + typeof args?.output === "object" && args?.output !== null + ? (args.output as Record) + : undefined, + }; } const mappedActions = mapToolResultToActions({ toolCallName: toolCall.toolName, @@ -292,8 +309,100 @@ export class V3AgentHandler { action.timestamp = Date.now(); state.actions.push(action); } + + // Emit step_finished_event per tool call. The TrajectoryRecorder + // builds one Trajectory.Step per emission. tier-1 evidence (the + // bytes the LLM consumed) is captured separately via an + // onStepFinish wrapper in the harness (plan §10 Q1). + const stepIndex = stepCounter++; + stepIndicesInTurn.push(stepIndex); + const toolOk = + !toolResult || + (typeof toolResult === "object" && + !("error" in toolResult) && + !("isError" in toolResult && toolResult.isError)); + this.v3.bus.emit("agent_step_finished_event", { + stepIndex, + actionName: toolCall.toolName, + actionArgs: + typeof args === "object" && args !== null + ? (args as Record) + : {}, + reasoning: event.text ?? "", + toolOutput: { + ok: toolOk, + result: toolResult, + error: + toolResult && + typeof toolResult === "object" && + "error" in toolResult && + typeof (toolResult as { error?: unknown }).error === "string" + ? 
(toolResult as { error: string }).error + : undefined, + }, + finishedAt: new Date().toISOString(), + }); } state.currentPageUrl = (await this.v3.context.awaitActivePage()).url(); + + // Harness probe — take a single screenshot / a11y snapshot per AI SDK + // step and attach it to every tool call in that turn. The observation + // reflects the settled page state after the batch of tool calls; this + // is more faithful than dropping probe evidence for all but the last + // tool call, while still avoiding per-tool screenshot overhead. + const wantsScreenshotProbe = + this.v3.bus.listenerCount("agent_screenshot_taken_event") > 0; + const wantsStepObservation = + this.v3.bus.listenerCount("agent_step_observed_event") > 0; + if ( + stepIndicesInTurn.length > 0 && + (wantsScreenshotProbe || wantsStepObservation) + ) { + try { + const page = await this.v3.context.awaitActivePage(); + let screenshot: Buffer | undefined; + if (wantsScreenshotProbe) { + screenshot = await page.screenshot({ fullPage: false }); + } + let ariaTree: string | undefined; + if (wantsStepObservation) { + // Capture the a11y tree alongside the URL probe so the verifier + // can ground textual claims (prices, names, dates) without OCR. + // Best-effort: returns undefined on failure/timeout. + ariaTree = await captureAriaTreeProbe(this.v3); + } + for (const stepIndex of stepIndicesInTurn) { + if (screenshot) { + // DOM/hybrid: this post-step screenshot is a harness probe + // only. The agent's tier-1 evidence is the tool's return value + // captured separately in agent_step_finished_event. 
+ this.v3.bus.emit("agent_screenshot_taken_event", { + stepIndex, + screenshot, + url: state.currentPageUrl, + evidenceRole: "probe", + }); + } + if (wantsStepObservation) { + this.v3.bus.emit("agent_step_observed_event", { + stepIndex, + url: state.currentPageUrl, + ariaTree, + }); + } + } + } catch (e) { + this.logger({ + category: "agent", + message: `Warning: harness probe failed: ${getErrorMessage(e)}`, + level: 1, + }); + } + } + } + + if (lastFinalAnswer) { + this.v3.bus.emit("agent_final_answer_event", lastFinalAnswer); } if (userCallback) { diff --git a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts index af3a3dad87..f1dd2666e6 100644 --- a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts +++ b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts @@ -7,6 +7,7 @@ import { GoogleCUAClient } from "../agent/GoogleCUAClient.js"; import { OpenAICUAClient } from "../agent/OpenAICUAClient.js"; import { mapKeyToPlaywright } from "../agent/utils/cuaKeyMapping.js"; import { ensureXPath } from "../agent/utils/xpath.js"; +import { captureAriaTreeProbe } from "../agent/utils/captureAriaTreeProbe.js"; import { ActionExecutionResult, AgentAction, @@ -16,6 +17,7 @@ import { SafetyConfirmationHandler, } from "../types/public/agent.js"; import { LogLine } from "../types/public/logs.js"; +import type { AgentScreenshotTakenEvent } from "../types/public/busEvents.js"; import { type Action, V3FunctionName } from "../types/public/methods.js"; import { FlowLogger } from "../flowlogger/FlowLogger.js"; import { toTitleCase } from "../../utils.js"; @@ -37,6 +39,13 @@ export class V3CuaAgentHandler { private captchaSolver: CaptchaSolver | null = null; private captchaClickGuardRemaining = 0; private currentInstruction = ""; + // Monotonic step counter used by bus events. 
Every emitCuaScreenshot call + // claims the next index; emitCuaActionStep then reuses that index for the + // action that follows, or allocates a fresh one when the latest screenshot + // was already consumed, so screenshots and steps share one sequence. + private cuaStepCounter = 0; + private latestCuaScreenshot?: AgentScreenshotTakenEvent; + private latestCuaScreenshotConsumed = true; constructor( v3: V3, @@ -76,6 +85,17 @@ export class V3CuaAgentHandler { this.ensureNotClosed(); const page = await this.v3.context.awaitActivePage(); const screenshotBuffer = await page.screenshot({ fullPage: false }); + + // Emit bus event so TrajectoryRecorder can capture the screenshot. In + // CUA mode this is the same buffer the provider receives, so it is + // tagged "agent" (tier 1: what the model saw); the tier-2 probe is taken + // post-action by emitCuaActionStep. See plan §04 "Mode-by-mode sources". + try { + this.emitCuaScreenshot(screenshotBuffer, page.url()); + } catch { + // bus emit errors are non-fatal + } + return screenshotBuffer.toString("base64"); // base64 png }); @@ -120,6 +140,7 @@ export class V3CuaAgentHandler { (this.options.clientOptions?.waitBetweenActions as number) || defaultDelay; try { + let executionResult: ActionExecutionResult | undefined; // Try to inject cursor before each action if enabled if (this.highlightCursor) { try { @@ -133,7 +154,7 @@ export class V3CuaAgentHandler { // takes its own screenshot via screenshotProvider between API turns. const shouldLog = action.type !== "screenshot"; if (shouldLog) { - await FlowLogger.runWithLogging( + executionResult = await FlowLogger.runWithLogging( { eventType: `V3Cua${toTitleCase(action.type)}`, // e.g. 
"V3CuaClick" data: { @@ -145,10 +166,13 @@ export class V3CuaAgentHandler { [action], ); } else { - await this.executeAction(action); + executionResult = await this.executeAction(action); } action.timestamp = Date.now(); + if (shouldLog) { + await this.emitCuaActionStep(action, executionResult); + } await new Promise((r) => setTimeout(r, waitBetween)); } catch (error) { @@ -658,6 +682,15 @@ export class V3CuaAgentHandler { const screenshotBuffer = await page.screenshot({ fullPage: false }); const currentUrl = page.url(); + + // Mirror the screenshot to the bus — same buffer the CUA client + // received, so it serves as both tier-1 evidence and tier-2 probe. + try { + this.emitCuaScreenshot(screenshotBuffer, currentUrl); + } catch { + // non-fatal + } + return await this.agentClient.captureScreenshot({ base64Image: screenshotBuffer.toString("base64"), currentUrl, @@ -767,6 +800,129 @@ export class V3CuaAgentHandler { } } + /** + * Emit a pre-action CUA screenshot — the exact buffer the model received + * as input. Tier-1 evidence (agent-mirrored); the tier-2 probe is taken + * separately in emitCuaActionStep after the action runs, so the recorder + * can compare what the model saw against what the page actually showed + * once the keystrokes/clicks landed. + */ + private emitCuaScreenshot( + screenshot: Buffer, + url: string, + ): AgentScreenshotTakenEvent { + const event: AgentScreenshotTakenEvent = { + stepIndex: this.cuaStepCounter++, + screenshot, + url, + evidenceRole: "agent", + }; + this.latestCuaScreenshot = event; + this.latestCuaScreenshotConsumed = false; + this.v3.bus.emit("agent_screenshot_taken_event", event); + return event; + } + + private async emitCuaActionStep( + action: AgentAction, + result: ActionExecutionResult | undefined, + ): Promise { + let pageUrl = + typeof action.pageUrl === "string" + ? 
action.pageUrl + : this.latestCuaScreenshot?.url; + try { + pageUrl = (await this.v3.context.awaitActivePage()).url(); + } catch { + // Keep the best pre-action URL fallback. + } + let stepIndex: number; + + if (this.latestCuaScreenshot && !this.latestCuaScreenshotConsumed) { + stepIndex = this.latestCuaScreenshot.stepIndex; + this.latestCuaScreenshotConsumed = true; + } else if (this.latestCuaScreenshot) { + stepIndex = this.cuaStepCounter++; + this.v3.bus.emit("agent_screenshot_taken_event", { + ...this.latestCuaScreenshot, + stepIndex, + }); + } else { + stepIndex = this.cuaStepCounter++; + } + + const actionArgs = Object.fromEntries( + Object.entries(action).filter(([key]) => key !== "screenshot"), + ); + const reasoning = + typeof action.reasoning === "string" + ? action.reasoning + : typeof action.action === "string" + ? action.action + : ""; + + this.v3.bus.emit("agent_step_finished_event", { + stepIndex, + actionName: String(action.type), + actionArgs, + reasoning, + toolOutput: { + ok: result?.success !== false, + result: result ?? { success: true }, + error: result?.error, + }, + finishedAt: new Date().toISOString(), + }); + + // Post-action tier-2 probe. The pre-action screenshot from + // screenshotProvider is what the model SAW; this one shows what the + // page actually LOOKS LIKE after the action ran. Without this the + // verifier has no visual evidence that keystrokes/clicks landed, and + // has to trust the action history alone. + // + // Listener-gated to keep ordinary agent runs free of the extra + // screenshot cost — mirrors v3AgentHandler's post-step probe. 
+ const wantsScreenshotProbe = + this.v3.bus.listenerCount("agent_screenshot_taken_event") > 0; + const wantsStepObservation = + this.v3.bus.listenerCount("agent_step_observed_event") > 0; + let probeUrl = pageUrl; + if (wantsScreenshotProbe || wantsStepObservation) { + try { + const page = await this.v3.context.awaitActivePage(); + probeUrl = page.url(); + if (wantsScreenshotProbe) { + const probeScreenshot = await page.screenshot({ fullPage: false }); + this.v3.bus.emit("agent_screenshot_taken_event", { + stepIndex, + screenshot: probeScreenshot, + url: probeUrl, + evidenceRole: "probe", + }); + } + } catch (e) { + this.logger({ + category: "agent", + message: `Warning: CUA post-action probe failed: ${ + e instanceof Error ? e.message : String(e) + }`, + level: 1, + }); + } + } + + if (probeUrl && wantsStepObservation) { + // Capture the a11y tree alongside the URL probe so the verifier can + // ground textual claims without OCR. Best-effort. + const ariaTree = await captureAriaTreeProbe(this.v3); + this.v3.bus.emit("agent_step_observed_event", { + stepIndex, + url: probeUrl, + ariaTree, + }); + } + } + private async injectCursor(): Promise { try { const page = await this.v3.context.awaitActivePage(); diff --git a/packages/core/lib/v3/types/public/busEvents.ts b/packages/core/lib/v3/types/public/busEvents.ts new file mode 100644 index 0000000000..62e9929492 --- /dev/null +++ b/packages/core/lib/v3/types/public/busEvents.ts @@ -0,0 +1,108 @@ +/** + * Bus event payloads emitted by V3 on `v3.bus`. + * + * The bus is an EventEmitter; these types document the payload shape per + * event name so consumers (TrajectoryRecorder in packages/evals, custom + * subscribers) can type their handlers. 
+ * + * Wave 0 of the verifier rewrite plan introduces: + * - agent_screenshot_taken_event — independent post-step screenshot probe + * - agent_step_finished_event — fired per tool-call in a step result + * - agent_step_observed_event — fired after the harness probe completes + * - agent_final_answer_event — fired when the `done` tool resolves + * + * `agent_step_started_event` is documented in the plan but deferred — the AI + * SDK's `onStepFinish` is a post-hook, and there's no symmetric pre-hook per + * tool execution in v3AgentHandler today. Started-state can be derived from + * the finished event's stepIndex if needed. + */ + +/** + * Names of bus events the agent handlers emit. Use these constants to + * subscribe; the bus accepts arbitrary strings, but a centralized list helps + * catch typos at the call site. + */ +export const BUS_EVENTS = { + AGENT_SCREENSHOT_TAKEN: "agent_screenshot_taken_event", + AGENT_STEP_FINISHED: "agent_step_finished_event", + AGENT_STEP_OBSERVED: "agent_step_observed_event", + AGENT_FINAL_ANSWER: "agent_final_answer_event", +} as const; + +export type BusEventName = (typeof BUS_EVENTS)[keyof typeof BUS_EVENTS]; + +/** + * Payload for `agent_screenshot_taken_event`. The raw screenshot Buffer the + * harness took after a step's tool execution. + * + * Note: in CUA mode the same Buffer is also what the provider received; in + * DOM/hybrid mode it's an independent harness probe. The verifier treats them + * as different evidence tiers regardless — see plan §04 ("Mode-by-mode sources"). + */ +export interface AgentScreenshotTakenEvent { + /** Zero-based index of the step this screenshot corresponds to. */ + stepIndex: number; + /** PNG bytes from page.screenshot(). */ + screenshot: Buffer; + /** Page URL at the time of capture. */ + url: string; + /** + * Evidence role for this screenshot. + * + * DOM/hybrid post-tool screenshots are probe-only. 
CUA pre-action shots are + * the exact image bytes sent to the provider and are tagged "agent" (tier 1); + * the CUA post-action probe is a separate "probe" (tier 2) emission. + */ + evidenceRole?: "probe" | "agent" | "agent_and_probe"; +} + +/** + * Payload for `agent_step_finished_event`. Emitted once per tool call within + * a step result. Carries the tool's reported outcome and a reference to the + * agent's textual reasoning for the step. + * + * Tier 1 evidence (the bytes the LLM consumed as the tool result) is captured + * separately by the harness via an AgentExecuteCallbacks.onStepFinish wrapper + * — not in this payload. See plan §10 Q1 (resolved: onStepFinish). + */ +export interface AgentStepFinishedEvent { + stepIndex: number; + /** Name of the tool that ran (e.g., "act", "extract", "click"). */ + actionName: string; + /** Arguments passed to the tool. */ + actionArgs: Record; + /** Agent's textual reasoning (event.text on the AI SDK StepResult). */ + reasoning: string; + /** Outcome of the tool execution as seen by the harness. */ + toolOutput: { + ok: boolean; + /** The tool's native return value. */ + result: unknown; + error?: string; + }; + /** ISO 8601 timestamp at which the step finished. */ + finishedAt: string; +} + +/** + * Payload for `agent_step_observed_event`. Emitted after the harness probe + * completes for a step (page URL always; ariaTree when the best-effort a11y + * capture succeeds; no emit site in this patch sets scroll). + */ +export interface AgentStepObservedEvent { + stepIndex: number; + /** Page URL after the step's tool execution. */ + url: string; + /** v1 — accessibility tree snapshot. */ + ariaTree?: string; + /** v1 — viewport scroll context. */ + scroll?: { top: number; pageHeight: number }; +} + +/** Payload for `agent_final_answer_event`. Emitted when the `done` tool resolves. */ +export interface AgentFinalAnswerEvent { + /** The agent's final summary message. */ + message: string; + /** Optional structured output if the agent's `output` schema was set. 
*/ + output?: Record; +} diff --git a/packages/core/lib/v3/types/public/index.ts b/packages/core/lib/v3/types/public/index.ts index 9c5df08d01..9bf24eb271 100644 --- a/packages/core/lib/v3/types/public/index.ts +++ b/packages/core/lib/v3/types/public/index.ts @@ -1,4 +1,5 @@ export * from "./agent.js"; +export * from "./busEvents.js"; // Export api.ts under namespace to avoid conflicts with methods.ts types export * as Api from "./api.js"; // Also export BrowserbaseRegion directly for convenience diff --git a/packages/evals/framework/trajectoryRecorder.ts b/packages/evals/framework/trajectoryRecorder.ts new file mode 100644 index 0000000000..2b7f24b529 --- /dev/null +++ b/packages/evals/framework/trajectoryRecorder.ts @@ -0,0 +1,507 @@ +/** + * TrajectoryRecorder — subscribes to v3.bus step events emitted by the agent + * handlers (v3AgentHandler / v3CuaAgentHandler) and assembles a Trajectory + * the verifier can consume. + * + * Lifecycle: + * const recorder = new TrajectoryRecorder({ v3, taskSpec }); + * recorder.start(); + * await agent.execute(...); + * const trajectory = await recorder.finish({ status: "complete", usage }); + * + * Persistence is env-gated by `VERIFIER_PERSIST_TRAJECTORIES` (plan §10 Q2): + * - unset: persistence follows the default (on locally, off in CI). + * - "1" / "true": always persist. + * - "0" / "false": never persist. + * + * On-disk layout matches microsoft/fara's example_trajectory/ so we can + * cross-validate against verify_trajectories.py without format conversion. 
+ * + * @see ~/.claude/plans/verifier-rewrite.html §06 (Trajectory on-disk) + */ +import fs from "node:fs/promises"; +import path from "node:path"; +import type { + AgentEvidence, + AgentFinalAnswerEvent, + AgentScreenshotTakenEvent, + AgentStepFinishedEvent, + AgentStepObservedEvent, + ProbeEvidence, + TaskSpec, + Trajectory, + TrajectoryStatus, + TrajectoryStep, + TrajectoryUsage, + Verdict, + V3, +} from "@browserbasehq/stagehand"; + +interface PartialStep { + index: number; + actionName: string; + actionArgs: Record; + reasoning: string; + agentEvidence: AgentEvidence; + probeEvidence: ProbeEvidence; + toolOutput: { ok: boolean; result: unknown; error?: string }; + finishedAt: string; +} + +export interface TrajectoryRecorderOptions { + v3: V3; + taskSpec: TaskSpec; + /** + * Root directory under which trajectory dirs are written. Each task run + * gets a subdirectory named by runId/task.id. + * Defaults to `/.trajectories`. + */ + outputRoot?: string; + /** Run identifier (e.g., ISO timestamp + env). Defaults to a fresh timestamp. */ + runId?: string; + /** + * Override the env-gated persistence default. `true` always persists, + * `false` never does, `undefined` defers to VERIFIER_PERSIST_TRAJECTORIES. + */ + persist?: boolean; +} + +export interface TrajectoryFinishOptions { + status: TrajectoryStatus; + finalAnswer?: string; + usage?: Partial; +} + +const ZERO_USAGE: TrajectoryUsage = { + input_tokens: 0, + output_tokens: 0, +}; + +/** + * Decide whether to persist by default. Honors the explicit override first, + * then env, then falls back to "persist when not in CI". 
+ */ +function shouldPersist(override: boolean | undefined): boolean { + if (override !== undefined) return override; + const env = process.env.VERIFIER_PERSIST_TRAJECTORIES?.toLowerCase(); + if (env === "1" || env === "true") return true; + if (env === "0" || env === "false") return false; + return !process.env.CI; +} + +export class TrajectoryRecorder { + private readonly v3: V3; + private readonly taskSpec: TaskSpec; + private readonly runId: string; + private readonly outputDir: string; + private readonly persistEnabled: boolean; + + // Per-stepIndex builders. Events for one step are NOT delivered atomically: + // the handlers await page probes between emits, and CUA emits its agent + // screenshot before step_finished, so every handler uses ensurePartial(). + private readonly partialSteps = new Map>(); + private readonly observationByStep = new Map< + number, + AgentStepObservedEvent + >(); + private readonly screenshotsByStep = new Map< + number, + AgentScreenshotTakenEvent + >(); + private finalAnswerEvent?: AgentFinalAnswerEvent; + private startedAt = ""; + private endedAt = ""; + private listenersAttached = false; + + // Strongly-typed bound handlers so we can attach/detach the same references. + private readonly onScreenshot = (e: AgentScreenshotTakenEvent) => { + this.screenshotsByStep.set(e.stepIndex, e); + const partial = this.ensurePartial(e.stepIndex); + + // Default to "probe" when the emit site doesn't tag the role — matches + // v3AgentHandler's post-step screenshot, which is always a tier-2 probe. + const role = e.evidenceRole ?? "probe"; + + // Probe channel (tier 2): the page's state at observation time. For CUA + // the pre-action screenshot is NOT a probe — that role is filled by the + // post-action emit from emitCuaActionStep. So only update probe.screenshot + // when the event explicitly carries the probe role. + if (role === "probe" || role === "agent_and_probe") { + const probe: ProbeEvidence = { ...(partial.probeEvidence ?? 
{}) }; + probe.screenshot = e.screenshot; + probe.url = e.url; + partial.probeEvidence = probe; + } else if (!partial.probeEvidence?.url) { + // Even for tier-1-only events, the URL is useful probe context if we + // don't have one yet. Doesn't overwrite a later post-action URL. + partial.probeEvidence = { + ...(partial.probeEvidence ?? {}), + url: e.url, + }; + } + + // Agent channel (tier 1): bytes the model ingested. + if (role === "agent" || role === "agent_and_probe") { + partial.agentEvidence = mergeAgentEvidence(partial.agentEvidence, { + modalities: [ + { type: "image", bytes: e.screenshot, mediaType: "image/png" }, + ], + }); + } + }; + private readonly onStepFinished = (e: AgentStepFinishedEvent) => { + const partial = this.ensurePartial(e.stepIndex); + partial.actionName = e.actionName; + partial.actionArgs = e.actionArgs; + partial.reasoning = e.reasoning; + partial.toolOutput = e.toolOutput; + partial.finishedAt = e.finishedAt; + partial.agentEvidence = mergeAgentEvidence( + partial.agentEvidence, + buildAgentEvidence(e), + ); + }; + private readonly onStepObserved = (e: AgentStepObservedEvent) => { + this.observationByStep.set(e.stepIndex, e); + const partial = this.ensurePartial(e.stepIndex); + const probe: ProbeEvidence = { ...(partial.probeEvidence ?? {}) }; + probe.url = e.url; + if (e.ariaTree !== undefined) probe.ariaTree = e.ariaTree; + if (e.scroll !== undefined) probe.scroll = e.scroll; + partial.probeEvidence = probe; + }; + private readonly onFinalAnswer = (e: AgentFinalAnswerEvent) => { + this.finalAnswerEvent = e; + }; + + constructor(opts: TrajectoryRecorderOptions) { + this.v3 = opts.v3; + this.taskSpec = opts.taskSpec; + this.runId = + opts.runId ?? + new Date().toISOString().replace(/[:.]/g, "-").replace("T", "T"); + const root = opts.outputRoot ?? 
path.join(process.cwd(), ".trajectories"); + this.outputDir = path.join(root, this.runId, opts.taskSpec.id); + this.persistEnabled = shouldPersist(opts.persist); + } + + /** Subscribe to bus events. Call once before agent.execute(). */ + start(): void { + if (this.listenersAttached) return; + this.startedAt = new Date().toISOString(); + this.v3.bus.on("agent_screenshot_taken_event", this.onScreenshot); + this.v3.bus.on("agent_step_finished_event", this.onStepFinished); + this.v3.bus.on("agent_step_observed_event", this.onStepObserved); + this.v3.bus.on("agent_final_answer_event", this.onFinalAnswer); + this.listenersAttached = true; + } + + /** + * Detach listeners, assemble the Trajectory, and (if persistence is on) + * write the on-disk layout. Idempotent. + */ + async finish(opts: TrajectoryFinishOptions): Promise { + this.detach(); + this.endedAt = new Date().toISOString(); + + const steps = this.assembleSteps(); + const trajectory: Trajectory = { + task: this.taskSpec, + steps, + finalAnswer: opts.finalAnswer ?? this.finalAnswerEvent?.message, + status: opts.status, + usage: { ...ZERO_USAGE, ...(opts.usage ?? {}) }, + timing: { startedAt: this.startedAt, endedAt: this.endedAt }, + }; + + if (this.persistEnabled) { + await this.persist(trajectory); + } + + return trajectory; + } + + /** Throw away in-memory state without writing to disk. Used on early abort. */ + cancel(): void { + this.detach(); + this.partialSteps.clear(); + this.observationByStep.clear(); + this.screenshotsByStep.clear(); + this.finalAnswerEvent = undefined; + } + + /** Where the trajectory dir lives (whether or not it was persisted). */ + get directory(): string { + return this.outputDir; + } + + /** Whether this recorder wrote the trajectory directory on finish(). */ + get persisted(): boolean { + return this.persistEnabled; + } + + /** + * Persist verifier scores next to the trajectory. No-op when trajectory + * persistence is disabled. 
+ */ + async persistVerdict( + verdict: Verdict, + filename = "mmrubric_v1.json", + ): Promise { + if (!this.persistEnabled) return; + + const scoresDir = path.join(this.outputDir, "scores"); + await fs.mkdir(scoresDir, { recursive: true }); + await fs.writeFile( + path.join(scoresDir, filename), + JSON.stringify(verdict, null, 2), + ); + + const taskDataPath = path.join(this.outputDir, "task_data.json"); + let taskData: Record = {}; + try { + taskData = JSON.parse(await fs.readFile(taskDataPath, "utf8")) as Record< + string, + unknown + >; + } catch { + taskData = { task: this.taskSpec }; + } + await fs.writeFile( + taskDataPath, + JSON.stringify({ ...taskData, verdict }, null, 2), + ); + } + + private detach(): void { + if (!this.listenersAttached) return; + this.v3.bus.off("agent_screenshot_taken_event", this.onScreenshot); + this.v3.bus.off("agent_step_finished_event", this.onStepFinished); + this.v3.bus.off("agent_step_observed_event", this.onStepObserved); + this.v3.bus.off("agent_final_answer_event", this.onFinalAnswer); + this.listenersAttached = false; + } + + private ensurePartial(stepIndex: number): Partial { + let p = this.partialSteps.get(stepIndex); + if (!p) { + p = { index: stepIndex }; + this.partialSteps.set(stepIndex, p); + } + return p; + } + + /** + * Materialize ordered TrajectoryStep[] from the accumulated partials. + * Steps that never received a step_finished event are skipped (they can + * appear for CUA where only screenshot events fire — those are recorded as + * orphan probe screenshots and elided here). + */ + private assembleSteps(): TrajectoryStep[] { + const out: TrajectoryStep[] = []; + const indices = [...this.partialSteps.keys()].sort((a, b) => a - b); + for (const i of indices) { + const p = this.partialSteps.get(i)!; + if ( + p.actionName === undefined || + p.toolOutput === undefined || + p.finishedAt === undefined + ) { + // Orphan screenshot-only entry (typically CUA). 
Skip — we record + // these by writing the screenshot to disk separately during persist(). + continue; + } + out.push({ + index: i, + actionName: p.actionName, + actionArgs: p.actionArgs ?? {}, + reasoning: p.reasoning ?? "", + agentEvidence: p.agentEvidence ?? { modalities: [] }, + probeEvidence: p.probeEvidence ?? {}, + toolOutput: p.toolOutput, + startedAt: this.startedAt, + finishedAt: p.finishedAt, + }); + } + return out; + } + + /** + * Write the trajectory directory layout. Mirrors fara's example_trajectory/: + * + * / + * ├── task_data.json + * ├── trajectory.json (screenshots referenced by path) + * ├── screenshot_.png + * └── times.json + */ + private async persist(trajectory: Trajectory): Promise { + await fs.mkdir(this.outputDir, { recursive: true }); + + // Walk steps and write screenshots; replace Buffer with path reference in + // the serialized trajectory. Both tiers externalize image bytes under + // screenshots/probe/.png — tier 2, what the harness observed + // screenshots/agent/.png — tier 1, what the model received + // The `_` suffix only appears when a step carries multiple images + // (rare; typically zero or one per step). Paths in JSON are relative to + // the trajectory dir so the directory is movable/copyable as a unit. 
+ await fs.mkdir(path.join(this.outputDir, "screenshots", "probe"), { + recursive: true, + }); + await fs.mkdir(path.join(this.outputDir, "screenshots", "agent"), { + recursive: true, + }); + + const serializableSteps: unknown[] = []; + for (const step of trajectory.steps) { + const probe: ProbeEvidence = { ...step.probeEvidence }; + if (probe.screenshot) { + const relPath = `screenshots/probe/${step.index + 1}.png`; + await fs.writeFile( + path.join(this.outputDir, relPath), + probe.screenshot, + ); + probe.screenshotPath = relPath; + delete probe.screenshot; + } + + const imageModalities = step.agentEvidence.modalities.filter( + (m) => m.type === "image", + ); + const multipleImages = imageModalities.length > 1; + let imageSeq = 0; + const modalities: unknown[] = []; + for (const m of step.agentEvidence.modalities) { + if (m.type !== "image") { + modalities.push(m); + continue; + } + const suffix = multipleImages ? `_${imageSeq}` : ""; + const relPath = `screenshots/agent/${step.index + 1}${suffix}.png`; + await fs.writeFile(path.join(this.outputDir, relPath), m.bytes); + modalities.push({ + type: "image", + imagePath: relPath, + mediaType: m.mediaType, + }); + imageSeq += 1; + } + const agentEvidence = { modalities }; + serializableSteps.push({ ...step, probeEvidence: probe, agentEvidence }); + } + + // Image modalities carry imagePath instead of raw bytes on disk, so this + // is no longer a strict Trajectory at the type level. Cast through + // unknown rather than widening the type contract. + const serialized = { + ...trajectory, + steps: serializableSteps, + } as unknown; + + await fs.writeFile( + path.join(this.outputDir, "trajectory.json"), + JSON.stringify(serialized, null, 2), + ); + + // task_data.json mirrors fara's shape: TaskSpec + (later) verdict. + await fs.writeFile( + path.join(this.outputDir, "task_data.json"), + JSON.stringify( + { + task: trajectory.task, + status: trajectory.status, + finalAnswer: trajectory.finalAnswer ?? 
null, + }, + null, + 2, + ), + ); + + await fs.writeFile( + path.join(this.outputDir, "times.json"), + JSON.stringify( + { + timing: trajectory.timing, + usage: trajectory.usage, + stepCount: trajectory.steps.length, + }, + null, + 2, + ), + ); + + await fs.mkdir(path.join(this.outputDir, "scores"), { recursive: true }); + await fs.writeFile( + path.join(this.outputDir, "core.log"), + coreLog(trajectory), + ); + } +} + +function mergeAgentEvidence( + ...parts: Array +): AgentEvidence { + return { + modalities: parts.flatMap((p) => p?.modalities ?? []), + }; +} + +/** + * Build a tier-1 AgentEvidence from a step_finished event. The handler's + * toolOutput.result is what the LLM consumed next turn (modulo SDK + * serialization). Wave 1 will replace this with a higher-fidelity capture + * pulled from event.response.messages. + */ +function buildAgentEvidence(e: AgentStepFinishedEvent): AgentEvidence { + const modalities: AgentEvidence["modalities"] = []; + if (e.reasoning) { + modalities.push({ type: "text", content: e.reasoning }); + } + const result = e.toolOutput.result; + if (result === undefined || result === null) { + return { modalities }; + } + if (typeof result === "string") { + modalities.push({ type: "text", content: result }); + } else if (Buffer.isBuffer(result)) { + modalities.push({ + type: "image", + bytes: result, + mediaType: "image/png", + }); + } else if (typeof result === "object") { + // Tool results commonly include a screenshotBase64 field for vision tools. 
+ const r = result as { screenshotBase64?: string } & Record; + if (typeof r.screenshotBase64 === "string") { + try { + modalities.push({ + type: "image", + bytes: Buffer.from(r.screenshotBase64, "base64"), + mediaType: "image/png", + }); + } catch { + // ignore + } + } + modalities.push({ type: "json", content: result }); + } + return { modalities }; +} + +function coreLog(trajectory: Trajectory): string { + return ( + trajectory.steps + .map((step) => + JSON.stringify({ + step: step.index, + action: step.actionName, + url: step.probeEvidence.url ?? null, + ok: step.toolOutput.ok, + reasoning: step.reasoning || undefined, + startedAt: step.startedAt, + finishedAt: step.finishedAt, + }), + ) + .join("\n") + "\n" + ); +} diff --git a/packages/evals/scripts/verify-trajectory-recorder.ts b/packages/evals/scripts/verify-trajectory-recorder.ts new file mode 100644 index 0000000000..20dfb85b6b --- /dev/null +++ b/packages/evals/scripts/verify-trajectory-recorder.ts @@ -0,0 +1,230 @@ +/** + * Wave 0 smoke test — verifies the TrajectoryRecorder plumbing end-to-end + * without launching a browser or calling an LLM. + * + * Drives a fake V3 (just an EventEmitter-shaped `bus`) through the same bus + * events the real agent handlers emit, then asserts: + * 1. The recorder assembles a Trajectory with the expected step shape. + * 2. The persisted directory layout matches fara's example_trajectory/. + * 3. V3Evaluator.verify() returns a parseable stub Verdict. 
+ * + * Run via: pnpm tsx packages/evals/scripts/verify-trajectory-recorder.ts + */ +import assert from "node:assert/strict"; +import fs from "node:fs/promises"; +import os from "node:os"; +import path from "node:path"; +import { EventEmitter } from "node:events"; + +import { TrajectoryRecorder } from "../framework/trajectoryRecorder.js"; +import { V3Evaluator } from "@browserbasehq/stagehand"; +import type { TaskSpec, V3 } from "@browserbasehq/stagehand"; + +interface FakeV3 { + bus: EventEmitter; +} + +async function main(): Promise { + const tmpRoot = await fs.mkdtemp( + path.join(os.tmpdir(), "verifier-rewrite-smoke-"), + ); + console.log(`▸ tmpdir: ${tmpRoot}`); + + const bus = new EventEmitter(); + const v3 = { bus } as unknown as V3; + const taskSpec: TaskSpec = { + id: "smoke-united_13", + instruction: + "What is the price difference between economy and business class on United?", + initUrl: "https://www.google.com", + precomputedRubric: { + items: [ + { + criterion: "Identify correct route", + description: "Agent identifies United CHI→GRU flight.", + max_points: 2, + }, + { + criterion: "Report price delta", + description: "Agent reports economy↔business price delta.", + max_points: 3, + }, + ], + }, + expectedAnswer: "Approximately $4,000 difference.", + }; + + const recorder = new TrajectoryRecorder({ + v3, + taskSpec, + outputRoot: tmpRoot, + runId: "smoke-run", + persist: true, + }); + recorder.start(); + + // Emit a three-step synthetic trajectory. 
+ bus.emit("agent_step_finished_event", { + stepIndex: 0, + actionName: "goto", + actionArgs: { url: "https://united.com" }, + reasoning: "Open United Airlines homepage.", + toolOutput: { ok: true, result: { url: "https://united.com" } }, + finishedAt: new Date().toISOString(), + }); + bus.emit("agent_screenshot_taken_event", { + stepIndex: 0, + screenshot: Buffer.from("fake-png-bytes-0"), + url: "https://united.com", + evidenceRole: "agent_and_probe", + }); + bus.emit("agent_step_observed_event", { + stepIndex: 0, + url: "https://united.com", + }); + + bus.emit("agent_step_finished_event", { + stepIndex: 1, + actionName: "act", + actionArgs: { instruction: "Search Chicago to São Paulo, Nov 24" }, + reasoning: "Enter route and dates.", + toolOutput: { + ok: true, + result: { success: true, describe: "Filled route + dates" }, + }, + finishedAt: new Date().toISOString(), + }); + bus.emit("agent_screenshot_taken_event", { + stepIndex: 1, + screenshot: Buffer.from("fake-png-bytes-1"), + url: "https://united.com/search", + }); + bus.emit("agent_step_observed_event", { + stepIndex: 1, + url: "https://united.com/search", + }); + + bus.emit("agent_step_finished_event", { + stepIndex: 2, + actionName: "extract", + actionArgs: { instruction: "extract fare cells" }, + reasoning: "Read economy and business fares from the results page.", + toolOutput: { + ok: true, + result: { economy: "$1,234", business: "$5,789" }, + }, + finishedAt: new Date().toISOString(), + }); + bus.emit("agent_screenshot_taken_event", { + stepIndex: 2, + screenshot: Buffer.from("fake-png-bytes-2"), + url: "https://united.com/results", + }); + bus.emit("agent_step_observed_event", { + stepIndex: 2, + url: "https://united.com/results", + ariaTree: + "[0-1] RootWebArea: United Search Results\n [0-3] heading: Flight 1234\n [0-4] StaticText: Economy $1,234\n [0-5] StaticText: Business $5,789", + }); + + bus.emit("agent_final_answer_event", { + message: "Economy $1,234 vs business $5,789 — delta $4,555.", + 
}); + + const trajectory = await recorder.finish({ + status: "complete", + usage: { input_tokens: 1234, output_tokens: 567 }, + }); + + // ── Assertions ────────────────────────────────────────────────────────── + assert.equal(trajectory.steps.length, 3, "expected 3 steps"); + assert.equal(trajectory.steps[0].actionName, "goto"); + assert.equal(trajectory.steps[1].actionName, "act"); + assert.equal(trajectory.steps[2].actionName, "extract"); + assert.ok( + trajectory.steps[0].agentEvidence.modalities.some( + (m) => m.type === "image", + ), + "CUA-style screenshot event should populate tier-1 image evidence", + ); + assert.ok( + trajectory.steps[2].agentEvidence.modalities.some( + (m) => + m.type === "json" && + typeof m.content === "object" && + m.content !== null && + "economy" in (m.content as Record), + ), + "extract step should carry a json modality with economy field", + ); + assert.equal( + trajectory.finalAnswer, + "Economy $1,234 vs business $5,789 — delta $4,555.", + ); + assert.equal(trajectory.status, "complete"); + assert.equal(trajectory.usage.input_tokens, 1234); + // a11y dump on step 2 should round-trip through the recorder into + // probeEvidence.ariaTree. + assert.ok( + trajectory.steps[2].probeEvidence.ariaTree?.includes("Economy $1,234"), + "step_observed.ariaTree should populate probeEvidence.ariaTree", + ); + console.log(" ✓ in-memory Trajectory shape (incl. 
ariaTree round-trip)"); + + // ── On-disk layout ────────────────────────────────────────────────────── + const taskDir = path.join(tmpRoot, "smoke-run", "smoke-united_13"); + const files = (await fs.readdir(taskDir)).sort(); + assert.deepEqual( + files, + [ + "core.log", + "scores", + "screenshots", + "task_data.json", + "times.json", + "trajectory.json", + ], + `expected new trajectory layout, got ${files.join(", ")}`, + ); + const probeFiles = ( + await fs.readdir(path.join(taskDir, "screenshots", "probe")) + ).sort(); + assert.deepEqual( + probeFiles, + ["1.png", "2.png", "3.png"], + `expected probe screenshots, got ${probeFiles.join(", ")}`, + ); + const screenshotBytes = await fs.readFile( + path.join(taskDir, "screenshots", "probe", "1.png"), + ); + assert.equal(screenshotBytes.toString(), "fake-png-bytes-0"); + const coreLog = await fs.readFile(path.join(taskDir, "core.log"), "utf8"); + assert.ok(coreLog.includes('"action":"goto"')); + console.log(" ✓ on-disk layout matches fara's example_trajectory"); + + const persistedTask = JSON.parse( + await fs.readFile(path.join(taskDir, "task_data.json"), "utf8"), + ); + assert.equal(persistedTask.task.id, "smoke-united_13"); + assert.equal(persistedTask.status, "complete"); + + // ── V3Evaluator.verify() exercised live in verify-live-trajectory.ts ── + // Sanity-check that the V3Evaluator class still constructs from a minimal + // V3 shape (recorder doesn't depend on the evaluator for plumbing). + const _unused: typeof V3Evaluator = V3Evaluator; + void _unused; + console.log( + " ✓ V3Evaluator still constructs (verify() exercised live elsewhere)", + ); + + console.log("\n✅ Wave 0 plumbing OK"); + await fs.rm(tmpRoot, { recursive: true, force: true }); +} + +main().catch((err) => { + console.error("\n❌ Wave 0 plumbing FAILED:", err); + process.exit(1); +}); + +// Type guard for FakeV3 lint suppression (the file uses `as unknown as V3`). 
+export type { FakeV3 }; From 40e7ab30b81903677551f28d1238d20747763bc5 Mon Sep 17 00:00:00 2001 From: miguel Date: Fri, 15 May 2026 14:16:26 -0700 Subject: [PATCH 02/27] fix(verifier): align trajectory naming --- .changeset/verifier-trajectory-events.md | 5 +++++ packages/evals/framework/trajectoryRecorder.ts | 6 ++++-- packages/evals/scripts/verify-trajectory-recorder.ts | 4 ++-- 3 files changed, 11 insertions(+), 4 deletions(-) create mode 100644 .changeset/verifier-trajectory-events.md diff --git a/.changeset/verifier-trajectory-events.md b/.changeset/verifier-trajectory-events.md new file mode 100644 index 0000000000..9dcb5c8192 --- /dev/null +++ b/.changeset/verifier-trajectory-events.md @@ -0,0 +1,5 @@ +--- +"@browserbasehq/stagehand": patch +--- + +Capture verifier trajectory evidence from v3 agent events for offline scoring. diff --git a/packages/evals/framework/trajectoryRecorder.ts b/packages/evals/framework/trajectoryRecorder.ts index 2b7f24b529..501668c2be 100644 --- a/packages/evals/framework/trajectoryRecorder.ts +++ b/packages/evals/framework/trajectoryRecorder.ts @@ -327,12 +327,14 @@ export class TrajectoryRecorder { } /** - * Write the trajectory directory layout. Mirrors fara's example_trajectory/: + * Write the trajectory directory layout. 
* * / * ├── task_data.json * ├── trajectory.json (screenshots referenced by path) - * ├── screenshot_.png + * ├── screenshots/ + * │ ├── probe/.png + * │ └── agent/.png * └── times.json */ private async persist(trajectory: Trajectory): Promise { diff --git a/packages/evals/scripts/verify-trajectory-recorder.ts b/packages/evals/scripts/verify-trajectory-recorder.ts index 20dfb85b6b..7076fff21b 100644 --- a/packages/evals/scripts/verify-trajectory-recorder.ts +++ b/packages/evals/scripts/verify-trajectory-recorder.ts @@ -42,12 +42,12 @@ async function main(): Promise { { criterion: "Identify correct route", description: "Agent identifies United CHI→GRU flight.", - max_points: 2, + maxPoints: 2, }, { criterion: "Report price delta", description: "Agent reports economy↔business price delta.", - max_points: 3, + maxPoints: 3, }, ], }, From c25367bbcf0af855fadb6321dc0d8f94b7aadd7e Mon Sep 17 00:00:00 2001 From: miguel Date: Fri, 15 May 2026 14:35:06 -0700 Subject: [PATCH 03/27] chore(evals): remove upstream trajectory references --- packages/evals/framework/trajectoryRecorder.ts | 6 +++--- packages/evals/scripts/verify-trajectory-recorder.ts | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/packages/evals/framework/trajectoryRecorder.ts b/packages/evals/framework/trajectoryRecorder.ts index 501668c2be..5a8a62f1dc 100644 --- a/packages/evals/framework/trajectoryRecorder.ts +++ b/packages/evals/framework/trajectoryRecorder.ts @@ -14,8 +14,8 @@ * - "1" / "true": always persist. * - "0" / "false": never persist. * - * On-disk layout matches microsoft/fara's example_trajectory/ so we can - * cross-validate against verify_trajectories.py without format conversion. + * On-disk layout is stable JSON + screenshots so saved runs can be re-scored + * without format conversion. 
* * @see ~/.claude/plans/verifier-rewrite.html §06 (Trajectory on-disk) */ @@ -405,7 +405,7 @@ export class TrajectoryRecorder { JSON.stringify(serialized, null, 2), ); - // task_data.json mirrors fara's shape: TaskSpec + (later) verdict. + // task_data.json stores TaskSpec + (later) verdict. await fs.writeFile( path.join(this.outputDir, "task_data.json"), JSON.stringify( diff --git a/packages/evals/scripts/verify-trajectory-recorder.ts b/packages/evals/scripts/verify-trajectory-recorder.ts index 7076fff21b..049b96c706 100644 --- a/packages/evals/scripts/verify-trajectory-recorder.ts +++ b/packages/evals/scripts/verify-trajectory-recorder.ts @@ -5,7 +5,7 @@ * Drives a fake V3 (just an EventEmitter-shaped `bus`) through the same bus * events the real agent handlers emit, then asserts: * 1. The recorder assembles a Trajectory with the expected step shape. - * 2. The persisted directory layout matches fara's example_trajectory/. + * 2. The persisted directory layout has the expected verifier files. * 3. V3Evaluator.verify() returns a parseable stub Verdict. 
* * Run via: pnpm tsx packages/evals/scripts/verify-trajectory-recorder.ts @@ -200,7 +200,7 @@ async function main(): Promise { assert.equal(screenshotBytes.toString(), "fake-png-bytes-0"); const coreLog = await fs.readFile(path.join(taskDir, "core.log"), "utf8"); assert.ok(coreLog.includes('"action":"goto"')); - console.log(" ✓ on-disk layout matches fara's example_trajectory"); + console.log(" ✓ on-disk layout has expected verifier files"); const persistedTask = JSON.parse( await fs.readFile(path.join(taskDir, "task_data.json"), "utf8"), From 8e9962cc78c486ec9d497569d57c208fd0c79a3f Mon Sep 17 00:00:00 2001 From: miguel Date: Fri, 15 May 2026 15:19:16 -0700 Subject: [PATCH 04/27] docs(verifier): remove rollout comments from trajectory capture --- packages/core/lib/v3/handlers/v3AgentHandler.ts | 2 +- packages/core/lib/v3/handlers/v3CuaAgentHandler.ts | 4 ++-- packages/evals/framework/trajectoryRecorder.ts | 7 ++----- packages/evals/scripts/verify-trajectory-recorder.ts | 6 +++--- 4 files changed, 8 insertions(+), 11 deletions(-) diff --git a/packages/core/lib/v3/handlers/v3AgentHandler.ts b/packages/core/lib/v3/handlers/v3AgentHandler.ts index d0308bdd8a..afddddef22 100644 --- a/packages/core/lib/v3/handlers/v3AgentHandler.ts +++ b/packages/core/lib/v3/handlers/v3AgentHandler.ts @@ -313,7 +313,7 @@ export class V3AgentHandler { // Emit step_finished_event per tool call. The TrajectoryRecorder // builds one Trajectory.Step per emission. tier-1 evidence (the // bytes the LLM consumed) is captured separately via an - // onStepFinish wrapper in the harness (plan §10 Q1). + // onStepFinish wrapper in the harness. 
const stepIndex = stepCounter++; stepIndicesInTurn.push(stepIndex); const toolOk = diff --git a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts index f1dd2666e6..2fd08b8647 100644 --- a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts +++ b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts @@ -42,7 +42,7 @@ export class V3CuaAgentHandler { // Monotonic step counter used by bus events. The CUA loop is internal to // the agent client, so unlike v3AgentHandler we don't have per-tool-call // step events; instead we tag every screenshot emission with an - // incrementing index. Wave 1 may add finer-grained step events here. + // incrementing index. private cuaStepCounter = 0; private latestCuaScreenshot?: AgentScreenshotTakenEvent; private latestCuaScreenshotConsumed = true; @@ -89,7 +89,7 @@ export class V3CuaAgentHandler { // Emit bus event so TrajectoryRecorder can capture the screenshot. In // CUA mode this is the same buffer the provider receives — i.e., it // serves both as tier-1 evidence (what the model saw) and as a tier-2 - // probe. See plan §04 "Mode-by-mode sources". + // probe. try { this.emitCuaScreenshot(screenshotBuffer, page.url()); } catch { diff --git a/packages/evals/framework/trajectoryRecorder.ts b/packages/evals/framework/trajectoryRecorder.ts index 5a8a62f1dc..d7c4d62ab4 100644 --- a/packages/evals/framework/trajectoryRecorder.ts +++ b/packages/evals/framework/trajectoryRecorder.ts @@ -9,15 +9,13 @@ * await agent.execute(...); * const trajectory = await recorder.finish({ status: "complete", usage }); * - * Persistence is env-gated by `VERIFIER_PERSIST_TRAJECTORIES` (plan §10 Q2): + * Persistence is env-gated by `VERIFIER_PERSIST_TRAJECTORIES`: * - unset: persistence follows the default (on locally, off in CI). * - "1" / "true": always persist. * - "0" / "false": never persist. * * On-disk layout is stable JSON + screenshots so saved runs can be re-scored * without format conversion. 
- * - * @see ~/.claude/plans/verifier-rewrite.html §06 (Trajectory on-disk) */ import fs from "node:fs/promises"; import path from "node:path"; @@ -451,8 +449,7 @@ function mergeAgentEvidence( /** * Build a tier-1 AgentEvidence from a step_finished event. The handler's * toolOutput.result is what the LLM consumed next turn (modulo SDK - * serialization). Wave 1 will replace this with a higher-fidelity capture - * pulled from event.response.messages. + * serialization). */ function buildAgentEvidence(e: AgentStepFinishedEvent): AgentEvidence { const modalities: AgentEvidence["modalities"] = []; diff --git a/packages/evals/scripts/verify-trajectory-recorder.ts b/packages/evals/scripts/verify-trajectory-recorder.ts index 049b96c706..c2df86fd15 100644 --- a/packages/evals/scripts/verify-trajectory-recorder.ts +++ b/packages/evals/scripts/verify-trajectory-recorder.ts @@ -1,5 +1,5 @@ /** - * Wave 0 smoke test — verifies the TrajectoryRecorder plumbing end-to-end + * Smoke test — verifies the TrajectoryRecorder plumbing end-to-end * without launching a browser or calling an LLM. 
* * Drives a fake V3 (just an EventEmitter-shaped `bus`) through the same bus @@ -217,12 +217,12 @@ async function main(): Promise { " ✓ V3Evaluator still constructs (verify() exercised live elsewhere)", ); - console.log("\n✅ Wave 0 plumbing OK"); + console.log("\n✅ Trajectory recorder plumbing OK"); await fs.rm(tmpRoot, { recursive: true, force: true }); } main().catch((err) => { - console.error("\n❌ Wave 0 plumbing FAILED:", err); + console.error("\n❌ Trajectory recorder plumbing FAILED:", err); process.exit(1); }); From bb514e3ab5a8f42970fc5bdf9ef5c423281969db Mon Sep 17 00:00:00 2001 From: miguel Date: Fri, 15 May 2026 22:31:14 -0700 Subject: [PATCH 05/27] test(evals): cover trajectory recorder in vitest --- .../evals/framework/trajectoryRecorder.ts | 16 +- .../scripts/verify-trajectory-recorder.ts | 230 ------------------ .../framework/trajectoryRecorder.test.ts | 197 +++++++++++++++ 3 files changed, 205 insertions(+), 238 deletions(-) delete mode 100644 packages/evals/scripts/verify-trajectory-recorder.ts create mode 100644 packages/evals/tests/framework/trajectoryRecorder.test.ts diff --git a/packages/evals/framework/trajectoryRecorder.ts b/packages/evals/framework/trajectoryRecorder.ts index d7c4d62ab4..8895a08443 100644 --- a/packages/evals/framework/trajectoryRecorder.ts +++ b/packages/evals/framework/trajectoryRecorder.ts @@ -31,7 +31,7 @@ import type { TrajectoryStatus, TrajectoryStep, TrajectoryUsage, - Verdict, + EvaluationResult, V3, } from "@browserbasehq/stagehand"; @@ -239,12 +239,12 @@ export class TrajectoryRecorder { } /** - * Persist verifier scores next to the trajectory. No-op when trajectory + * Persist evaluator result next to the trajectory. No-op when trajectory * persistence is disabled. 
*/ - async persistVerdict( - verdict: Verdict, - filename = "mmrubric_v1.json", + async persistResult( + result: EvaluationResult, + filename = "result.json", ): Promise { if (!this.persistEnabled) return; @@ -252,7 +252,7 @@ export class TrajectoryRecorder { await fs.mkdir(scoresDir, { recursive: true }); await fs.writeFile( path.join(scoresDir, filename), - JSON.stringify(verdict, null, 2), + JSON.stringify(result, null, 2), ); const taskDataPath = path.join(this.outputDir, "task_data.json"); @@ -267,7 +267,7 @@ export class TrajectoryRecorder { } await fs.writeFile( taskDataPath, - JSON.stringify({ ...taskData, verdict }, null, 2), + JSON.stringify({ ...taskData, result }, null, 2), ); } @@ -403,7 +403,7 @@ export class TrajectoryRecorder { JSON.stringify(serialized, null, 2), ); - // task_data.json stores TaskSpec + (later) verdict. + // task_data.json stores TaskSpec + (later) result. await fs.writeFile( path.join(this.outputDir, "task_data.json"), JSON.stringify( diff --git a/packages/evals/scripts/verify-trajectory-recorder.ts b/packages/evals/scripts/verify-trajectory-recorder.ts deleted file mode 100644 index c2df86fd15..0000000000 --- a/packages/evals/scripts/verify-trajectory-recorder.ts +++ /dev/null @@ -1,230 +0,0 @@ -/** - * Smoke test — verifies the TrajectoryRecorder plumbing end-to-end - * without launching a browser or calling an LLM. - * - * Drives a fake V3 (just an EventEmitter-shaped `bus`) through the same bus - * events the real agent handlers emit, then asserts: - * 1. The recorder assembles a Trajectory with the expected step shape. - * 2. The persisted directory layout has the expected verifier files. - * 3. V3Evaluator.verify() returns a parseable stub Verdict. 
- * - * Run via: pnpm tsx packages/evals/scripts/verify-trajectory-recorder.ts - */ -import assert from "node:assert/strict"; -import fs from "node:fs/promises"; -import os from "node:os"; -import path from "node:path"; -import { EventEmitter } from "node:events"; - -import { TrajectoryRecorder } from "../framework/trajectoryRecorder.js"; -import { V3Evaluator } from "@browserbasehq/stagehand"; -import type { TaskSpec, V3 } from "@browserbasehq/stagehand"; - -interface FakeV3 { - bus: EventEmitter; -} - -async function main(): Promise { - const tmpRoot = await fs.mkdtemp( - path.join(os.tmpdir(), "verifier-rewrite-smoke-"), - ); - console.log(`▸ tmpdir: ${tmpRoot}`); - - const bus = new EventEmitter(); - const v3 = { bus } as unknown as V3; - const taskSpec: TaskSpec = { - id: "smoke-united_13", - instruction: - "What is the price difference between economy and business class on United?", - initUrl: "https://www.google.com", - precomputedRubric: { - items: [ - { - criterion: "Identify correct route", - description: "Agent identifies United CHI→GRU flight.", - maxPoints: 2, - }, - { - criterion: "Report price delta", - description: "Agent reports economy↔business price delta.", - maxPoints: 3, - }, - ], - }, - expectedAnswer: "Approximately $4,000 difference.", - }; - - const recorder = new TrajectoryRecorder({ - v3, - taskSpec, - outputRoot: tmpRoot, - runId: "smoke-run", - persist: true, - }); - recorder.start(); - - // Emit a three-step synthetic trajectory. 
- bus.emit("agent_step_finished_event", { - stepIndex: 0, - actionName: "goto", - actionArgs: { url: "https://united.com" }, - reasoning: "Open United Airlines homepage.", - toolOutput: { ok: true, result: { url: "https://united.com" } }, - finishedAt: new Date().toISOString(), - }); - bus.emit("agent_screenshot_taken_event", { - stepIndex: 0, - screenshot: Buffer.from("fake-png-bytes-0"), - url: "https://united.com", - evidenceRole: "agent_and_probe", - }); - bus.emit("agent_step_observed_event", { - stepIndex: 0, - url: "https://united.com", - }); - - bus.emit("agent_step_finished_event", { - stepIndex: 1, - actionName: "act", - actionArgs: { instruction: "Search Chicago to São Paulo, Nov 24" }, - reasoning: "Enter route and dates.", - toolOutput: { - ok: true, - result: { success: true, describe: "Filled route + dates" }, - }, - finishedAt: new Date().toISOString(), - }); - bus.emit("agent_screenshot_taken_event", { - stepIndex: 1, - screenshot: Buffer.from("fake-png-bytes-1"), - url: "https://united.com/search", - }); - bus.emit("agent_step_observed_event", { - stepIndex: 1, - url: "https://united.com/search", - }); - - bus.emit("agent_step_finished_event", { - stepIndex: 2, - actionName: "extract", - actionArgs: { instruction: "extract fare cells" }, - reasoning: "Read economy and business fares from the results page.", - toolOutput: { - ok: true, - result: { economy: "$1,234", business: "$5,789" }, - }, - finishedAt: new Date().toISOString(), - }); - bus.emit("agent_screenshot_taken_event", { - stepIndex: 2, - screenshot: Buffer.from("fake-png-bytes-2"), - url: "https://united.com/results", - }); - bus.emit("agent_step_observed_event", { - stepIndex: 2, - url: "https://united.com/results", - ariaTree: - "[0-1] RootWebArea: United Search Results\n [0-3] heading: Flight 1234\n [0-4] StaticText: Economy $1,234\n [0-5] StaticText: Business $5,789", - }); - - bus.emit("agent_final_answer_event", { - message: "Economy $1,234 vs business $5,789 — delta $4,555.", - 
}); - - const trajectory = await recorder.finish({ - status: "complete", - usage: { input_tokens: 1234, output_tokens: 567 }, - }); - - // ── Assertions ────────────────────────────────────────────────────────── - assert.equal(trajectory.steps.length, 3, "expected 3 steps"); - assert.equal(trajectory.steps[0].actionName, "goto"); - assert.equal(trajectory.steps[1].actionName, "act"); - assert.equal(trajectory.steps[2].actionName, "extract"); - assert.ok( - trajectory.steps[0].agentEvidence.modalities.some( - (m) => m.type === "image", - ), - "CUA-style screenshot event should populate tier-1 image evidence", - ); - assert.ok( - trajectory.steps[2].agentEvidence.modalities.some( - (m) => - m.type === "json" && - typeof m.content === "object" && - m.content !== null && - "economy" in (m.content as Record), - ), - "extract step should carry a json modality with economy field", - ); - assert.equal( - trajectory.finalAnswer, - "Economy $1,234 vs business $5,789 — delta $4,555.", - ); - assert.equal(trajectory.status, "complete"); - assert.equal(trajectory.usage.input_tokens, 1234); - // a11y dump on step 2 should round-trip through the recorder into - // probeEvidence.ariaTree. - assert.ok( - trajectory.steps[2].probeEvidence.ariaTree?.includes("Economy $1,234"), - "step_observed.ariaTree should populate probeEvidence.ariaTree", - ); - console.log(" ✓ in-memory Trajectory shape (incl. 
ariaTree round-trip)"); - - // ── On-disk layout ────────────────────────────────────────────────────── - const taskDir = path.join(tmpRoot, "smoke-run", "smoke-united_13"); - const files = (await fs.readdir(taskDir)).sort(); - assert.deepEqual( - files, - [ - "core.log", - "scores", - "screenshots", - "task_data.json", - "times.json", - "trajectory.json", - ], - `expected new trajectory layout, got ${files.join(", ")}`, - ); - const probeFiles = ( - await fs.readdir(path.join(taskDir, "screenshots", "probe")) - ).sort(); - assert.deepEqual( - probeFiles, - ["1.png", "2.png", "3.png"], - `expected probe screenshots, got ${probeFiles.join(", ")}`, - ); - const screenshotBytes = await fs.readFile( - path.join(taskDir, "screenshots", "probe", "1.png"), - ); - assert.equal(screenshotBytes.toString(), "fake-png-bytes-0"); - const coreLog = await fs.readFile(path.join(taskDir, "core.log"), "utf8"); - assert.ok(coreLog.includes('"action":"goto"')); - console.log(" ✓ on-disk layout has expected verifier files"); - - const persistedTask = JSON.parse( - await fs.readFile(path.join(taskDir, "task_data.json"), "utf8"), - ); - assert.equal(persistedTask.task.id, "smoke-united_13"); - assert.equal(persistedTask.status, "complete"); - - // ── V3Evaluator.verify() exercised live in verify-live-trajectory.ts ── - // Sanity-check that the V3Evaluator class still constructs from a minimal - // V3 shape (recorder doesn't depend on the evaluator for plumbing). - const _unused: typeof V3Evaluator = V3Evaluator; - void _unused; - console.log( - " ✓ V3Evaluator still constructs (verify() exercised live elsewhere)", - ); - - console.log("\n✅ Trajectory recorder plumbing OK"); - await fs.rm(tmpRoot, { recursive: true, force: true }); -} - -main().catch((err) => { - console.error("\n❌ Trajectory recorder plumbing FAILED:", err); - process.exit(1); -}); - -// Type guard for FakeV3 lint suppression (the file uses `as unknown as V3`). 
-export type { FakeV3 }; diff --git a/packages/evals/tests/framework/trajectoryRecorder.test.ts b/packages/evals/tests/framework/trajectoryRecorder.test.ts new file mode 100644 index 0000000000..5c5268e66a --- /dev/null +++ b/packages/evals/tests/framework/trajectoryRecorder.test.ts @@ -0,0 +1,197 @@ +import { EventEmitter } from "node:events"; +import fs from "node:fs/promises"; +import os from "node:os"; +import path from "node:path"; + +import { afterEach, describe, expect, it } from "vitest"; +import type { TaskSpec, V3 } from "@browserbasehq/stagehand"; + +import { TrajectoryRecorder } from "../../framework/trajectoryRecorder.js"; + +const tempDirs: string[] = []; + +afterEach(async () => { + while (tempDirs.length > 0) { + const dir = tempDirs.pop(); + if (dir) await fs.rm(dir, { recursive: true, force: true }); + } +}); + +function makeTempDir(): Promise { + return fs + .mkdtemp(path.join(os.tmpdir(), "trajectory-recorder-")) + .then((dir) => { + tempDirs.push(dir); + return dir; + }); +} + +function makeV3(bus = new EventEmitter()): V3 { + return { bus } as unknown as V3; +} + +function makeTaskSpec(): TaskSpec { + return { + id: "recorder-task", + instruction: "Compare economy and business fares.", + initUrl: "https://example.com", + precomputedRubric: { + items: [ + { + criterion: "Report fare delta", + description: "Report the difference between two fares.", + maxPoints: 1, + }, + ], + }, + }; +} + +describe("TrajectoryRecorder", () => { + it("assembles trajectory evidence from bus events", async () => { + const bus = new EventEmitter(); + const recorder = new TrajectoryRecorder({ + v3: makeV3(bus), + taskSpec: makeTaskSpec(), + persist: false, + }); + const screenshot = Buffer.from("screen-1"); + + recorder.start(); + bus.emit("agent_screenshot_taken_event", { + stepIndex: 0, + screenshot, + url: "https://example.com/search", + evidenceRole: "agent_and_probe", + }); + bus.emit("agent_step_finished_event", { + stepIndex: 0, + actionName: "extract", + 
actionArgs: { instruction: "Read fares" }, + reasoning: "Read visible fare cells.", + toolOutput: { + ok: true, + result: { economy: "$100", business: "$250" }, + }, + finishedAt: new Date(0).toISOString(), + }); + bus.emit("agent_step_observed_event", { + stepIndex: 0, + url: "https://example.com/search", + ariaTree: "RootWebArea\nStaticText: Economy $100", + }); + bus.emit("agent_final_answer_event", { + message: "Business is $150 more than economy.", + }); + + const trajectory = await recorder.finish({ + status: "complete", + usage: { input_tokens: 10, output_tokens: 5 }, + }); + + expect(trajectory.steps).toHaveLength(1); + expect(trajectory.steps[0]).toMatchObject({ + index: 0, + actionName: "extract", + actionArgs: { instruction: "Read fares" }, + reasoning: "Read visible fare cells.", + toolOutput: { + ok: true, + result: { economy: "$100", business: "$250" }, + }, + probeEvidence: { + url: "https://example.com/search", + ariaTree: "RootWebArea\nStaticText: Economy $100", + }, + }); + expect(trajectory.steps[0].probeEvidence.screenshot).toEqual(screenshot); + expect(trajectory.steps[0].agentEvidence.modalities).toEqual( + expect.arrayContaining([ + { type: "image", bytes: screenshot, mediaType: "image/png" }, + { type: "text", content: "Read visible fare cells." 
}, + { type: "json", content: { economy: "$100", business: "$250" } }, + ]), + ); + expect(trajectory.finalAnswer).toBe("Business is $150 more than economy."); + }); + + it("persists trajectory files and evaluator results", async () => { + const outputRoot = await makeTempDir(); + const bus = new EventEmitter(); + const recorder = new TrajectoryRecorder({ + v3: makeV3(bus), + taskSpec: makeTaskSpec(), + outputRoot, + runId: "run-1", + persist: true, + }); + const screenshot = Buffer.from("screen-1"); + + recorder.start(); + bus.emit("agent_screenshot_taken_event", { + stepIndex: 0, + screenshot, + url: "https://example.com/search", + evidenceRole: "agent_and_probe", + }); + bus.emit("agent_step_finished_event", { + stepIndex: 0, + actionName: "act", + actionArgs: { instruction: "Search fares" }, + reasoning: "Search for fares.", + toolOutput: { ok: true, result: "done" }, + finishedAt: new Date(0).toISOString(), + }); + bus.emit("agent_step_observed_event", { + stepIndex: 0, + url: "https://example.com/search", + }); + + await recorder.finish({ status: "complete" }); + await recorder.persistResult({ + outcomeSuccess: true, + explanation: "The task was completed.", + }); + + const taskDir = path.join(outputRoot, "run-1", "recorder-task"); + await expect(fs.readdir(taskDir)).resolves.toEqual( + expect.arrayContaining([ + "core.log", + "scores", + "screenshots", + "task_data.json", + "times.json", + "trajectory.json", + ]), + ); + await expect( + fs.readFile(path.join(taskDir, "screenshots", "probe", "1.png")), + ).resolves.toEqual(screenshot); + await expect( + fs.readFile(path.join(taskDir, "screenshots", "agent", "1.png")), + ).resolves.toEqual(screenshot); + await expect( + fs.readFile(path.join(taskDir, "scores", "result.json"), "utf8"), + ).resolves.toContain('"outcomeSuccess": true'); + + const trajectory = JSON.parse( + await fs.readFile(path.join(taskDir, "trajectory.json"), "utf8"), + ); + expect(trajectory.steps[0].probeEvidence.screenshotPath).toBe( + 
"screenshots/probe/1.png", + ); + expect(trajectory.steps[0].agentEvidence.modalities).toContainEqual({ + type: "image", + imagePath: "screenshots/agent/1.png", + mediaType: "image/png", + }); + + const taskData = JSON.parse( + await fs.readFile(path.join(taskDir, "task_data.json"), "utf8"), + ); + expect(taskData.result).toMatchObject({ + outcomeSuccess: true, + explanation: "The task was completed.", + }); + }); +}); From 9138ddf88a514c67d37bdfe444e8c7135f549e26 Mon Sep 17 00:00:00 2001 From: miguel Date: Fri, 15 May 2026 22:48:53 -0700 Subject: [PATCH 06/27] docs(verifier): trim trajectory event comments --- .../core/lib/v3/agent/AnthropicCUAClient.ts | 4 ---- .../core/lib/v3/types/public/busEvents.ts | 19 +++++-------------- 2 files changed, 5 insertions(+), 18 deletions(-) diff --git a/packages/core/lib/v3/agent/AnthropicCUAClient.ts b/packages/core/lib/v3/agent/AnthropicCUAClient.ts index 54d64f15d0..752d208e22 100644 --- a/packages/core/lib/v3/agent/AnthropicCUAClient.ts +++ b/packages/core/lib/v3/agent/AnthropicCUAClient.ts @@ -902,10 +902,6 @@ export class AnthropicCUAClient extends AgentClient { ...input, }; } else if (action === "triple_click" || action === "tripleClick") { - // Anthropic's computer_20250124 tool emits `triple_click` with - // `coordinate: [x, y]`. Without this branch the snake_case name + - // raw coordinate array fall through to the generic `else` and - // executeAction logs "Unknown action type: triple_click". return { type: "tripleClick", x: diff --git a/packages/core/lib/v3/types/public/busEvents.ts b/packages/core/lib/v3/types/public/busEvents.ts index 62e9929492..e2fa119499 100644 --- a/packages/core/lib/v3/types/public/busEvents.ts +++ b/packages/core/lib/v3/types/public/busEvents.ts @@ -5,16 +5,8 @@ * event name so consumers (TrajectoryRecorder in packages/evals, custom * subscribers) can type their handlers. 
* - * Wave 0 of the verifier rewrite plan introduces: - * - agent_screenshot_taken_event — independent post-step screenshot probe - * - agent_step_finished_event — fired per tool-call in a step result - * - agent_step_observed_event — fired after the harness probe completes - * - agent_final_answer_event — fired when the `done` tool resolves - * - * `agent_step_started_event` is documented in the plan but deferred — the AI - * SDK's `onStepFinish` is a post-hook, and there's no symmetric pre-hook per - * tool execution in v3AgentHandler today. Started-state can be derived from - * the finished event's stepIndex if needed. + * The verifier recorder consumes these events to assemble persisted + * trajectories without coupling to individual agent handlers. */ /** @@ -37,7 +29,7 @@ export type BusEventName = (typeof BUS_EVENTS)[keyof typeof BUS_EVENTS]; * * Note: in CUA mode the same Buffer is also what the provider received; in * DOM/hybrid mode it's an independent harness probe. The verifier treats them - * as different evidence tiers regardless — see plan §04 ("Mode-by-mode sources"). + * as different evidence tiers regardless. */ export interface AgentScreenshotTakenEvent { /** Zero-based index of the step this screenshot corresponds to. */ @@ -63,7 +55,7 @@ export interface AgentScreenshotTakenEvent { * * Tier 1 evidence (the bytes the LLM consumed as the tool result) is captured * separately by the harness via an AgentExecuteCallbacks.onStepFinish wrapper - * — not in this payload. See plan §10 Q1 (resolved: onStepFinish). + * and is not part of this payload. */ export interface AgentStepFinishedEvent { stepIndex: number; @@ -86,8 +78,7 @@ export interface AgentStepFinishedEvent { /** * Payload for `agent_step_observed_event`. Emitted after the harness probe - * completes for a step (page URL captured at minimum; a11y tree and scroll - * info added in Wave 2). + * completes for a step. 
*/ export interface AgentStepObservedEvent { stepIndex: number; From 1303315cc91c69ed8880f65961638ad0a0cc4c32 Mon Sep 17 00:00:00 2001 From: miguel Date: Mon, 18 May 2026 17:00:45 -0700 Subject: [PATCH 07/27] refactor(verifier): extract writeTrajectoryDir + shouldPersistTrajectory Lift the on-disk persistence helpers from TrajectoryRecorder into verifier/trajectory.ts so #2137's harness adapter can share them. Also drop the recorder's no-op .replace("T","T") and the WHAT-narration comments per project policy. Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/core/lib/v3/index.ts | 6 + packages/core/lib/v3/verifier/index.ts | 2 + packages/core/lib/v3/verifier/trajectory.ts | 137 +++++++++++ .../evals/framework/trajectoryRecorder.ts | 214 ++---------------- 4 files changed, 165 insertions(+), 194 deletions(-) diff --git a/packages/core/lib/v3/index.ts b/packages/core/lib/v3/index.ts index 8e21fb0309..a5cbccf746 100644 --- a/packages/core/lib/v3/index.ts +++ b/packages/core/lib/v3/index.ts @@ -28,6 +28,8 @@ import { loadTrajectoryFromDisk, nextResultFilename, normalizeRubric, + shouldPersistTrajectory, + writeTrajectoryDir, } from "./verifier/index.js"; export { V3 } from "./v3.js"; @@ -93,6 +95,8 @@ export { loadTrajectoryFromDisk, nextResultFilename, normalizeRubric, + shouldPersistTrajectory, + writeTrajectoryDir, } from "./verifier/index.js"; export { tool } from "ai"; export { getAISDKLanguageModel } from "./llm/LLMProvider.js"; @@ -147,6 +151,8 @@ const StagehandDefault = { loadTrajectoryFromDisk, nextResultFilename, normalizeRubric, + shouldPersistTrajectory, + writeTrajectoryDir, tool, getAISDKLanguageModel, __internalCreateInMemoryAgentCacheHandle, diff --git a/packages/core/lib/v3/verifier/index.ts b/packages/core/lib/v3/verifier/index.ts index 4061533ab9..2b14cfb16a 100644 --- a/packages/core/lib/v3/verifier/index.ts +++ b/packages/core/lib/v3/verifier/index.ts @@ -25,4 +25,6 @@ export { loadTrajectoryFromDisk, nextResultFilename, normalizeRubric, + 
shouldPersistTrajectory, + writeTrajectoryDir, } from "./trajectory.js"; diff --git a/packages/core/lib/v3/verifier/trajectory.ts b/packages/core/lib/v3/verifier/trajectory.ts index a18f025c37..ec602d04d0 100644 --- a/packages/core/lib/v3/verifier/trajectory.ts +++ b/packages/core/lib/v3/verifier/trajectory.ts @@ -1,3 +1,5 @@ +import fs from "node:fs/promises"; +import path from "node:path"; import type { AgentEvidenceModality, ProbeEvidence, @@ -187,3 +189,138 @@ export async function loadTrajectoryFromDisk(dir: string): Promise { export function nextResultFilename(label?: string): string { return `result_${normalizeResultLabel(label)}.json`; } + +/** + * Default persistence policy: explicit override, then env, then "on unless CI". + */ +export function shouldPersistTrajectory( + override: boolean | undefined, +): boolean { + if (override !== undefined) return override; + const env = process.env.VERIFIER_PERSIST_TRAJECTORIES?.toLowerCase(); + if (env === "1" || env === "true") return true; + if (env === "0" || env === "false") return false; + return !process.env.CI; +} + +/** + * Write the on-disk trajectory layout under `dir`: + * + * / + * ├── task_data.json + * ├── trajectory.json (screenshots referenced by path) + * ├── screenshots/ + * │ ├── probe/.png + * │ └── agent/[_M].png + * ├── times.json + * ├── scores/ (empty; populated separately) + * └── core.log + * + * Image bytes are externalized to PNG files; the in-memory Trajectory is left + * untouched so callers can keep using it after persistence. 
+ */ +export async function writeTrajectoryDir( + dir: string, + trajectory: Trajectory, +): Promise { + await fs.mkdir(dir, { recursive: true }); + await fs.mkdir(path.join(dir, "screenshots", "probe"), { recursive: true }); + await fs.mkdir(path.join(dir, "screenshots", "agent"), { recursive: true }); + + const serializableSteps: unknown[] = []; + for (const step of trajectory.steps) { + const probe: ProbeEvidence = { ...step.probeEvidence }; + if (probe.screenshot) { + const relPath = `screenshots/probe/${step.index + 1}.png`; + await fs.writeFile(path.join(dir, relPath), probe.screenshot); + probe.screenshotPath = relPath; + delete probe.screenshot; + } + + const imageModalities = step.agentEvidence.modalities.filter( + (m) => m.type === "image", + ); + const multipleImages = imageModalities.length > 1; + let imageSeq = 0; + const modalities: unknown[] = []; + for (const m of step.agentEvidence.modalities) { + if (m.type !== "image") { + modalities.push(m); + continue; + } + const suffix = multipleImages ? `_${imageSeq}` : ""; + const relPath = `screenshots/agent/${step.index + 1}${suffix}.png`; + await fs.writeFile(path.join(dir, relPath), m.bytes); + modalities.push({ + type: "image", + imagePath: relPath, + mediaType: m.mediaType, + }); + imageSeq += 1; + } + serializableSteps.push({ + ...step, + probeEvidence: probe, + agentEvidence: { modalities }, + }); + } + + // Image modalities carry imagePath instead of raw bytes on disk; cast + // through unknown rather than widen Trajectory's type contract. + const serialized = { + ...trajectory, + steps: serializableSteps, + } as unknown; + + await fs.writeFile( + path.join(dir, "trajectory.json"), + JSON.stringify(serialized, null, 2), + ); + + await fs.writeFile( + path.join(dir, "task_data.json"), + JSON.stringify( + { + task: trajectory.task, + status: trajectory.status, + finalAnswer: trajectory.finalAnswer ?? 
null, + }, + null, + 2, + ), + ); + + await fs.writeFile( + path.join(dir, "times.json"), + JSON.stringify( + { + timing: trajectory.timing, + usage: trajectory.usage, + stepCount: trajectory.steps.length, + }, + null, + 2, + ), + ); + + await fs.mkdir(path.join(dir, "scores"), { recursive: true }); + await fs.writeFile(path.join(dir, "core.log"), coreLog(trajectory)); +} + +function coreLog(trajectory: Trajectory): string { + return ( + trajectory.steps + .map((step) => + JSON.stringify({ + step: step.index, + action: step.actionName, + url: step.probeEvidence.url ?? null, + ok: step.toolOutput.ok, + reasoning: step.reasoning || undefined, + startedAt: step.startedAt, + finishedAt: step.finishedAt, + }), + ) + .join("\n") + "\n" + ); +} diff --git a/packages/evals/framework/trajectoryRecorder.ts b/packages/evals/framework/trajectoryRecorder.ts index 8895a08443..91c7b42987 100644 --- a/packages/evals/framework/trajectoryRecorder.ts +++ b/packages/evals/framework/trajectoryRecorder.ts @@ -1,24 +1,9 @@ -/** - * TrajectoryRecorder — subscribes to v3.bus step events emitted by the agent - * handlers (v3AgentHandler / v3CuaAgentHandler) and assembles a Trajectory - * the verifier can consume. - * - * Lifecycle: - * const recorder = new TrajectoryRecorder({ v3, taskSpec }); - * recorder.start(); - * await agent.execute(...); - * const trajectory = await recorder.finish({ status: "complete", usage }); - * - * Persistence is env-gated by `VERIFIER_PERSIST_TRAJECTORIES`: - * - unset: persistence follows the default (on locally, off in CI). - * - "1" / "true": always persist. - * - "0" / "false": never persist. - * - * On-disk layout is stable JSON + screenshots so saved runs can be re-scored - * without format conversion. 
- */ import fs from "node:fs/promises"; import path from "node:path"; +import { + shouldPersistTrajectory, + writeTrajectoryDir, +} from "@browserbasehq/stagehand"; import type { AgentEvidence, AgentFinalAnswerEvent, @@ -75,18 +60,6 @@ const ZERO_USAGE: TrajectoryUsage = { output_tokens: 0, }; -/** - * Decide whether to persist by default. Honors the explicit override first, - * then env, then falls back to "persist when not in CI". - */ -function shouldPersist(override: boolean | undefined): boolean { - if (override !== undefined) return override; - const env = process.env.VERIFIER_PERSIST_TRAJECTORIES?.toLowerCase(); - if (env === "1" || env === "true") return true; - if (env === "0" || env === "false") return false; - return !process.env.CI; -} - export class TrajectoryRecorder { private readonly v3: V3; private readonly taskSpec: TaskSpec; @@ -94,9 +67,8 @@ export class TrajectoryRecorder { private readonly outputDir: string; private readonly persistEnabled: boolean; - // Per-stepIndex builders; events can arrive out-of-order in theory, though - // the handlers emit step_finished → screenshot_taken → step_observed in the - // same microtask. + // Events can arrive out-of-order across step indices; same-step events all + // fire in one microtask. private readonly partialSteps = new Map>(); private readonly observationByStep = new Map< number, @@ -111,34 +83,30 @@ export class TrajectoryRecorder { private endedAt = ""; private listenersAttached = false; - // Strongly-typed bound handlers so we can attach/detach the same references. + // Bound handlers so attach/detach refer to the same references. private readonly onScreenshot = (e: AgentScreenshotTakenEvent) => { this.screenshotsByStep.set(e.stepIndex, e); const partial = this.ensurePartial(e.stepIndex); - // Default to "probe" when the emit site doesn't tag the role — matches - // v3AgentHandler's post-step screenshot, which is always a tier-2 probe. 
+ // Default to probe when the emit site doesn't tag a role: matches + // v3AgentHandler's post-step screenshot. For CUA the pre-action shot is + // NOT a probe — emitCuaActionStep fills that role post-action. const role = e.evidenceRole ?? "probe"; - // Probe channel (tier 2): the page's state at observation time. For CUA - // the pre-action screenshot is NOT a probe — that role is filled by the - // post-action emit from emitCuaActionStep. So only update probe.screenshot - // when the event explicitly carries the probe role. if (role === "probe" || role === "agent_and_probe") { const probe: ProbeEvidence = { ...(partial.probeEvidence ?? {}) }; probe.screenshot = e.screenshot; probe.url = e.url; partial.probeEvidence = probe; } else if (!partial.probeEvidence?.url) { - // Even for tier-1-only events, the URL is useful probe context if we - // don't have one yet. Doesn't overwrite a later post-action URL. + // Capture URL even for tier-1-only events; a later post-action URL + // can still overwrite it. partial.probeEvidence = { ...(partial.probeEvidence ?? {}), url: e.url, }; } - // Agent channel (tier 1): bytes the model ingested. if (role === "agent" || role === "agent_and_probe") { partial.agentEvidence = mergeAgentEvidence(partial.agentEvidence, { modalities: [ @@ -176,11 +144,10 @@ export class TrajectoryRecorder { this.v3 = opts.v3; this.taskSpec = opts.taskSpec; this.runId = - opts.runId ?? - new Date().toISOString().replace(/[:.]/g, "-").replace("T", "T"); + opts.runId ?? new Date().toISOString().replace(/[:.]/g, "-"); const root = opts.outputRoot ?? path.join(process.cwd(), ".trajectories"); this.outputDir = path.join(root, this.runId, opts.taskSpec.id); - this.persistEnabled = shouldPersist(opts.persist); + this.persistEnabled = shouldPersistTrajectory(opts.persist); } /** Subscribe to bus events. Call once before agent.execute(). 
*/ @@ -213,7 +180,7 @@ export class TrajectoryRecorder { }; if (this.persistEnabled) { - await this.persist(trajectory); + await writeTrajectoryDir(this.outputDir, trajectory); } return trajectory; @@ -289,12 +256,6 @@ export class TrajectoryRecorder { return p; } - /** - * Materialize ordered TrajectoryStep[] from the accumulated partials. - * Steps that never received a step_finished event are skipped (they can - * appear for CUA where only screenshot events fire — those are recorded as - * orphan probe screenshots and elided here). - */ private assembleSteps(): TrajectoryStep[] { const out: TrajectoryStep[] = []; const indices = [...this.partialSteps.keys()].sort((a, b) => a - b); @@ -305,8 +266,8 @@ export class TrajectoryRecorder { p.toolOutput === undefined || p.finishedAt === undefined ) { - // Orphan screenshot-only entry (typically CUA). Skip — we record - // these by writing the screenshot to disk separately during persist(). + // CUA emits screenshot-only entries between actions; skip them here + // and let writeTrajectoryDir record them via the probe channel. continue; } out.push({ @@ -323,119 +284,6 @@ export class TrajectoryRecorder { } return out; } - - /** - * Write the trajectory directory layout. - * - * / - * ├── task_data.json - * ├── trajectory.json (screenshots referenced by path) - * ├── screenshots/ - * │ ├── probe/.png - * │ └── agent/.png - * └── times.json - */ - private async persist(trajectory: Trajectory): Promise { - await fs.mkdir(this.outputDir, { recursive: true }); - - // Walk steps and write screenshots; replace Buffer with path reference in - // the serialized trajectory. Both tiers externalize image bytes under - // screenshots/probe/.png — tier 2, what the harness observed - // screenshots/agent/.png — tier 1, what the model received - // The `_` suffix only appears when a step carries multiple images - // (rare; typically zero or one per step). 
Paths in JSON are relative to - // the trajectory dir so the directory is movable/copyable as a unit. - await fs.mkdir(path.join(this.outputDir, "screenshots", "probe"), { - recursive: true, - }); - await fs.mkdir(path.join(this.outputDir, "screenshots", "agent"), { - recursive: true, - }); - - const serializableSteps: unknown[] = []; - for (const step of trajectory.steps) { - const probe: ProbeEvidence = { ...step.probeEvidence }; - if (probe.screenshot) { - const relPath = `screenshots/probe/${step.index + 1}.png`; - await fs.writeFile( - path.join(this.outputDir, relPath), - probe.screenshot, - ); - probe.screenshotPath = relPath; - delete probe.screenshot; - } - - const imageModalities = step.agentEvidence.modalities.filter( - (m) => m.type === "image", - ); - const multipleImages = imageModalities.length > 1; - let imageSeq = 0; - const modalities: unknown[] = []; - for (const m of step.agentEvidence.modalities) { - if (m.type !== "image") { - modalities.push(m); - continue; - } - const suffix = multipleImages ? `_${imageSeq}` : ""; - const relPath = `screenshots/agent/${step.index + 1}${suffix}.png`; - await fs.writeFile(path.join(this.outputDir, relPath), m.bytes); - modalities.push({ - type: "image", - imagePath: relPath, - mediaType: m.mediaType, - }); - imageSeq += 1; - } - const agentEvidence = { modalities }; - serializableSteps.push({ ...step, probeEvidence: probe, agentEvidence }); - } - - // Image modalities carry imagePath instead of raw bytes on disk, so this - // is no longer a strict Trajectory at the type level. Cast through - // unknown rather than widening the type contract. - const serialized = { - ...trajectory, - steps: serializableSteps, - } as unknown; - - await fs.writeFile( - path.join(this.outputDir, "trajectory.json"), - JSON.stringify(serialized, null, 2), - ); - - // task_data.json stores TaskSpec + (later) result. 
- await fs.writeFile( - path.join(this.outputDir, "task_data.json"), - JSON.stringify( - { - task: trajectory.task, - status: trajectory.status, - finalAnswer: trajectory.finalAnswer ?? null, - }, - null, - 2, - ), - ); - - await fs.writeFile( - path.join(this.outputDir, "times.json"), - JSON.stringify( - { - timing: trajectory.timing, - usage: trajectory.usage, - stepCount: trajectory.steps.length, - }, - null, - 2, - ), - ); - - await fs.mkdir(path.join(this.outputDir, "scores"), { recursive: true }); - await fs.writeFile( - path.join(this.outputDir, "core.log"), - coreLog(trajectory), - ); - } } function mergeAgentEvidence( @@ -446,11 +294,6 @@ function mergeAgentEvidence( }; } -/** - * Build a tier-1 AgentEvidence from a step_finished event. The handler's - * toolOutput.result is what the LLM consumed next turn (modulo SDK - * serialization). - */ function buildAgentEvidence(e: AgentStepFinishedEvent): AgentEvidence { const modalities: AgentEvidence["modalities"] = []; if (e.reasoning) { @@ -469,7 +312,8 @@ function buildAgentEvidence(e: AgentStepFinishedEvent): AgentEvidence { mediaType: "image/png", }); } else if (typeof result === "object") { - // Tool results commonly include a screenshotBase64 field for vision tools. + // Vision tools embed a screenshotBase64 alongside the JSON result; lift + // it to its own image modality so the verifier sees both. const r = result as { screenshotBase64?: string } & Record; if (typeof r.screenshotBase64 === "string") { try { @@ -479,28 +323,10 @@ function buildAgentEvidence(e: AgentStepFinishedEvent): AgentEvidence { mediaType: "image/png", }); } catch { - // ignore + // Malformed base64; skip the image and keep the JSON modality. } } modalities.push({ type: "json", content: result }); } return { modalities }; } - -function coreLog(trajectory: Trajectory): string { - return ( - trajectory.steps - .map((step) => - JSON.stringify({ - step: step.index, - action: step.actionName, - url: step.probeEvidence.url ?? 
null, - ok: step.toolOutput.ok, - reasoning: step.reasoning || undefined, - startedAt: step.startedAt, - finishedAt: step.finishedAt, - }), - ) - .join("\n") + "\n" - ); -} From 10b03ca7385f00b98c940a65bfdc68e8823d9531 Mon Sep 17 00:00:00 2001 From: miguel Date: Mon, 18 May 2026 17:45:32 -0700 Subject: [PATCH 08/27] =?UTF-8?q?style(recorder):=20prettier=20=E2=80=94?= =?UTF-8?q?=20collapse=20runId=20fallback=20onto=20one=20line?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/evals/framework/trajectoryRecorder.ts | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/packages/evals/framework/trajectoryRecorder.ts b/packages/evals/framework/trajectoryRecorder.ts index 91c7b42987..3cf1c17621 100644 --- a/packages/evals/framework/trajectoryRecorder.ts +++ b/packages/evals/framework/trajectoryRecorder.ts @@ -143,8 +143,7 @@ export class TrajectoryRecorder { constructor(opts: TrajectoryRecorderOptions) { this.v3 = opts.v3; this.taskSpec = opts.taskSpec; - this.runId = - opts.runId ?? new Date().toISOString().replace(/[:.]/g, "-"); + this.runId = opts.runId ?? new Date().toISOString().replace(/[:.]/g, "-"); const root = opts.outputRoot ?? path.join(process.cwd(), ".trajectories"); this.outputDir = path.join(root, this.runId, opts.taskSpec.id); this.persistEnabled = shouldPersistTrajectory(opts.persist); From 6caeb1bac5737ed640dac1833b813f9afb606fbf Mon Sep 17 00:00:00 2001 From: miguel Date: Mon, 18 May 2026 17:49:40 -0700 Subject: [PATCH 09/27] fix(verifier): guard bus.listenerCount and align export-surface snapshot MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - v3AgentHandler / v3CuaAgentHandler use optional-chained listenerCount so test mocks without one (captcha-hooks, temperature) don't blow up. 
- Add bus stub to the agent-temperature createV3() mock so bus.emit doesn't NPE on the new agent_step_finished_event emit. - Add BUS_EVENTS, shouldPersistTrajectory, writeTrajectoryDir to the export-surface snapshot — these are intentional new public exports. Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/core/lib/v3/handlers/v3AgentHandler.ts | 4 ++-- packages/core/lib/v3/handlers/v3CuaAgentHandler.ts | 4 ++-- packages/core/tests/unit/agent-temperature.test.ts | 6 ++++++ packages/core/tests/unit/public-api/export-surface.test.ts | 3 +++ 4 files changed, 13 insertions(+), 4 deletions(-) diff --git a/packages/core/lib/v3/handlers/v3AgentHandler.ts b/packages/core/lib/v3/handlers/v3AgentHandler.ts index afddddef22..fc2761d902 100644 --- a/packages/core/lib/v3/handlers/v3AgentHandler.ts +++ b/packages/core/lib/v3/handlers/v3AgentHandler.ts @@ -351,9 +351,9 @@ export class V3AgentHandler { // is more faithful than dropping probe evidence for all but the last // tool call, while still avoiding per-tool screenshot overhead. const wantsScreenshotProbe = - this.v3.bus.listenerCount("agent_screenshot_taken_event") > 0; + this.v3.bus.listenerCount?.("agent_screenshot_taken_event") > 0; const wantsStepObservation = - this.v3.bus.listenerCount("agent_step_observed_event") > 0; + this.v3.bus.listenerCount?.("agent_step_observed_event") > 0; if ( stepIndicesInTurn.length > 0 && (wantsScreenshotProbe || wantsStepObservation) diff --git a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts index 2fd08b8647..bc1d6d5fd4 100644 --- a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts +++ b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts @@ -883,9 +883,9 @@ export class V3CuaAgentHandler { // Listener-gated to keep ordinary agent runs free of the extra // screenshot cost — mirrors v3AgentHandler's post-step probe. 
const wantsScreenshotProbe = - this.v3.bus.listenerCount("agent_screenshot_taken_event") > 0; + this.v3.bus.listenerCount?.("agent_screenshot_taken_event") > 0; const wantsStepObservation = - this.v3.bus.listenerCount("agent_step_observed_event") > 0; + this.v3.bus.listenerCount?.("agent_step_observed_event") > 0; let probeUrl = pageUrl; if (wantsScreenshotProbe || wantsStepObservation) { try { diff --git a/packages/core/tests/unit/agent-temperature.test.ts b/packages/core/tests/unit/agent-temperature.test.ts index 45184a9888..8f12b4a7e6 100644 --- a/packages/core/tests/unit/agent-temperature.test.ts +++ b/packages/core/tests/unit/agent-temperature.test.ts @@ -125,6 +125,12 @@ function createV3() { context: { awaitActivePage: vi.fn(async () => page), }, + bus: { + emit: vi.fn(), + on: vi.fn(), + off: vi.fn(), + listenerCount: vi.fn(() => 0), + }, isCaptchaAutoSolveEnabled: false, browserbaseApiKey: undefined, logger: vi.fn(), diff --git a/packages/core/tests/unit/public-api/export-surface.test.ts b/packages/core/tests/unit/public-api/export-surface.test.ts index e73cde4178..fe4003f138 100644 --- a/packages/core/tests/unit/public-api/export-surface.test.ts +++ b/packages/core/tests/unit/public-api/export-surface.test.ts @@ -21,6 +21,7 @@ const publicApiShape = { AISdkClient: Stagehand.AISdkClient, Api: Stagehand.Api, AVAILABLE_CUA_MODELS: Stagehand.AVAILABLE_CUA_MODELS, + BUS_EVENTS: Stagehand.BUS_EVENTS, AgentProvider: Stagehand.AgentProvider, AnnotatedScreenshotText: Stagehand.AnnotatedScreenshotText, ConsoleMessage: Stagehand.ConsoleMessage, @@ -50,12 +51,14 @@ const publicApiShape = { normalizeRubric: Stagehand.normalizeRubric, pageTextSchema: Stagehand.pageTextSchema, providerEnvVarMap: Stagehand.providerEnvVarMap, + shouldPersistTrajectory: Stagehand.shouldPersistTrajectory, toGeminiSchema: Stagehand.toGeminiSchema, toJsonSchema: Stagehand.toJsonSchema, tool: Stagehand.tool, transformSchema: Stagehand.transformSchema, trimTrailingTextNode: 
Stagehand.trimTrailingTextNode, validateZodSchema: Stagehand.validateZodSchema, + writeTrajectoryDir: Stagehand.writeTrajectoryDir, ...publicErrorTypes, } as const; From 16669e80d7805e0b91649782ee0b820693686f10 Mon Sep 17 00:00:00 2001 From: miguel Date: Thu, 21 May 2026 10:30:45 -0700 Subject: [PATCH 10/27] refactor(verifier): collect evidence via agent callbacks --- .changeset/verifier-trajectory-events.md | 2 +- .../core/lib/v3/handlers/v3AgentHandler.ts | 96 +++++++++--------- .../core/lib/v3/handlers/v3CuaAgentHandler.ts | 88 +++++++++-------- packages/core/lib/v3/types/public/agent.ts | 6 ++ .../v3/types/public/agentEvidenceEvents.ts | 84 ++++++++++++++++ .../core/lib/v3/types/public/busEvents.ts | 99 ------------------- packages/core/lib/v3/types/public/index.ts | 2 +- .../unit/public-api/export-surface.test.ts | 1 - .../evals/framework/trajectoryRecorder.ts | 70 ++++++------- .../framework/trajectoryRecorder.test.ts | 34 +++---- 10 files changed, 238 insertions(+), 244 deletions(-) create mode 100644 packages/core/lib/v3/types/public/agentEvidenceEvents.ts delete mode 100644 packages/core/lib/v3/types/public/busEvents.ts diff --git a/.changeset/verifier-trajectory-events.md b/.changeset/verifier-trajectory-events.md index 9dcb5c8192..4a4ee2e32e 100644 --- a/.changeset/verifier-trajectory-events.md +++ b/.changeset/verifier-trajectory-events.md @@ -2,4 +2,4 @@ "@browserbasehq/stagehand": patch --- -Capture verifier trajectory evidence from v3 agent events for offline scoring. +Capture verifier trajectory evidence from agent evidence callbacks for offline scoring. 
diff --git a/packages/core/lib/v3/handlers/v3AgentHandler.ts b/packages/core/lib/v3/handlers/v3AgentHandler.ts index fc2761d902..965c30eded 100644 --- a/packages/core/lib/v3/handlers/v3AgentHandler.ts +++ b/packages/core/lib/v3/handlers/v3AgentHandler.ts @@ -31,6 +31,7 @@ import { AgentModelConfig, Variables, } from "../types/public/agent.js"; +import type { AgentEvidenceCallback } from "../types/public/agentEvidenceEvents.js"; import { HYBRID_CAPABLE_MODEL_PATTERNS } from "../types/private/agent.js"; import { V3FunctionName } from "../types/public/methods.js"; import { mapToolResultToActions } from "../agent/utils/actionMapping.js"; @@ -248,10 +249,11 @@ export class V3AgentHandler { userCallback?: | GenerateTextOnStepFinishCallback | StreamTextOnStepFinishCallback, + evidenceCallback?: AgentEvidenceCallback, ) { // Monotonic step counter scoped to this execute() call. Each tool call in // the agent loop becomes one trajectory step. The counter feeds stepIndex - // on the bus events the TrajectoryRecorder subscribes to. + // on evidence callback events. let stepCounter = 0; return async (event: StepResult) => { this.logger({ @@ -310,10 +312,6 @@ export class V3AgentHandler { state.actions.push(action); } - // Emit step_finished_event per tool call. The TrajectoryRecorder - // builds one Trajectory.Step per emission. tier-1 evidence (the - // bytes the LLM consumed) is captured separately via an - // onStepFinish wrapper in the harness. 
const stepIndex = stepCounter++; stepIndicesInTurn.push(stepIndex); const toolOk = @@ -321,7 +319,8 @@ export class V3AgentHandler { (typeof toolResult === "object" && !("error" in toolResult) && !("isError" in toolResult && toolResult.isError)); - this.v3.bus.emit("agent_step_finished_event", { + await evidenceCallback?.({ + type: "step_finished", stepIndex, actionName: toolCall.toolName, actionArgs: @@ -350,47 +349,17 @@ export class V3AgentHandler { // reflects the settled page state after the batch of tool calls; this // is more faithful than dropping probe evidence for all but the last // tool call, while still avoiding per-tool screenshot overhead. - const wantsScreenshotProbe = - this.v3.bus.listenerCount?.("agent_screenshot_taken_event") > 0; - const wantsStepObservation = - this.v3.bus.listenerCount?.("agent_step_observed_event") > 0; - if ( - stepIndicesInTurn.length > 0 && - (wantsScreenshotProbe || wantsStepObservation) - ) { + const wantsEvidence = evidenceCallback !== undefined; + if (stepIndicesInTurn.length > 0 && wantsEvidence) { + let screenshot: Buffer | undefined; + let ariaTree: string | undefined; try { const page = await this.v3.context.awaitActivePage(); - let screenshot: Buffer | undefined; - if (wantsScreenshotProbe) { - screenshot = await page.screenshot({ fullPage: false }); - } - let ariaTree: string | undefined; - if (wantsStepObservation) { - // Capture the a11y tree alongside the URL probe so the verifier - // can ground textual claims (prices, names, dates) without OCR. - // Best-effort: returns undefined on failure/timeout. - ariaTree = await captureAriaTreeProbe(this.v3); - } - for (const stepIndex of stepIndicesInTurn) { - if (screenshot) { - // DOM/hybrid: this post-step screenshot is a harness probe - // only. The agent's tier-1 evidence is the tool's return value - // captured separately in agent_step_finished_event. 
- this.v3.bus.emit("agent_screenshot_taken_event", { - stepIndex, - screenshot, - url: state.currentPageUrl, - evidenceRole: "probe", - }); - } - if (wantsStepObservation) { - this.v3.bus.emit("agent_step_observed_event", { - stepIndex, - url: state.currentPageUrl, - ariaTree, - }); - } - } + screenshot = await page.screenshot({ fullPage: false }); + // Capture the a11y tree alongside the URL probe so the verifier + // can ground textual claims (prices, names, dates) without OCR. + // Best-effort: returns undefined on failure/timeout. + ariaTree = await captureAriaTreeProbe(this.v3); } catch (e) { this.logger({ category: "agent", @@ -398,11 +367,34 @@ export class V3AgentHandler { level: 1, }); } + for (const stepIndex of stepIndicesInTurn) { + // DOM/hybrid: this post-step screenshot is a harness probe + // only. The agent's tier-1 evidence is the tool's return value + // captured separately in step_finished. + if (screenshot) { + await evidenceCallback?.({ + type: "screenshot", + stepIndex, + screenshot, + url: state.currentPageUrl, + evidenceRole: "probe", + }); + } + await evidenceCallback?.({ + type: "step_observed", + stepIndex, + url: state.currentPageUrl, + ariaTree, + }); + } } } if (lastFinalAnswer) { - this.v3.bus.emit("agent_final_answer_event", lastFinalAnswer); + await evidenceCallback?.({ + type: "final_answer", + ...lastFinalAnswer, + }); } if (userCallback) { @@ -488,7 +480,11 @@ export class V3AgentHandler { callbacks?.prepareStep, captchaSolver, ), - onStepFinish: this.createStepHandler(state, callbacks?.onStepFinish), + onStepFinish: this.createStepHandler( + state, + callbacks?.onStepFinish, + callbacks?.onEvidence, + ), abortSignal: preparedOptions.signal, providerOptions: { google: { mediaResolution: "MEDIA_RESOLUTION_HIGH" }, @@ -624,7 +620,11 @@ export class V3AgentHandler { callbacks?.prepareStep, captchaSolver, ), - onStepFinish: this.createStepHandler(state, callbacks?.onStepFinish), + onStepFinish: this.createStepHandler( + state, + 
callbacks?.onStepFinish, + callbacks?.onEvidence, + ), onError: (event) => { captchaSolver?.dispose(); if (callbacks?.onError) { diff --git a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts index bc1d6d5fd4..31f0a649c8 100644 --- a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts +++ b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts @@ -17,7 +17,10 @@ import { SafetyConfirmationHandler, } from "../types/public/agent.js"; import { LogLine } from "../types/public/logs.js"; -import type { AgentScreenshotTakenEvent } from "../types/public/busEvents.js"; +import type { + AgentEvidenceCallback, + AgentScreenshotEvidenceEvent, +} from "../types/public/agentEvidenceEvents.js"; import { type Action, V3FunctionName } from "../types/public/methods.js"; import { FlowLogger } from "../flowlogger/FlowLogger.js"; import { toTitleCase } from "../../utils.js"; @@ -39,13 +42,14 @@ export class V3CuaAgentHandler { private captchaSolver: CaptchaSolver | null = null; private captchaClickGuardRemaining = 0; private currentInstruction = ""; - // Monotonic step counter used by bus events. The CUA loop is internal to + // Monotonic step counter used by evidence callbacks. The CUA loop is internal to // the agent client, so unlike v3AgentHandler we don't have per-tool-call // step events; instead we tag every screenshot emission with an // incrementing index. private cuaStepCounter = 0; - private latestCuaScreenshot?: AgentScreenshotTakenEvent; + private latestCuaScreenshot?: AgentScreenshotEvidenceEvent; private latestCuaScreenshotConsumed = true; + private evidenceCallback?: AgentEvidenceCallback; constructor( v3: V3, @@ -86,15 +90,7 @@ export class V3CuaAgentHandler { const page = await this.v3.context.awaitActivePage(); const screenshotBuffer = await page.screenshot({ fullPage: false }); - // Emit bus event so TrajectoryRecorder can capture the screenshot. 
In - // CUA mode this is the same buffer the provider receives — i.e., it - // serves both as tier-1 evidence (what the model saw) and as a tier-2 - // probe. - try { - this.emitCuaScreenshot(screenshotBuffer, page.url()); - } catch { - // bus emit errors are non-fatal - } + await this.emitCuaScreenshot(screenshotBuffer, page.url()); return screenshotBuffer.toString("base64"); // base64 png }); @@ -208,6 +204,10 @@ export class V3CuaAgentHandler { : optionsOrInstruction; this.setSafetyConfirmationHandler(options.callbacks?.onSafetyConfirmation); + this.evidenceCallback = options.callbacks?.onEvidence; + this.cuaStepCounter = 0; + this.latestCuaScreenshot = undefined; + this.latestCuaScreenshotConsumed = true; this.highlightCursor = options.highlightCursor !== false; this.currentInstruction = options.instruction; @@ -263,7 +263,13 @@ export class V3CuaAgentHandler { let result: AgentResult; try { result = await this.agent.execute({ options, logger: this.logger }); + await this.evidenceCallback?.({ + type: "final_answer", + message: result.message, + output: result.output, + }); } finally { + this.evidenceCallback = undefined; this.captchaSolver?.dispose(); this.captchaSolver = null; } @@ -683,13 +689,8 @@ export class V3CuaAgentHandler { const currentUrl = page.url(); - // Mirror the screenshot to the bus — same buffer the CUA client - // received, so it serves as both tier-1 evidence and tier-2 probe. - try { - this.emitCuaScreenshot(screenshotBuffer, currentUrl); - } catch { - // non-fatal - } + // Mirror the same buffer the CUA client receives as agent evidence. + await this.emitCuaScreenshot(screenshotBuffer, currentUrl); return await this.agentClient.captureScreenshot({ base64Image: screenshotBuffer.toString("base64"), @@ -807,11 +808,12 @@ export class V3CuaAgentHandler { * can compare what the model saw against what the page actually showed * once the keystrokes/clicks landed. 
*/ - private emitCuaScreenshot( + private async emitCuaScreenshot( screenshot: Buffer, url: string, - ): AgentScreenshotTakenEvent { - const event: AgentScreenshotTakenEvent = { + ): Promise<AgentScreenshotEvidenceEvent> { + const event: AgentScreenshotEvidenceEvent = { + type: "screenshot", stepIndex: this.cuaStepCounter++, screenshot, url, @@ -819,7 +821,7 @@ export class V3CuaAgentHandler { }; this.latestCuaScreenshot = event; this.latestCuaScreenshotConsumed = false; - this.v3.bus.emit("agent_screenshot_taken_event", event); + await this.evidenceCallback?.(event); return event; } @@ -843,7 +845,7 @@ export class V3CuaAgentHandler { this.latestCuaScreenshotConsumed = true; } else if (this.latestCuaScreenshot) { stepIndex = this.cuaStepCounter++; - this.v3.bus.emit("agent_screenshot_taken_event", { + await this.evidenceCallback?.({ ...this.latestCuaScreenshot, stepIndex, }); @@ -861,7 +863,8 @@ export class V3CuaAgentHandler { ? action.action : ""; - this.v3.bus.emit("agent_step_finished_event", { + await this.evidenceCallback?.({ + type: "step_finished", stepIndex, actionName: String(action.type), actionArgs, @@ -880,26 +883,16 @@ export class V3CuaAgentHandler { // verifier has no visual evidence that keystrokes/clicks landed, and // has to trust the action history alone. // - // Listener-gated to keep ordinary agent runs free of the extra + // Callback-gated to keep ordinary agent runs free of the extra // screenshot cost — mirrors v3AgentHandler's post-step probe.
- const wantsScreenshotProbe = - this.v3.bus.listenerCount?.("agent_screenshot_taken_event") > 0; - const wantsStepObservation = - this.v3.bus.listenerCount?.("agent_step_observed_event") > 0; + const wantsEvidence = this.evidenceCallback !== undefined; let probeUrl = pageUrl; - if (wantsScreenshotProbe || wantsStepObservation) { + let probeScreenshot: Buffer | undefined; + if (wantsEvidence) { try { const page = await this.v3.context.awaitActivePage(); probeUrl = page.url(); - if (wantsScreenshotProbe) { - const probeScreenshot = await page.screenshot({ fullPage: false }); - this.v3.bus.emit("agent_screenshot_taken_event", { - stepIndex, - screenshot: probeScreenshot, - url: probeUrl, - evidenceRole: "probe", - }); - } + probeScreenshot = await page.screenshot({ fullPage: false }); } catch (e) { this.logger({ category: "agent", @@ -911,11 +904,22 @@ export class V3CuaAgentHandler { } } - if (probeUrl && wantsStepObservation) { + if (probeScreenshot) { + await this.evidenceCallback?.({ + type: "screenshot", + stepIndex, + screenshot: probeScreenshot, + url: probeUrl, + evidenceRole: "probe", + }); + } + + if (probeUrl && wantsEvidence) { // Capture the a11y tree alongside the URL probe so the verifier can // ground textual claims without OCR. Best-effort. 
const ariaTree = await captureAriaTreeProbe(this.v3); - this.v3.bus.emit("agent_step_observed_event", { + await this.evidenceCallback?.({ + type: "step_observed", stepIndex, url: probeUrl, ariaTree, diff --git a/packages/core/lib/v3/types/public/agent.ts b/packages/core/lib/v3/types/public/agent.ts index 830fb1c966..3e958fc332 100644 --- a/packages/core/lib/v3/types/public/agent.ts +++ b/packages/core/lib/v3/types/public/agent.ts @@ -15,6 +15,7 @@ import { import { LogLine } from "./logs.js"; import { ClientOptions } from "./model.js"; import { StagehandZodObject } from "../../zodCompat.js"; +import type { AgentEvidenceCallback } from "./agentEvidenceEvents.js"; // Re-export ModelMessage for consumers who want to use it for conversation continuation export type { ModelMessage } from "ai"; @@ -136,6 +137,11 @@ export interface AgentCallbacks { onStepFinish?: | GenerateTextOnStepFinishCallback | StreamTextOnStepFinishCallback; + /** + * Callback called when Stagehand captures agent-run evidence such as + * screenshots, completed tool/action steps, or post-action observations. + */ + onEvidence?: AgentEvidenceCallback; } /** diff --git a/packages/core/lib/v3/types/public/agentEvidenceEvents.ts b/packages/core/lib/v3/types/public/agentEvidenceEvents.ts new file mode 100644 index 0000000000..b31f493145 --- /dev/null +++ b/packages/core/lib/v3/types/public/agentEvidenceEvents.ts @@ -0,0 +1,84 @@ +/** + * Evidence events emitted through AgentExecuteOptions.callbacks.onEvidence. + * + * These events describe observations made by Stagehand during an agent run. + * They are intentionally transport-level callback payloads; verifier-specific + * storage and normalization live in the evals/verifier layers. 
+ */ + +export type AgentEvidenceRole = "probe" | "agent" | "agent_and_probe"; + +export type AgentEvidenceEvent = + | AgentScreenshotEvidenceEvent + | AgentStepFinishedEvent + | AgentStepObservedEvent + | AgentFinalAnswerEvent; + +/** + * Screenshot captured during an agent run. + * + * In DOM/hybrid mode, post-tool screenshots are probe evidence. In CUA mode, + * screenshots captured by the screenshot provider are agent evidence because + * they are the exact bytes sent to the provider. + */ +export interface AgentScreenshotEvidenceEvent { + type: "screenshot"; + /** Zero-based index of the step this screenshot corresponds to. */ + stepIndex: number; + /** PNG bytes from page.screenshot(). */ + screenshot: Buffer; + /** Page URL at the time of capture. */ + url: string; + /** Role this screenshot plays in downstream evidence collection. */ + evidenceRole?: AgentEvidenceRole; +} + +/** + * One completed agent tool/action step. + */ +export interface AgentStepFinishedEvent { + type: "step_finished"; + stepIndex: number; + /** Name of the tool/action that ran, e.g. "act", "extract", "click". */ + actionName: string; + /** Arguments passed to the tool/action. */ + actionArgs: Record; + /** Agent textual reasoning for the step, when available. */ + reasoning: string; + /** Outcome of the tool/action as seen by Stagehand. */ + toolOutput: { + ok: boolean; + /** Native return value from the tool/action. */ + result: unknown; + error?: string; + }; + /** ISO 8601 timestamp at which the step finished. */ + finishedAt: string; +} + +/** + * Independent post-step browser observation. + */ +export interface AgentStepObservedEvent { + type: "step_observed"; + stepIndex: number; + /** Page URL after the step's tool/action execution. */ + url: string; + /** Accessibility tree snapshot, when captured. */ + ariaTree?: string; + /** Viewport scroll context, when captured. 
*/ + scroll?: { top: number; pageHeight: number }; +} + +/** Final answer emitted by the agent, when available. */ +export interface AgentFinalAnswerEvent { + type: "final_answer"; + /** The agent's final summary message. */ + message: string; + /** Optional structured output if the agent's output schema was set. */ + output?: Record<string, unknown>; +} + +export type AgentEvidenceCallback = ( + event: AgentEvidenceEvent, +) => PromiseLike<void> | void; diff --git a/packages/core/lib/v3/types/public/busEvents.ts b/packages/core/lib/v3/types/public/busEvents.ts deleted file mode 100644 index e2fa119499..0000000000 --- a/packages/core/lib/v3/types/public/busEvents.ts +++ /dev/null @@ -1,99 +0,0 @@ -/** - * Bus event payloads emitted by V3 on `v3.bus`. - * - * The bus is an EventEmitter; these types document the payload shape per - * event name so consumers (TrajectoryRecorder in packages/evals, custom - * subscribers) can type their handlers. - * - * The verifier recorder consumes these events to assemble persisted - * trajectories without coupling to individual agent handlers. - */ - -/** - * Names of bus events the agent handlers emit. Use these constants to - * subscribe; the bus accepts arbitrary strings, but a centralized list helps - * catch typos at the call site. - */ -export const BUS_EVENTS = { - AGENT_SCREENSHOT_TAKEN: "agent_screenshot_taken_event", - AGENT_STEP_FINISHED: "agent_step_finished_event", - AGENT_STEP_OBSERVED: "agent_step_observed_event", - AGENT_FINAL_ANSWER: "agent_final_answer_event", -} as const; - -export type BusEventName = (typeof BUS_EVENTS)[keyof typeof BUS_EVENTS]; - -/** - * Payload for `agent_screenshot_taken_event`. The raw screenshot Buffer the - * harness took after a step's tool execution. - * - * Note: in CUA mode the same Buffer is also what the provider received; in - * DOM/hybrid mode it's an independent harness probe. The verifier treats them - * as different evidence tiers regardless.
- */ -export interface AgentScreenshotTakenEvent { - /** Zero-based index of the step this screenshot corresponds to. */ - stepIndex: number; - /** PNG bytes from page.screenshot(). */ - screenshot: Buffer; - /** Page URL at the time of capture. */ - url: string; - /** - * Evidence role for this screenshot. - * - * DOM/hybrid post-tool screenshots are probe-only. CUA screenshots are also - * the exact image bytes sent to the provider, so they serve both as tier-1 - * agent evidence and tier-2 probe evidence. - */ - evidenceRole?: "probe" | "agent" | "agent_and_probe"; -} - -/** - * Payload for `agent_step_finished_event`. Emitted once per tool call within - * a step result. Carries the tool's reported outcome and a reference to the - * agent's textual reasoning for the step. - * - * Tier 1 evidence (the bytes the LLM consumed as the tool result) is captured - * separately by the harness via an AgentExecuteCallbacks.onStepFinish wrapper - * and is not part of this payload. - */ -export interface AgentStepFinishedEvent { - stepIndex: number; - /** Name of the tool that ran (e.g., "act", "extract", "click"). */ - actionName: string; - /** Arguments passed to the tool. */ - actionArgs: Record; - /** Agent's textual reasoning (event.text on the AI SDK StepResult). */ - reasoning: string; - /** Outcome of the tool execution as seen by the harness. */ - toolOutput: { - ok: boolean; - /** The tool's native return value. */ - result: unknown; - error?: string; - }; - /** ISO 8601 timestamp at which the step finished. */ - finishedAt: string; -} - -/** - * Payload for `agent_step_observed_event`. Emitted after the harness probe - * completes for a step. - */ -export interface AgentStepObservedEvent { - stepIndex: number; - /** Page URL after the step's tool execution. */ - url: string; - /** v1 — accessibility tree snapshot. */ - ariaTree?: string; - /** v1 — viewport scroll context. 
*/ - scroll?: { top: number; pageHeight: number }; -} - -/** Payload for `agent_final_answer_event`. Emitted when the `done` tool resolves. */ -export interface AgentFinalAnswerEvent { - /** The agent's final summary message. */ - message: string; - /** Optional structured output if the agent's `output` schema was set. */ - output?: Record; -} diff --git a/packages/core/lib/v3/types/public/index.ts b/packages/core/lib/v3/types/public/index.ts index 9bf24eb271..4fe0fb8a48 100644 --- a/packages/core/lib/v3/types/public/index.ts +++ b/packages/core/lib/v3/types/public/index.ts @@ -1,5 +1,5 @@ export * from "./agent.js"; -export * from "./busEvents.js"; +export * from "./agentEvidenceEvents.js"; // Export api.ts under namespace to avoid conflicts with methods.ts types export * as Api from "./api.js"; // Also export BrowserbaseRegion directly for convenience diff --git a/packages/core/tests/unit/public-api/export-surface.test.ts b/packages/core/tests/unit/public-api/export-surface.test.ts index fe4003f138..7a1f1f65dc 100644 --- a/packages/core/tests/unit/public-api/export-surface.test.ts +++ b/packages/core/tests/unit/public-api/export-surface.test.ts @@ -21,7 +21,6 @@ const publicApiShape = { AISdkClient: Stagehand.AISdkClient, Api: Stagehand.Api, AVAILABLE_CUA_MODELS: Stagehand.AVAILABLE_CUA_MODELS, - BUS_EVENTS: Stagehand.BUS_EVENTS, AgentProvider: Stagehand.AgentProvider, AnnotatedScreenshotText: Stagehand.AnnotatedScreenshotText, ConsoleMessage: Stagehand.ConsoleMessage, diff --git a/packages/evals/framework/trajectoryRecorder.ts b/packages/evals/framework/trajectoryRecorder.ts index 3cf1c17621..2b136fe0b5 100644 --- a/packages/evals/framework/trajectoryRecorder.ts +++ b/packages/evals/framework/trajectoryRecorder.ts @@ -6,8 +6,9 @@ import { } from "@browserbasehq/stagehand"; import type { AgentEvidence, + AgentEvidenceEvent, AgentFinalAnswerEvent, - AgentScreenshotTakenEvent, + AgentScreenshotEvidenceEvent, AgentStepFinishedEvent, AgentStepObservedEvent, 
ProbeEvidence, @@ -17,7 +18,6 @@ import type { TrajectoryStep, TrajectoryUsage, EvaluationResult, - V3, } from "@browserbasehq/stagehand"; interface PartialStep { @@ -32,7 +32,6 @@ interface PartialStep { } export interface TrajectoryRecorderOptions { - v3: V3; taskSpec: TaskSpec; /** * Root directory under which trajectory dirs are written. Each task run @@ -61,7 +60,6 @@ const ZERO_USAGE: TrajectoryUsage = { }; export class TrajectoryRecorder { - private readonly v3: V3; private readonly taskSpec: TaskSpec; private readonly runId: string; private readonly outputDir: string; @@ -76,15 +74,13 @@ export class TrajectoryRecorder { >(); private readonly screenshotsByStep = new Map< number, - AgentScreenshotTakenEvent + AgentScreenshotEvidenceEvent >(); private finalAnswerEvent?: AgentFinalAnswerEvent; private startedAt = ""; private endedAt = ""; - private listenersAttached = false; - // Bound handlers so attach/detach refer to the same references. - private readonly onScreenshot = (e: AgentScreenshotTakenEvent) => { + private onScreenshot(e: AgentScreenshotEvidenceEvent): void { this.screenshotsByStep.set(e.stepIndex, e); const partial = this.ensurePartial(e.stepIndex); @@ -114,8 +110,9 @@ export class TrajectoryRecorder { ], }); } - }; - private readonly onStepFinished = (e: AgentStepFinishedEvent) => { + } + + private onStepFinished(e: AgentStepFinishedEvent): void { const partial = this.ensurePartial(e.stepIndex); partial.actionName = e.actionName; partial.actionArgs = e.actionArgs; @@ -126,8 +123,9 @@ export class TrajectoryRecorder { partial.agentEvidence, buildAgentEvidence(e), ); - }; - private readonly onStepObserved = (e: AgentStepObservedEvent) => { + } + + private onStepObserved(e: AgentStepObservedEvent): void { this.observationByStep.set(e.stepIndex, e); const partial = this.ensurePartial(e.stepIndex); const probe: ProbeEvidence = { ...(partial.probeEvidence ?? 
{}) }; @@ -135,13 +133,13 @@ export class TrajectoryRecorder { if (e.ariaTree !== undefined) probe.ariaTree = e.ariaTree; if (e.scroll !== undefined) probe.scroll = e.scroll; partial.probeEvidence = probe; - }; - private readonly onFinalAnswer = (e: AgentFinalAnswerEvent) => { + } + + private onFinalAnswer(e: AgentFinalAnswerEvent): void { this.finalAnswerEvent = e; - }; + } constructor(opts: TrajectoryRecorderOptions) { - this.v3 = opts.v3; this.taskSpec = opts.taskSpec; this.runId = opts.runId ?? new Date().toISOString().replace(/[:.]/g, "-"); const root = opts.outputRoot ?? path.join(process.cwd(), ".trajectories"); @@ -149,15 +147,29 @@ export class TrajectoryRecorder { this.persistEnabled = shouldPersistTrajectory(opts.persist); } - /** Subscribe to bus events. Call once before agent.execute(). */ + /** Mark the beginning of collection. Call once before agent.execute(). */ start(): void { - if (this.listenersAttached) return; + if (this.startedAt) return; this.startedAt = new Date().toISOString(); - this.v3.bus.on("agent_screenshot_taken_event", this.onScreenshot); - this.v3.bus.on("agent_step_finished_event", this.onStepFinished); - this.v3.bus.on("agent_step_observed_event", this.onStepObserved); - this.v3.bus.on("agent_final_answer_event", this.onFinalAnswer); - this.listenersAttached = true; + } + + /** Ingest an evidence callback event from agent.execute(). */ + record(event: AgentEvidenceEvent): void { + if (!this.startedAt) this.start(); + switch (event.type) { + case "screenshot": + this.onScreenshot(event); + break; + case "step_finished": + this.onStepFinished(event); + break; + case "step_observed": + this.onStepObserved(event); + break; + case "final_answer": + this.onFinalAnswer(event); + break; + } } /** @@ -165,7 +177,7 @@ export class TrajectoryRecorder { * write the on-disk layout. Idempotent. 
*/ async finish(opts: TrajectoryFinishOptions): Promise { - this.detach(); + if (!this.startedAt) this.start(); this.endedAt = new Date().toISOString(); const steps = this.assembleSteps(); @@ -187,7 +199,6 @@ export class TrajectoryRecorder { /** Throw away in-memory state without writing to disk. Used on early abort. */ cancel(): void { - this.detach(); this.partialSteps.clear(); this.observationByStep.clear(); this.screenshotsByStep.clear(); @@ -237,15 +248,6 @@ export class TrajectoryRecorder { ); } - private detach(): void { - if (!this.listenersAttached) return; - this.v3.bus.off("agent_screenshot_taken_event", this.onScreenshot); - this.v3.bus.off("agent_step_finished_event", this.onStepFinished); - this.v3.bus.off("agent_step_observed_event", this.onStepObserved); - this.v3.bus.off("agent_final_answer_event", this.onFinalAnswer); - this.listenersAttached = false; - } - private ensurePartial(stepIndex: number): Partial { let p = this.partialSteps.get(stepIndex); if (!p) { diff --git a/packages/evals/tests/framework/trajectoryRecorder.test.ts b/packages/evals/tests/framework/trajectoryRecorder.test.ts index 5c5268e66a..81f9ef8b53 100644 --- a/packages/evals/tests/framework/trajectoryRecorder.test.ts +++ b/packages/evals/tests/framework/trajectoryRecorder.test.ts @@ -1,10 +1,9 @@ -import { EventEmitter } from "node:events"; import fs from "node:fs/promises"; import os from "node:os"; import path from "node:path"; import { afterEach, describe, expect, it } from "vitest"; -import type { TaskSpec, V3 } from "@browserbasehq/stagehand"; +import type { TaskSpec } from "@browserbasehq/stagehand"; import { TrajectoryRecorder } from "../../framework/trajectoryRecorder.js"; @@ -26,10 +25,6 @@ function makeTempDir(): Promise { }); } -function makeV3(bus = new EventEmitter()): V3 { - return { bus } as unknown as V3; -} - function makeTaskSpec(): TaskSpec { return { id: "recorder-task", @@ -48,23 +43,23 @@ function makeTaskSpec(): TaskSpec { } describe("TrajectoryRecorder", 
() => { - it("assembles trajectory evidence from bus events", async () => { - const bus = new EventEmitter(); + it("assembles trajectory evidence from callback events", async () => { const recorder = new TrajectoryRecorder({ - v3: makeV3(bus), taskSpec: makeTaskSpec(), persist: false, }); const screenshot = Buffer.from("screen-1"); recorder.start(); - bus.emit("agent_screenshot_taken_event", { + recorder.record({ + type: "screenshot", stepIndex: 0, screenshot, url: "https://example.com/search", evidenceRole: "agent_and_probe", }); - bus.emit("agent_step_finished_event", { + recorder.record({ + type: "step_finished", stepIndex: 0, actionName: "extract", actionArgs: { instruction: "Read fares" }, @@ -75,12 +70,14 @@ describe("TrajectoryRecorder", () => { }, finishedAt: new Date(0).toISOString(), }); - bus.emit("agent_step_observed_event", { + recorder.record({ + type: "step_observed", stepIndex: 0, url: "https://example.com/search", ariaTree: "RootWebArea\nStaticText: Economy $100", }); - bus.emit("agent_final_answer_event", { + recorder.record({ + type: "final_answer", message: "Business is $150 more than economy.", }); @@ -117,9 +114,7 @@ describe("TrajectoryRecorder", () => { it("persists trajectory files and evaluator results", async () => { const outputRoot = await makeTempDir(); - const bus = new EventEmitter(); const recorder = new TrajectoryRecorder({ - v3: makeV3(bus), taskSpec: makeTaskSpec(), outputRoot, runId: "run-1", @@ -128,13 +123,15 @@ describe("TrajectoryRecorder", () => { const screenshot = Buffer.from("screen-1"); recorder.start(); - bus.emit("agent_screenshot_taken_event", { + recorder.record({ + type: "screenshot", stepIndex: 0, screenshot, url: "https://example.com/search", evidenceRole: "agent_and_probe", }); - bus.emit("agent_step_finished_event", { + recorder.record({ + type: "step_finished", stepIndex: 0, actionName: "act", actionArgs: { instruction: "Search fares" }, @@ -142,7 +139,8 @@ describe("TrajectoryRecorder", () => { toolOutput: { 
ok: true, result: "done" }, finishedAt: new Date(0).toISOString(), }); - bus.emit("agent_step_observed_event", { + recorder.record({ + type: "step_observed", stepIndex: 0, url: "https://example.com/search", }); From b493fa7fdfe232f6559e0f670c8993d6d4faf96a Mon Sep 17 00:00:00 2001 From: miguel Date: Thu, 21 May 2026 10:45:01 -0700 Subject: [PATCH 11/27] fix(cua): keep screenshot provider evidence non-fatal --- .../core/lib/v3/handlers/v3CuaAgentHandler.ts | 21 ++++++- .../tests/unit/agent-captcha-hooks.test.ts | 60 ++++++++++++++++++- 2 files changed, 78 insertions(+), 3 deletions(-) diff --git a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts index 31f0a649c8..8a611275f0 100644 --- a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts +++ b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts @@ -90,7 +90,7 @@ export class V3CuaAgentHandler { const page = await this.v3.context.awaitActivePage(); const screenshotBuffer = await page.screenshot({ fullPage: false }); - await this.emitCuaScreenshot(screenshotBuffer, page.url()); + await this.emitCuaScreenshotNonFatal(screenshotBuffer, page.url()); return screenshotBuffer.toString("base64"); // base64 png }); @@ -690,7 +690,7 @@ export class V3CuaAgentHandler { const currentUrl = page.url(); // Mirror the same buffer the CUA client receives as agent evidence. - await this.emitCuaScreenshot(screenshotBuffer, currentUrl); + await this.emitCuaScreenshotNonFatal(screenshotBuffer, currentUrl); return await this.agentClient.captureScreenshot({ base64Image: screenshotBuffer.toString("base64"), @@ -825,6 +825,23 @@ export class V3CuaAgentHandler { return event; } + private async emitCuaScreenshotNonFatal( + screenshot: Buffer, + url: string, + ): Promise<void> { + try { + await this.emitCuaScreenshot(screenshot, url); + } catch (e) { + this.logger({ + category: "agent", + message: `Warning: CUA screenshot evidence callback failed: ${ + e instanceof Error ?
e.message : String(e) + }`, + level: 1, + }); + } + } + private async emitCuaActionStep( action: AgentAction, result: ActionExecutionResult | undefined, diff --git a/packages/core/tests/unit/agent-captcha-hooks.test.ts b/packages/core/tests/unit/agent-captcha-hooks.test.ts index b3d584c258..e2524da417 100644 --- a/packages/core/tests/unit/agent-captcha-hooks.test.ts +++ b/packages/core/tests/unit/agent-captcha-hooks.test.ts @@ -60,6 +60,7 @@ class FakeCuaClient { public contextNotes: string[] = []; public preStepHook?: () => Promise; public actionHandler?: (action: Record) => Promise; + public screenshotProvider?: () => Promise; public executeImpl = vi.fn(async (options: unknown) => { void options; return { @@ -72,7 +73,9 @@ class FakeCuaClient { public captureScreenshot = vi.fn(async () => null); public setViewport = vi.fn(); public setCurrentUrl = vi.fn(); - public setScreenshotProvider = vi.fn(); + public setScreenshotProvider = vi.fn((provider: () => Promise) => { + this.screenshotProvider = provider; + }); public setSafetyConfirmationHandler = vi.fn(); setActionHandler( @@ -504,4 +507,59 @@ describe("v3 cua handler screenshot behavior", () => { // the CUA client takes a single screenshot after all actions itself. 
expect(screenshotSpy).not.toHaveBeenCalled(); }); + + it("still returns provider screenshots when screenshot evidence callbacks fail", async () => { + const screenshotBase64 = Buffer.from("fake-image").toString("base64"); + const onEvidence = vi.fn(async (event: { type: string }) => { + if (event.type === "screenshot") { + throw new Error("recorder failed"); + } + }); + + fakeCuaClient.executeImpl = vi.fn(async () => { + await expect(fakeCuaClient.screenshotProvider?.()).resolves.toBe( + screenshotBase64, + ); + return { + success: true, + message: "ok", + actions: [], + completed: true, + }; + }); + + const handler = new V3CuaAgentHandler( + { + context: { + awaitActivePage: async () => page, + }, + bus: { emit: vi.fn() }, + isCaptchaAutoSolveEnabled: false, + isAdvancedStealth: false, + configuredViewport: { width: 1288, height: 711 }, + isAgentReplayActive: () => false, + updateMetrics: vi.fn(), + } as never, + logger, + { + modelName: "openai/gpt-5.4", + clientOptions: { waitBetweenActions: 1 }, + } as never, + ); + + await handler.execute({ + instruction: "describe the page", + highlightCursor: false, + callbacks: { onEvidence }, + }); + + expect(onEvidence).toHaveBeenCalledWith( + expect.objectContaining({ type: "screenshot" }), + ); + expect( + logs.some((line) => + line.message.includes("CUA screenshot evidence callback failed"), + ), + ).toBe(true); + }); }); From 043b3e1b661b13f369b150dc5f9f4b6e5bcb5326 Mon Sep 17 00:00:00 2001 From: miguel Date: Thu, 21 May 2026 10:52:54 -0700 Subject: [PATCH 12/27] fix(verifier): hydrate persisted agent image paths --- packages/core/lib/v3/verifier/trajectory.ts | 58 ++++++++++++++----- .../tests/unit/verifier-trajectory.test.ts | 55 +++++++++++++++++- 2 files changed, 96 insertions(+), 17 deletions(-) diff --git a/packages/core/lib/v3/verifier/trajectory.ts b/packages/core/lib/v3/verifier/trajectory.ts index ec602d04d0..ae72dfac95 100644 --- a/packages/core/lib/v3/verifier/trajectory.ts +++ 
b/packages/core/lib/v3/verifier/trajectory.ts @@ -94,8 +94,9 @@ function normalizeResultLabel(label?: string): string { * * Reverses the recorder's serialization tweaks: * - `probeEvidence.screenshotPath` → read file into `probeEvidence.screenshot`. - * - Image modalities in `agentEvidence.modalities` carry `bytesBase64` on - * disk (human-readable JSON) instead of raw Buffer; we decode back. + * - Image modalities in `agentEvidence.modalities` carry `imagePath` on + * disk instead of raw Buffer; legacy `bytesBase64` fixtures are also + * accepted. * * @param dir absolute or cwd-relative path to a `//` directory. */ @@ -115,10 +116,11 @@ export async function loadTrajectoryFromDisk(dir: string): Promise { | { type: "image"; mediaType: string; - // On-disk form (recorder writes base64); accept either to - // tolerate hand-edited fixtures. + // On-disk forms. Current writer externalizes bytes to + // imagePath; bytesBase64 is accepted for older fixtures. bytes?: unknown; bytesBase64?: string; + imagePath?: string; } | { type: "json"; content: unknown } >; @@ -128,7 +130,10 @@ export async function loadTrajectoryFromDisk(dir: string): Promise { >; }; - const resolveWithinTrajectoryDir = (candidate: string): string => { + const resolveWithinTrajectoryDir = ( + candidate: string, + fieldName = "screenshotPath", + ): string => { const resolved = path.resolve(trajectoryDir, candidate); const relative = path.relative(trajectoryDir, resolved); const outside = @@ -138,7 +143,7 @@ export async function loadTrajectoryFromDisk(dir: string): Promise { if (outside) { throw new Error( - `Trajectory screenshotPath escapes trajectory directory: ${candidate}`, + `Trajectory ${fieldName} escapes trajectory directory: ${candidate}`, ); } @@ -158,21 +163,44 @@ export async function loadTrajectoryFromDisk(dir: string): Promise { } } - // Decode image modalities from base64 back to Buffer. + // Decode image modalities from disk references back to Buffer. 
if (step.agentEvidence?.modalities) { - step.agentEvidence.modalities = step.agentEvidence.modalities.map((m) => { - // The on-disk shape carries bytesBase64 instead of bytes, so we look - // through `unknown` here rather than rely on the typed union. - const raw = m as unknown as { bytesBase64?: string }; + const modalities: AgentEvidenceModality[] = []; + for (const m of step.agentEvidence.modalities) { + // The on-disk shape carries imagePath/bytesBase64 instead of bytes, + // so we look through `unknown` rather than rely on the typed union. + const raw = m as unknown as { + bytesBase64?: string; + imagePath?: string; + }; if (m.type === "image" && typeof raw.bytesBase64 === "string") { - return { + modalities.push({ type: "image" as const, bytes: Buffer.from(raw.bytesBase64, "base64"), mediaType: m.mediaType, - }; + }); + continue; } - return m as AgentEvidenceModality; - }); + if (m.type === "image" && typeof raw.imagePath === "string") { + const resolved = resolveWithinTrajectoryDir( + raw.imagePath, + "imagePath", + ); + try { + modalities.push({ + type: "image" as const, + bytes: await fs.readFile(resolved), + mediaType: m.mediaType, + }); + } catch { + // Missing agent image file: omit that image modality. The + // verifier's evidence_insufficient path will handle missing bytes. 
+ } + continue; + } + modalities.push(m as AgentEvidenceModality); + } + step.agentEvidence.modalities = modalities; } } diff --git a/packages/core/tests/unit/verifier-trajectory.test.ts b/packages/core/tests/unit/verifier-trajectory.test.ts index 4b09e53a12..cc6e674a1a 100644 --- a/packages/core/tests/unit/verifier-trajectory.test.ts +++ b/packages/core/tests/unit/verifier-trajectory.test.ts @@ -1,4 +1,4 @@ -import { mkdtemp, writeFile } from "node:fs/promises"; +import { mkdir, mkdtemp, writeFile } from "node:fs/promises"; import { tmpdir } from "node:os"; import path from "node:path"; @@ -65,6 +65,11 @@ describe("verifier trajectory utilities", () => { const screenshot = Buffer.from("probe screenshot"); const agentImage = Buffer.from("agent image"); await writeFile(path.join(dir, "screenshot_1.png"), screenshot); + await mkdir(path.join(dir, "screenshots", "agent"), { recursive: true }); + await writeFile( + path.join(dir, "screenshots", "agent", "1.png"), + agentImage, + ); await writeFile( path.join(dir, "trajectory.json"), JSON.stringify({ @@ -86,7 +91,7 @@ describe("verifier trajectory utilities", () => { { type: "image", mediaType: "image/png", - bytesBase64: agentImage.toString("base64"), + imagePath: "screenshots/agent/1.png", }, ], }, @@ -109,6 +114,52 @@ describe("verifier trajectory utilities", () => { } }); + it("loads legacy base64 image modalities from disk", async () => { + const dir = await mkdtemp(path.join(tmpdir(), "stagehand-verifier-")); + const agentImage = Buffer.from("legacy agent image"); + await writeFile( + path.join(dir, "trajectory.json"), + JSON.stringify({ + task: { id: "task", instruction: "Do the task" }, + status: "complete", + usage: { input_tokens: 0, output_tokens: 0 }, + timing: { + startedAt: new Date(0).toISOString(), + endedAt: new Date(0).toISOString(), + }, + steps: [ + { + index: 0, + actionName: "act", + actionArgs: {}, + reasoning: "", + agentEvidence: { + modalities: [ + { + type: "image", + mediaType: "image/png", + 
bytesBase64: agentImage.toString("base64"), + }, + ], + }, + probeEvidence: {}, + toolOutput: { ok: true, result: null }, + startedAt: new Date(0).toISOString(), + finishedAt: new Date(0).toISOString(), + }, + ], + }), + ); + + const trajectory = await loadTrajectoryFromDisk(dir); + const modality = trajectory.steps[0].agentEvidence.modalities[0]; + + expect(modality.type).toBe("image"); + if (modality.type === "image") { + expect(modality.bytes).toEqual(agentImage); + } + }); + it("rejects screenshot paths outside the trajectory directory", async () => { const dir = await mkdtemp(path.join(tmpdir(), "stagehand-verifier-")); await writeFile( From 8596d2aedd86e95fd5a6748c75bc61e1bd7d2ecb Mon Sep 17 00:00:00 2001 From: miguel Date: Thu, 21 May 2026 10:52:54 -0700 Subject: [PATCH 13/27] fix(evals): avoid useless task data assignment --- packages/evals/framework/trajectoryRecorder.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/evals/framework/trajectoryRecorder.ts b/packages/evals/framework/trajectoryRecorder.ts index 2b136fe0b5..af52fa67a8 100644 --- a/packages/evals/framework/trajectoryRecorder.ts +++ b/packages/evals/framework/trajectoryRecorder.ts @@ -233,7 +233,7 @@ export class TrajectoryRecorder { ); const taskDataPath = path.join(this.outputDir, "task_data.json"); - let taskData: Record = {}; + let taskData: Record; try { taskData = JSON.parse(await fs.readFile(taskDataPath, "utf8")) as Record< string, From 2ba6c1f2bbe8f1e93d0d6b0b6c8cb1435719418b Mon Sep 17 00:00:00 2001 From: miguel Date: Thu, 21 May 2026 11:04:06 -0700 Subject: [PATCH 14/27] test(agent): drop stale bus mocks --- packages/core/tests/unit/agent-captcha-hooks.test.ts | 5 ----- packages/core/tests/unit/agent-temperature.test.ts | 6 ------ 2 files changed, 11 deletions(-) diff --git a/packages/core/tests/unit/agent-captcha-hooks.test.ts b/packages/core/tests/unit/agent-captcha-hooks.test.ts index e2524da417..9cb626cf39 100644 --- 
a/packages/core/tests/unit/agent-captcha-hooks.test.ts +++ b/packages/core/tests/unit/agent-captcha-hooks.test.ts @@ -250,7 +250,6 @@ describe("agent captcha hooks", () => { context: { awaitActivePage: async () => page, }, - bus: { emit: vi.fn() }, isCaptchaAutoSolveEnabled: true, isAdvancedStealth: false, configuredViewport: { width: 1288, height: 711 }, @@ -319,7 +318,6 @@ describe("agent captcha hooks", () => { context: { awaitActivePage: async () => page, }, - bus: { emit: vi.fn() }, isCaptchaAutoSolveEnabled: true, isAdvancedStealth: false, configuredViewport: { width: 1288, height: 711 }, @@ -395,7 +393,6 @@ describe("agent captcha hooks", () => { context: { awaitActivePage: async () => page, }, - bus: { emit: vi.fn() }, isCaptchaAutoSolveEnabled: true, isAdvancedStealth: false, configuredViewport: { width: 1288, height: 711 }, @@ -477,7 +474,6 @@ describe("v3 cua handler screenshot behavior", () => { context: { awaitActivePage: async () => page, }, - bus: { emit: vi.fn() }, isCaptchaAutoSolveEnabled: false, isAdvancedStealth: false, configuredViewport: { width: 1288, height: 711 }, @@ -533,7 +529,6 @@ describe("v3 cua handler screenshot behavior", () => { context: { awaitActivePage: async () => page, }, - bus: { emit: vi.fn() }, isCaptchaAutoSolveEnabled: false, isAdvancedStealth: false, configuredViewport: { width: 1288, height: 711 }, diff --git a/packages/core/tests/unit/agent-temperature.test.ts b/packages/core/tests/unit/agent-temperature.test.ts index 8f12b4a7e6..45184a9888 100644 --- a/packages/core/tests/unit/agent-temperature.test.ts +++ b/packages/core/tests/unit/agent-temperature.test.ts @@ -125,12 +125,6 @@ function createV3() { context: { awaitActivePage: vi.fn(async () => page), }, - bus: { - emit: vi.fn(), - on: vi.fn(), - off: vi.fn(), - listenerCount: vi.fn(() => 0), - }, isCaptchaAutoSolveEnabled: false, browserbaseApiKey: undefined, logger: vi.fn(), From 2780db25f2bb8d1b177c2b129a7b4fca02a075c5 Mon Sep 17 00:00:00 2001 From: miguel Date: 
Fri, 22 May 2026 13:37:55 -0700 Subject: [PATCH 15/27] fix(verifier): redact inline screenshot payloads --- packages/core/lib/v3/verifier/trajectory.ts | 47 ++++++++++- .../tests/unit/verifier-trajectory.test.ts | 64 ++++++++++++++- .../evals/framework/trajectoryRecorder.ts | 78 +++++++++++++++++-- .../framework/trajectoryRecorder.test.ts | 65 ++++++++++++++++ 4 files changed, 245 insertions(+), 9 deletions(-) diff --git a/packages/core/lib/v3/verifier/trajectory.ts b/packages/core/lib/v3/verifier/trajectory.ts index ae72dfac95..3bb623f16d 100644 --- a/packages/core/lib/v3/verifier/trajectory.ts +++ b/packages/core/lib/v3/verifier/trajectory.ts @@ -231,6 +231,37 @@ export function shouldPersistTrajectory( return !process.env.CI; } +const REDACTED_INLINE_IMAGE = "[redacted inline image payload]"; +const INLINE_IMAGE_KEYS = new Set(["screenshotBase64"]); + +function shouldRedactBase64Key(key: string, actionName?: string): boolean { + return ( + INLINE_IMAGE_KEYS.has(key) || + (actionName === "screenshot" && key === "base64") + ); +} + +function redactInlineImagePayloads( + value: unknown, + actionName?: string, +): unknown { + if (!value || typeof value !== "object") return value; + if (Buffer.isBuffer(value)) return value; + + if (Array.isArray(value)) { + return value.map((item) => redactInlineImagePayloads(item, actionName)); + } + + const out: Record = {}; + for (const [key, nested] of Object.entries(value)) { + out[key] = + shouldRedactBase64Key(key, actionName) && typeof nested === "string" + ? REDACTED_INLINE_IMAGE + : redactInlineImagePayloads(nested, actionName); + } + return out; +} + /** * Write the on-disk trajectory layout under `dir`: * @@ -273,7 +304,14 @@ export async function writeTrajectoryDir( const modalities: unknown[] = []; for (const m of step.agentEvidence.modalities) { if (m.type !== "image") { - modalities.push(m); + modalities.push( + m.type === "json" + ? 
{ + ...m, + content: redactInlineImagePayloads(m.content, step.actionName), + } + : m, + ); continue; } const suffix = multipleImages ? `_${imageSeq}` : ""; @@ -290,6 +328,13 @@ export async function writeTrajectoryDir( ...step, probeEvidence: probe, agentEvidence: { modalities }, + toolOutput: { + ...step.toolOutput, + result: redactInlineImagePayloads( + step.toolOutput.result, + step.actionName, + ), + }, }); } diff --git a/packages/core/tests/unit/verifier-trajectory.test.ts b/packages/core/tests/unit/verifier-trajectory.test.ts index cc6e674a1a..7c9351d135 100644 --- a/packages/core/tests/unit/verifier-trajectory.test.ts +++ b/packages/core/tests/unit/verifier-trajectory.test.ts @@ -1,4 +1,4 @@ -import { mkdir, mkdtemp, writeFile } from "node:fs/promises"; +import { mkdir, mkdtemp, readFile, writeFile } from "node:fs/promises"; import { tmpdir } from "node:os"; import path from "node:path"; @@ -8,6 +8,7 @@ import { loadTrajectoryFromDisk, nextResultFilename, normalizeRubric, + writeTrajectoryDir, } from "../../lib/v3/verifier/trajectory.js"; describe("verifier trajectory utilities", () => { @@ -160,6 +161,67 @@ describe("verifier trajectory utilities", () => { } }); + it("redacts inline screenshot payloads when writing trajectories", async () => { + const dir = await mkdtemp(path.join(tmpdir(), "stagehand-verifier-")); + const inlineScreenshot = + Buffer.from("inline screenshot").toString("base64"); + + await writeTrajectoryDir(dir, { + task: { id: "task", instruction: "Do the task" }, + status: "complete", + usage: { input_tokens: 0, output_tokens: 0 }, + timing: { + startedAt: new Date(0).toISOString(), + endedAt: new Date(0).toISOString(), + }, + steps: [ + { + index: 0, + actionName: "click", + actionArgs: {}, + reasoning: "", + agentEvidence: { + modalities: [ + { + type: "json", + content: { + output: { + success: true, + screenshotBase64: inlineScreenshot, + }, + }, + }, + ], + }, + probeEvidence: {}, + toolOutput: { + ok: true, + result: { + output: { 
+ success: true, + screenshotBase64: inlineScreenshot, + }, + }, + }, + startedAt: new Date(0).toISOString(), + finishedAt: new Date(0).toISOString(), + }, + ], + }); + + const raw = await readFile(path.join(dir, "trajectory.json"), "utf8"); + const trajectory = JSON.parse(raw); + + expect(raw).not.toContain(inlineScreenshot); + expect( + trajectory.steps[0].agentEvidence.modalities[0].content.output + .screenshotBase64, + ).toBe("[redacted inline image payload]"); + expect(trajectory.steps[0].toolOutput.result.output.screenshotBase64).toBe( + "[redacted inline image payload]", + ); + }); + it("rejects screenshot paths outside the trajectory directory", async () => { const dir = await mkdtemp(path.join(tmpdir(), "stagehand-verifier-")); await writeFile( diff --git a/packages/evals/framework/trajectoryRecorder.ts b/packages/evals/framework/trajectoryRecorder.ts index af52fa67a8..e28626860f 100644 --- a/packages/evals/framework/trajectoryRecorder.ts +++ b/packages/evals/framework/trajectoryRecorder.ts @@ -117,7 +117,10 @@ export class TrajectoryRecorder { partial.actionName = e.actionName; partial.actionArgs = e.actionArgs; partial.reasoning = e.reasoning; - partial.toolOutput = e.toolOutput; + partial.toolOutput = { + ...e.toolOutput, + result: redactInlineImagePayloads(e.toolOutput.result, e.actionName), + }; partial.finishedAt = e.finishedAt; partial.agentEvidence = mergeAgentEvidence( partial.agentEvidence, @@ -287,6 +290,62 @@ export class TrajectoryRecorder { } } +const REDACTED_INLINE_IMAGE = "[redacted inline image payload]"; +const INLINE_IMAGE_KEYS = new Set(["screenshotBase64"]); + +function shouldRedactBase64Key(key: string, actionName?: string): boolean { + return ( + INLINE_IMAGE_KEYS.has(key) || + (actionName === "screenshot" && key === "base64") + ); +} + +function collectInlineImagePayloads( + value: unknown, + actionName?: string, + out: string[] = [], +): string[] { + if (!value || typeof value !== "object") return out; + if 
(Buffer.isBuffer(value)) return out; + + if (Array.isArray(value)) { + for (const item of value) { + collectInlineImagePayloads(item, actionName, out); + } + return out; + } + + for (const [key, nested] of Object.entries(value)) { + if (shouldRedactBase64Key(key, actionName) && typeof nested === "string") { + out.push(nested); + continue; + } + collectInlineImagePayloads(nested, actionName, out); + } + return out; +} + +function redactInlineImagePayloads( + value: unknown, + actionName?: string, +): unknown { + if (!value || typeof value !== "object") return value; + if (Buffer.isBuffer(value)) return value; + + if (Array.isArray(value)) { + return value.map((item) => redactInlineImagePayloads(item, actionName)); + } + + const out: Record = {}; + for (const [key, nested] of Object.entries(value)) { + out[key] = + shouldRedactBase64Key(key, actionName) && typeof nested === "string" + ? REDACTED_INLINE_IMAGE + : redactInlineImagePayloads(nested, actionName); + } + return out; +} + function mergeAgentEvidence( ...parts: Array ): AgentEvidence { @@ -313,21 +372,26 @@ function buildAgentEvidence(e: AgentStepFinishedEvent): AgentEvidence { mediaType: "image/png", }); } else if (typeof result === "object") { - // Vision tools embed a screenshotBase64 alongside the JSON result; lift - // it to its own image modality so the verifier sees both. - const r = result as { screenshotBase64?: string } & Record; - if (typeof r.screenshotBase64 === "string") { + // Vision tools embed screenshot bytes alongside JSON; lift those bytes to + // image modalities and redact the inline payloads from persisted text/json. + for (const imageBase64 of collectInlineImagePayloads( + result, + e.actionName, + )) { try { modalities.push({ type: "image", - bytes: Buffer.from(r.screenshotBase64, "base64"), + bytes: Buffer.from(imageBase64, "base64"), mediaType: "image/png", }); } catch { // Malformed base64; skip the image and keep the JSON modality. 
} } - modalities.push({ type: "json", content: result }); + modalities.push({ + type: "json", + content: redactInlineImagePayloads(result, e.actionName), + }); } return { modalities }; } diff --git a/packages/evals/tests/framework/trajectoryRecorder.test.ts b/packages/evals/tests/framework/trajectoryRecorder.test.ts index 81f9ef8b53..5f72dadab4 100644 --- a/packages/evals/tests/framework/trajectoryRecorder.test.ts +++ b/packages/evals/tests/framework/trajectoryRecorder.test.ts @@ -192,4 +192,69 @@ describe("TrajectoryRecorder", () => { explanation: "The task was completed.", }); }); + + it("lifts inline screenshot payloads into image evidence and redacts JSON", async () => { + const inlineScreenshot = + Buffer.from("inline screenshot").toString("base64"); + const recorder = new TrajectoryRecorder({ + taskSpec: makeTaskSpec(), + persist: false, + }); + + recorder.record({ + type: "step_finished", + stepIndex: 0, + actionName: "click", + actionArgs: { describe: "Open fare details" }, + reasoning: "Click the fare details button.", + toolOutput: { + ok: true, + result: { + output: { + success: true, + describe: "Open fare details", + screenshotBase64: inlineScreenshot, + }, + }, + }, + finishedAt: new Date(0).toISOString(), + }); + + const trajectory = await recorder.finish({ status: "complete" }); + const step = trajectory.steps[0]; + const rawTrajectory = JSON.stringify(trajectory); + const imageModalities = step.agentEvidence.modalities.filter( + (m) => m.type === "image", + ); + const jsonModality = step.agentEvidence.modalities.find( + (m) => m.type === "json", + ); + + expect(rawTrajectory).not.toContain(inlineScreenshot); + expect(step.toolOutput.result).toMatchObject({ + output: { + success: true, + describe: "Open fare details", + screenshotBase64: "[redacted inline image payload]", + }, + }); + expect(jsonModality).toMatchObject({ + type: "json", + content: { + output: { + screenshotBase64: "[redacted inline image payload]", + }, + }, + }); + 
expect(imageModalities).toHaveLength(1); + expect(imageModalities[0]).toMatchObject({ + type: "image", + mediaType: "image/png", + }); + if (imageModalities[0].type === "image") { + expect(imageModalities[0].bytes).toEqual( + Buffer.from(inlineScreenshot, "base64"), + ); + } + }); }); From 25fadb142613163835fb6d88cf35f922eb6b239d Mon Sep 17 00:00:00 2001 From: miguel Date: Fri, 22 May 2026 13:54:24 -0700 Subject: [PATCH 16/27] refactor(verifier): centralize trajectory evidence handling --- .../v3/agent/utils/captureAriaTreeProbe.ts | 4 +- .../v3/agent/utils/cuaEvidenceStepTracker.ts | 55 ++++++++ .../v3/agent/utils/postStepProbeEvidence.ts | 64 +++++++++ .../lib/v3/agent/utils/toolOutputEvidence.ts | 25 ++++ .../core/lib/v3/handlers/v3AgentHandler.ts | 67 ++------- .../core/lib/v3/handlers/v3CuaAgentHandler.ts | 105 +++----------- packages/core/lib/v3/index.ts | 9 ++ .../v3/types/public/agentEvidenceEvents.ts | 2 +- .../lib/v3/verifier/evidenceNormalization.ts | 112 +++++++++++++++ packages/core/lib/v3/verifier/index.ts | 7 + packages/core/lib/v3/verifier/trajectory.ts | 32 +---- .../unit/cua-evidence-step-tracker.test.ts | 55 ++++++++ .../unit/public-api/export-surface.test.ts | 4 + .../evals/framework/trajectoryRecorder.ts | 132 ++---------------- .../framework/trajectoryRecorder.test.ts | 18 ++- 15 files changed, 391 insertions(+), 300 deletions(-) create mode 100644 packages/core/lib/v3/agent/utils/cuaEvidenceStepTracker.ts create mode 100644 packages/core/lib/v3/agent/utils/postStepProbeEvidence.ts create mode 100644 packages/core/lib/v3/agent/utils/toolOutputEvidence.ts create mode 100644 packages/core/lib/v3/verifier/evidenceNormalization.ts create mode 100644 packages/core/tests/unit/cua-evidence-step-tracker.test.ts diff --git a/packages/core/lib/v3/agent/utils/captureAriaTreeProbe.ts b/packages/core/lib/v3/agent/utils/captureAriaTreeProbe.ts index 8e3fcc050b..b68663eb04 100644 --- a/packages/core/lib/v3/agent/utils/captureAriaTreeProbe.ts +++ 
b/packages/core/lib/v3/agent/utils/captureAriaTreeProbe.ts @@ -56,9 +56,7 @@ export async function captureAriaTreeProbe( try { // v3.extract() without a schema returns { pageText } where pageText is the // rendered accessibility tree — same path the agent's ariaTree tool uses. - const result = (await v3.extract({ timeout: timeoutMs })) as { - pageText?: string; - }; + const result = await v3.extract({ timeout: timeoutMs }); const pageText = result?.pageText; if (typeof pageText !== "string" || pageText.length === 0) return undefined; diff --git a/packages/core/lib/v3/agent/utils/cuaEvidenceStepTracker.ts b/packages/core/lib/v3/agent/utils/cuaEvidenceStepTracker.ts new file mode 100644 index 0000000000..356cc6a98c --- /dev/null +++ b/packages/core/lib/v3/agent/utils/cuaEvidenceStepTracker.ts @@ -0,0 +1,55 @@ +import type { AgentScreenshotEvidenceEvent } from "../../types/public/agentEvidenceEvents.js"; + +export interface PairedCuaActionStep { + stepIndex: number; + replayScreenshot?: AgentScreenshotEvidenceEvent; +} + +export class CuaEvidenceStepTracker { + private nextStepIndex = 0; + private latestScreenshot?: AgentScreenshotEvidenceEvent; + private latestScreenshotConsumed = true; + + reset(): void { + this.nextStepIndex = 0; + this.latestScreenshot = undefined; + this.latestScreenshotConsumed = true; + } + + recordScreenshot( + screenshot: Buffer, + url: string, + ): AgentScreenshotEvidenceEvent { + const event: AgentScreenshotEvidenceEvent = { + type: "screenshot", + stepIndex: this.nextStepIndex++, + screenshot, + url, + evidenceRole: "agent", + }; + this.latestScreenshot = event; + this.latestScreenshotConsumed = false; + return event; + } + + pairAction(): PairedCuaActionStep { + if (this.latestScreenshot && !this.latestScreenshotConsumed) { + this.latestScreenshotConsumed = true; + return { stepIndex: this.latestScreenshot.stepIndex }; + } + + const stepIndex = this.nextStepIndex++; + if (this.latestScreenshot) { + return { + stepIndex, + 
replayScreenshot: { ...this.latestScreenshot, stepIndex }, + }; + } + + return { stepIndex }; + } + + get latestScreenshotUrl(): string | undefined { + return this.latestScreenshot?.url; + } +} diff --git a/packages/core/lib/v3/agent/utils/postStepProbeEvidence.ts b/packages/core/lib/v3/agent/utils/postStepProbeEvidence.ts new file mode 100644 index 0000000000..de9cd9d044 --- /dev/null +++ b/packages/core/lib/v3/agent/utils/postStepProbeEvidence.ts @@ -0,0 +1,64 @@ +import type { AgentEvidenceCallback } from "../../types/public/agentEvidenceEvents.js"; +import type { LogLine } from "../../types/public/logs.js"; +import type { V3 } from "../../v3.js"; +import { captureAriaTreeProbe } from "./captureAriaTreeProbe.js"; + +interface EmitPostStepProbeEvidenceOptions { + v3: V3; + stepIndices: number | number[]; + url: string; + evidenceCallback?: AgentEvidenceCallback; + logger: (message: LogLine) => void; + warningMessage: string; +} + +function errorMessage(error: unknown): string { + return error instanceof Error ? error.message : String(error); +} + +export async function emitPostStepProbeEvidence({ + v3, + stepIndices, + url, + evidenceCallback, + logger, + warningMessage, +}: EmitPostStepProbeEvidenceOptions): Promise { + if (!evidenceCallback) return; + + const indices = Array.isArray(stepIndices) ? 
stepIndices : [stepIndices]; + if (indices.length === 0) return; + + let probeUrl = url; + let screenshot: Buffer | undefined; + try { + const page = await v3.context.awaitActivePage(); + probeUrl = page.url(); + screenshot = await page.screenshot({ fullPage: false }); + } catch (e) { + logger({ + category: "agent", + message: `${warningMessage}: ${errorMessage(e)}`, + level: 1, + }); + } + + const ariaTree = await captureAriaTreeProbe(v3); + for (const stepIndex of indices) { + if (screenshot) { + await evidenceCallback({ + type: "screenshot", + stepIndex, + screenshot, + url: probeUrl, + evidenceRole: "probe", + }); + } + await evidenceCallback({ + type: "step_observed", + stepIndex, + url: probeUrl, + ariaTree, + }); + } +} diff --git a/packages/core/lib/v3/agent/utils/toolOutputEvidence.ts b/packages/core/lib/v3/agent/utils/toolOutputEvidence.ts new file mode 100644 index 0000000000..c8806334c4 --- /dev/null +++ b/packages/core/lib/v3/agent/utils/toolOutputEvidence.ts @@ -0,0 +1,25 @@ +import type { AgentStepFinishedEvent } from "../../types/public/agentEvidenceEvents.js"; + +export function inferToolOutput( + toolResult: unknown, +): AgentStepFinishedEvent["toolOutput"] { + const error = + toolResult && + typeof toolResult === "object" && + "error" in toolResult && + typeof (toolResult as { error?: unknown }).error === "string" + ? 
(toolResult as { error: string }).error + : undefined; + + const isError = + toolResult && + typeof toolResult === "object" && + "isError" in toolResult && + Boolean((toolResult as { isError?: unknown }).isError); + + return { + ok: error === undefined && !isError, + result: toolResult, + error, + }; +} diff --git a/packages/core/lib/v3/handlers/v3AgentHandler.ts b/packages/core/lib/v3/handlers/v3AgentHandler.ts index 965c30eded..20a9c16a74 100644 --- a/packages/core/lib/v3/handlers/v3AgentHandler.ts +++ b/packages/core/lib/v3/handlers/v3AgentHandler.ts @@ -42,7 +42,8 @@ import { AgentAbortError, } from "../types/public/sdkErrors.js"; import { handleDoneToolCall } from "../agent/utils/handleDoneToolCall.js"; -import { captureAriaTreeProbe } from "../agent/utils/captureAriaTreeProbe.js"; +import { emitPostStepProbeEvidence } from "../agent/utils/postStepProbeEvidence.js"; +import { inferToolOutput } from "../agent/utils/toolOutputEvidence.js"; import { CaptchaSolver, CAPTCHA_SOLVED_MSG, @@ -314,11 +315,6 @@ export class V3AgentHandler { const stepIndex = stepCounter++; stepIndicesInTurn.push(stepIndex); - const toolOk = - !toolResult || - (typeof toolResult === "object" && - !("error" in toolResult) && - !("isError" in toolResult && toolResult.isError)); await evidenceCallback?.({ type: "step_finished", stepIndex, @@ -328,17 +324,7 @@ export class V3AgentHandler { ? (args as Record) : {}, reasoning: event.text ?? "", - toolOutput: { - ok: toolOk, - result: toolResult, - error: - toolResult && - typeof toolResult === "object" && - "error" in toolResult && - typeof (toolResult as { error?: unknown }).error === "string" - ? 
(toolResult as { error: string }).error - : undefined, - }, + toolOutput: inferToolOutput(toolResult), finishedAt: new Date().toISOString(), }); } @@ -349,45 +335,14 @@ export class V3AgentHandler { // reflects the settled page state after the batch of tool calls; this // is more faithful than dropping probe evidence for all but the last // tool call, while still avoiding per-tool screenshot overhead. - const wantsEvidence = evidenceCallback !== undefined; - if (stepIndicesInTurn.length > 0 && wantsEvidence) { - let screenshot: Buffer | undefined; - let ariaTree: string | undefined; - try { - const page = await this.v3.context.awaitActivePage(); - screenshot = await page.screenshot({ fullPage: false }); - // Capture the a11y tree alongside the URL probe so the verifier - // can ground textual claims (prices, names, dates) without OCR. - // Best-effort: returns undefined on failure/timeout. - ariaTree = await captureAriaTreeProbe(this.v3); - } catch (e) { - this.logger({ - category: "agent", - message: `Warning: harness probe failed: ${getErrorMessage(e)}`, - level: 1, - }); - } - for (const stepIndex of stepIndicesInTurn) { - // DOM/hybrid: this post-step screenshot is a harness probe - // only. The agent's tier-1 evidence is the tool's return value - // captured separately in step_finished. 
- if (screenshot) { - await evidenceCallback?.({ - type: "screenshot", - stepIndex, - screenshot, - url: state.currentPageUrl, - evidenceRole: "probe", - }); - } - await evidenceCallback?.({ - type: "step_observed", - stepIndex, - url: state.currentPageUrl, - ariaTree, - }); - } - } + await emitPostStepProbeEvidence({ + v3: this.v3, + stepIndices: stepIndicesInTurn, + url: state.currentPageUrl, + evidenceCallback, + logger: this.logger, + warningMessage: "Warning: harness probe failed", + }); } if (lastFinalAnswer) { diff --git a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts index 8a611275f0..ac552b5eb1 100644 --- a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts +++ b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts @@ -7,7 +7,8 @@ import { GoogleCUAClient } from "../agent/GoogleCUAClient.js"; import { OpenAICUAClient } from "../agent/OpenAICUAClient.js"; import { mapKeyToPlaywright } from "../agent/utils/cuaKeyMapping.js"; import { ensureXPath } from "../agent/utils/xpath.js"; -import { captureAriaTreeProbe } from "../agent/utils/captureAriaTreeProbe.js"; +import { emitPostStepProbeEvidence } from "../agent/utils/postStepProbeEvidence.js"; +import { CuaEvidenceStepTracker } from "../agent/utils/cuaEvidenceStepTracker.js"; import { ActionExecutionResult, AgentAction, @@ -17,10 +18,7 @@ import { SafetyConfirmationHandler, } from "../types/public/agent.js"; import { LogLine } from "../types/public/logs.js"; -import type { - AgentEvidenceCallback, - AgentScreenshotEvidenceEvent, -} from "../types/public/agentEvidenceEvents.js"; +import type { AgentEvidenceCallback } from "../types/public/agentEvidenceEvents.js"; import { type Action, V3FunctionName } from "../types/public/methods.js"; import { FlowLogger } from "../flowlogger/FlowLogger.js"; import { toTitleCase } from "../../utils.js"; @@ -42,13 +40,7 @@ export class V3CuaAgentHandler { private captchaSolver: CaptchaSolver | null = null; private 
captchaClickGuardRemaining = 0; private currentInstruction = ""; - // Monotonic step counter used by evidence callbacks. The CUA loop is internal to - // the agent client, so unlike v3AgentHandler we don't have per-tool-call - // step events; instead we tag every screenshot emission with an - // incrementing index. - private cuaStepCounter = 0; - private latestCuaScreenshot?: AgentScreenshotEvidenceEvent; - private latestCuaScreenshotConsumed = true; + private readonly cuaEvidenceSteps = new CuaEvidenceStepTracker(); private evidenceCallback?: AgentEvidenceCallback; constructor( @@ -205,9 +197,7 @@ export class V3CuaAgentHandler { this.setSafetyConfirmationHandler(options.callbacks?.onSafetyConfirmation); this.evidenceCallback = options.callbacks?.onEvidence; - this.cuaStepCounter = 0; - this.latestCuaScreenshot = undefined; - this.latestCuaScreenshotConsumed = true; + this.cuaEvidenceSteps.reset(); this.highlightCursor = options.highlightCursor !== false; this.currentInstruction = options.instruction; @@ -811,18 +801,10 @@ export class V3CuaAgentHandler { private async emitCuaScreenshot( screenshot: Buffer, url: string, - ): Promise { - const event: AgentScreenshotEvidenceEvent = { - type: "screenshot", - stepIndex: this.cuaStepCounter++, - screenshot, - url, - evidenceRole: "agent", - }; - this.latestCuaScreenshot = event; - this.latestCuaScreenshotConsumed = false; - await this.evidenceCallback?.(event); - return event; + ): Promise { + await this.evidenceCallback?.( + this.cuaEvidenceSteps.recordScreenshot(screenshot, url), + ); } private async emitCuaScreenshotNonFatal( @@ -849,25 +831,15 @@ export class V3CuaAgentHandler { let pageUrl = typeof action.pageUrl === "string" ? action.pageUrl - : this.latestCuaScreenshot?.url; + : (this.cuaEvidenceSteps.latestScreenshotUrl ?? ""); try { pageUrl = (await this.v3.context.awaitActivePage()).url(); } catch { // Keep the best pre-action URL fallback. 
} - let stepIndex: number; - - if (this.latestCuaScreenshot && !this.latestCuaScreenshotConsumed) { - stepIndex = this.latestCuaScreenshot.stepIndex; - this.latestCuaScreenshotConsumed = true; - } else if (this.latestCuaScreenshot) { - stepIndex = this.cuaStepCounter++; - await this.evidenceCallback?.({ - ...this.latestCuaScreenshot, - stepIndex, - }); - } else { - stepIndex = this.cuaStepCounter++; + const { stepIndex, replayScreenshot } = this.cuaEvidenceSteps.pairAction(); + if (replayScreenshot) { + await this.evidenceCallback?.(replayScreenshot); } const actionArgs = Object.fromEntries( @@ -899,49 +871,14 @@ export class V3CuaAgentHandler { // page actually LOOKS LIKE after the action ran. Without this the // verifier has no visual evidence that keystrokes/clicks landed, and // has to trust the action history alone. - // - // Callback-gated to keep ordinary agent runs free of the extra - // screenshot cost — mirrors v3AgentHandler's post-step probe. - const wantsEvidence = this.evidenceCallback !== undefined; - let probeUrl = pageUrl; - let probeScreenshot: Buffer | undefined; - if (wantsEvidence) { - try { - const page = await this.v3.context.awaitActivePage(); - probeUrl = page.url(); - probeScreenshot = await page.screenshot({ fullPage: false }); - } catch (e) { - this.logger({ - category: "agent", - message: `Warning: CUA post-action probe failed: ${ - e instanceof Error ? e.message : String(e) - }`, - level: 1, - }); - } - } - - if (probeScreenshot) { - await this.evidenceCallback?.({ - type: "screenshot", - stepIndex, - screenshot: probeScreenshot, - url: probeUrl, - evidenceRole: "probe", - }); - } - - if (probeUrl && wantsEvidence) { - // Capture the a11y tree alongside the URL probe so the verifier can - // ground textual claims without OCR. Best-effort. 
- const ariaTree = await captureAriaTreeProbe(this.v3); - await this.evidenceCallback?.({ - type: "step_observed", - stepIndex, - url: probeUrl, - ariaTree, - }); - } + await emitPostStepProbeEvidence({ + v3: this.v3, + stepIndices: stepIndex, + url: pageUrl, + evidenceCallback: this.evidenceCallback, + logger: this.logger, + warningMessage: "Warning: CUA post-action probe failed", + }); } private async injectCursor(): Promise { diff --git a/packages/core/lib/v3/index.ts b/packages/core/lib/v3/index.ts index a5cbccf746..e2f403e9a4 100644 --- a/packages/core/lib/v3/index.ts +++ b/packages/core/lib/v3/index.ts @@ -25,9 +25,12 @@ import { getAISDKLanguageModel } from "./llm/LLMProvider.js"; import { __internalCreateInMemoryAgentCacheHandle } from "./cache/serverAgentCache.js"; import { maybeRunShutdownSupervisorFromArgv } from "./shutdown/supervisor.js"; import { + buildAgentEvidenceFromStepFinished, loadTrajectoryFromDisk, + mergeAgentEvidence, nextResultFilename, normalizeRubric, + redactInlineImagePayloads, shouldPersistTrajectory, writeTrajectoryDir, } from "./verifier/index.js"; @@ -92,9 +95,12 @@ export type { VerifierRawSteps, } from "./verifier/index.js"; export { + buildAgentEvidenceFromStepFinished, loadTrajectoryFromDisk, + mergeAgentEvidence, nextResultFilename, normalizeRubric, + redactInlineImagePayloads, shouldPersistTrajectory, writeTrajectoryDir, } from "./verifier/index.js"; @@ -148,9 +154,12 @@ const StagehandDefault = { toJsonSchema, connectToMCPServer, V3Evaluator, + buildAgentEvidenceFromStepFinished, loadTrajectoryFromDisk, + mergeAgentEvidence, nextResultFilename, normalizeRubric, + redactInlineImagePayloads, shouldPersistTrajectory, writeTrajectoryDir, tool, diff --git a/packages/core/lib/v3/types/public/agentEvidenceEvents.ts b/packages/core/lib/v3/types/public/agentEvidenceEvents.ts index b31f493145..cf8e560779 100644 --- a/packages/core/lib/v3/types/public/agentEvidenceEvents.ts +++ b/packages/core/lib/v3/types/public/agentEvidenceEvents.ts 
@@ -6,7 +6,7 @@ * storage and normalization live in the evals/verifier layers. */ -export type AgentEvidenceRole = "probe" | "agent" | "agent_and_probe"; +export type AgentEvidenceRole = "probe" | "agent"; export type AgentEvidenceEvent = | AgentScreenshotEvidenceEvent diff --git a/packages/core/lib/v3/verifier/evidenceNormalization.ts b/packages/core/lib/v3/verifier/evidenceNormalization.ts new file mode 100644 index 0000000000..0012e84d6e --- /dev/null +++ b/packages/core/lib/v3/verifier/evidenceNormalization.ts @@ -0,0 +1,112 @@ +import type { AgentStepFinishedEvent } from "../types/public/agentEvidenceEvents.js"; +import type { AgentEvidence } from "./types.js"; + +export const REDACTED_INLINE_IMAGE = "[redacted inline image payload]"; + +const INLINE_IMAGE_KEYS = new Set(["screenshotBase64"]); + +function shouldRedactBase64Key(key: string, actionName?: string): boolean { + return ( + INLINE_IMAGE_KEYS.has(key) || + (actionName === "screenshot" && key === "base64") + ); +} + +export function collectInlineImagePayloads( + value: unknown, + actionName?: string, + out: string[] = [], +): string[] { + if (!value || typeof value !== "object") return out; + if (Buffer.isBuffer(value)) return out; + + if (Array.isArray(value)) { + for (const item of value) { + collectInlineImagePayloads(item, actionName, out); + } + return out; + } + + for (const [key, nested] of Object.entries(value)) { + if (shouldRedactBase64Key(key, actionName) && typeof nested === "string") { + out.push(nested); + continue; + } + collectInlineImagePayloads(nested, actionName, out); + } + return out; +} + +export function redactInlineImagePayloads( + value: unknown, + actionName?: string, +): unknown { + if (!value || typeof value !== "object") return value; + if (Buffer.isBuffer(value)) return value; + + if (Array.isArray(value)) { + return value.map((item) => redactInlineImagePayloads(item, actionName)); + } + + const out: Record = {}; + for (const [key, nested] of Object.entries(value)) { + 
out[key] = + shouldRedactBase64Key(key, actionName) && typeof nested === "string" + ? REDACTED_INLINE_IMAGE + : redactInlineImagePayloads(nested, actionName); + } + return out; +} + +export function mergeAgentEvidence( + ...parts: Array +): AgentEvidence { + return { + modalities: parts.flatMap((p) => p?.modalities ?? []), + }; +} + +export function buildAgentEvidenceFromStepFinished( + event: AgentStepFinishedEvent, +): AgentEvidence { + const modalities: AgentEvidence["modalities"] = []; + if (event.reasoning) { + modalities.push({ type: "text", content: event.reasoning }); + } + + const result = event.toolOutput.result; + if (result === undefined || result === null) { + return { modalities }; + } + + if (typeof result === "string") { + modalities.push({ type: "text", content: result }); + } else if (Buffer.isBuffer(result)) { + modalities.push({ + type: "image", + bytes: result, + mediaType: "image/png", + }); + } else if (typeof result === "object") { + for (const imageBase64 of collectInlineImagePayloads( + result, + event.actionName, + )) { + try { + modalities.push({ + type: "image", + bytes: Buffer.from(imageBase64, "base64"), + mediaType: "image/png", + }); + } catch { + // Malformed base64; skip the image and keep the JSON modality. 
+ } + } + modalities.push({ + type: "json", + content: redactInlineImagePayloads(result, event.actionName), + }); + } + + return { modalities }; +} diff --git a/packages/core/lib/v3/verifier/index.ts b/packages/core/lib/v3/verifier/index.ts index 2b14cfb16a..f1638facc7 100644 --- a/packages/core/lib/v3/verifier/index.ts +++ b/packages/core/lib/v3/verifier/index.ts @@ -21,6 +21,13 @@ export type { VerifierFinding, VerifierRawSteps, } from "./types.js"; +export { + buildAgentEvidenceFromStepFinished, + collectInlineImagePayloads, + mergeAgentEvidence, + redactInlineImagePayloads, + REDACTED_INLINE_IMAGE, +} from "./evidenceNormalization.js"; export { loadTrajectoryFromDisk, nextResultFilename, diff --git a/packages/core/lib/v3/verifier/trajectory.ts b/packages/core/lib/v3/verifier/trajectory.ts index 3bb623f16d..75e1372bbe 100644 --- a/packages/core/lib/v3/verifier/trajectory.ts +++ b/packages/core/lib/v3/verifier/trajectory.ts @@ -7,6 +7,7 @@ import type { Trajectory, TrajectoryStep, } from "./types.js"; +import { redactInlineImagePayloads } from "./evidenceNormalization.js"; type RawRubricCriterion = { criterion: unknown; @@ -231,37 +232,6 @@ export function shouldPersistTrajectory( return !process.env.CI; } -const REDACTED_INLINE_IMAGE = "[redacted inline image payload]"; -const INLINE_IMAGE_KEYS = new Set(["screenshotBase64"]); - -function shouldRedactBase64Key(key: string, actionName?: string): boolean { - return ( - INLINE_IMAGE_KEYS.has(key) || - (actionName === "screenshot" && key === "base64") - ); -} - -function redactInlineImagePayloads( - value: unknown, - actionName?: string, -): unknown { - if (!value || typeof value !== "object") return value; - if (Buffer.isBuffer(value)) return value; - - if (Array.isArray(value)) { - return value.map((item) => redactInlineImagePayloads(item, actionName)); - } - - const out: Record = {}; - for (const [key, nested] of Object.entries(value)) { - out[key] = - shouldRedactBase64Key(key, actionName) && typeof nested === 
"string" - ? REDACTED_INLINE_IMAGE - : redactInlineImagePayloads(nested, actionName); - } - return out; -} - /** * Write the on-disk trajectory layout under `dir`: * diff --git a/packages/core/tests/unit/cua-evidence-step-tracker.test.ts b/packages/core/tests/unit/cua-evidence-step-tracker.test.ts new file mode 100644 index 0000000000..112c820a97 --- /dev/null +++ b/packages/core/tests/unit/cua-evidence-step-tracker.test.ts @@ -0,0 +1,55 @@ +import { describe, expect, it } from "vitest"; + +import { CuaEvidenceStepTracker } from "../../lib/v3/agent/utils/cuaEvidenceStepTracker.js"; + +describe("CuaEvidenceStepTracker", () => { + it("pairs a fresh provider screenshot with the next action", () => { + const tracker = new CuaEvidenceStepTracker(); + const screenshot = Buffer.from("screen"); + + const event = tracker.recordScreenshot(screenshot, "https://example.com"); + const paired = tracker.pairAction(); + + expect(event).toMatchObject({ + type: "screenshot", + stepIndex: 0, + evidenceRole: "agent", + url: "https://example.com", + }); + expect(paired).toEqual({ stepIndex: 0 }); + }); + + it("allocates an action step without screenshot evidence", () => { + const tracker = new CuaEvidenceStepTracker(); + + expect(tracker.pairAction()).toEqual({ stepIndex: 0 }); + }); + + it("replays the latest consumed screenshot for later actions", () => { + const tracker = new CuaEvidenceStepTracker(); + const screenshot = Buffer.from("screen"); + + tracker.recordScreenshot(screenshot, "https://example.com/start"); + tracker.pairAction(); + const paired = tracker.pairAction(); + + expect(paired.stepIndex).toBe(1); + expect(paired.replayScreenshot).toMatchObject({ + type: "screenshot", + stepIndex: 1, + evidenceRole: "agent", + url: "https://example.com/start", + }); + expect(paired.replayScreenshot?.screenshot).toEqual(screenshot); + }); + + it("resets step allocation and pending screenshot state", () => { + const tracker = new CuaEvidenceStepTracker(); + + 
tracker.recordScreenshot(Buffer.from("screen"), "https://example.com"); + tracker.reset(); + + expect(tracker.pairAction()).toEqual({ stepIndex: 0 }); + expect(tracker.latestScreenshotUrl).toBeUndefined(); + }); +}); diff --git a/packages/core/tests/unit/public-api/export-surface.test.ts b/packages/core/tests/unit/public-api/export-surface.test.ts index 7a1f1f65dc..163fd60094 100644 --- a/packages/core/tests/unit/public-api/export-surface.test.ts +++ b/packages/core/tests/unit/public-api/export-surface.test.ts @@ -32,6 +32,8 @@ const publicApiShape = { V3: Stagehand.V3, V3Evaluator: Stagehand.V3Evaluator, V3FunctionName: Stagehand.V3FunctionName, + buildAgentEvidenceFromStepFinished: + Stagehand.buildAgentEvidenceFromStepFinished, connectToMCPServer: Stagehand.connectToMCPServer, default: StagehandDefaultExport, defaultExtractSchema: Stagehand.defaultExtractSchema, @@ -44,12 +46,14 @@ const publicApiShape = { jsonSchemaToZod: Stagehand.jsonSchemaToZod, loadApiKeyFromEnv: Stagehand.loadApiKeyFromEnv, loadTrajectoryFromDisk: Stagehand.loadTrajectoryFromDisk, + mergeAgentEvidence: Stagehand.mergeAgentEvidence, localBrowserLaunchOptionsSchema: Stagehand.localBrowserLaunchOptionsSchema, modelToAgentProviderMap: Stagehand.modelToAgentProviderMap, nextResultFilename: Stagehand.nextResultFilename, normalizeRubric: Stagehand.normalizeRubric, pageTextSchema: Stagehand.pageTextSchema, providerEnvVarMap: Stagehand.providerEnvVarMap, + redactInlineImagePayloads: Stagehand.redactInlineImagePayloads, shouldPersistTrajectory: Stagehand.shouldPersistTrajectory, toGeminiSchema: Stagehand.toGeminiSchema, toJsonSchema: Stagehand.toJsonSchema, diff --git a/packages/evals/framework/trajectoryRecorder.ts b/packages/evals/framework/trajectoryRecorder.ts index e28626860f..da2b9b5da8 100644 --- a/packages/evals/framework/trajectoryRecorder.ts +++ b/packages/evals/framework/trajectoryRecorder.ts @@ -1,6 +1,9 @@ import fs from "node:fs/promises"; import path from "node:path"; import { + 
buildAgentEvidenceFromStepFinished, + mergeAgentEvidence, + redactInlineImagePayloads, shouldPersistTrajectory, writeTrajectoryDir, } from "@browserbasehq/stagehand"; @@ -68,20 +71,11 @@ export class TrajectoryRecorder { // Events can arrive out-of-order across step indices; same-step events all // fire in one microtask. private readonly partialSteps = new Map>(); - private readonly observationByStep = new Map< - number, - AgentStepObservedEvent - >(); - private readonly screenshotsByStep = new Map< - number, - AgentScreenshotEvidenceEvent - >(); private finalAnswerEvent?: AgentFinalAnswerEvent; private startedAt = ""; private endedAt = ""; private onScreenshot(e: AgentScreenshotEvidenceEvent): void { - this.screenshotsByStep.set(e.stepIndex, e); const partial = this.ensurePartial(e.stepIndex); // Default to probe when the emit site doesn't tag a role: matches @@ -89,7 +83,7 @@ export class TrajectoryRecorder { // NOT a probe — emitCuaActionStep fills that role post-action. const role = e.evidenceRole ?? "probe"; - if (role === "probe" || role === "agent_and_probe") { + if (role === "probe") { const probe: ProbeEvidence = { ...(partial.probeEvidence ?? {}) }; probe.screenshot = e.screenshot; probe.url = e.url; @@ -103,7 +97,7 @@ export class TrajectoryRecorder { }; } - if (role === "agent" || role === "agent_and_probe") { + if (role === "agent") { partial.agentEvidence = mergeAgentEvidence(partial.agentEvidence, { modalities: [ { type: "image", bytes: e.screenshot, mediaType: "image/png" }, @@ -124,12 +118,11 @@ export class TrajectoryRecorder { partial.finishedAt = e.finishedAt; partial.agentEvidence = mergeAgentEvidence( partial.agentEvidence, - buildAgentEvidence(e), + buildAgentEvidenceFromStepFinished(e), ); } private onStepObserved(e: AgentStepObservedEvent): void { - this.observationByStep.set(e.stepIndex, e); const partial = this.ensurePartial(e.stepIndex); const probe: ProbeEvidence = { ...(partial.probeEvidence ?? 
{}) }; probe.url = e.url; @@ -203,8 +196,6 @@ export class TrajectoryRecorder { /** Throw away in-memory state without writing to disk. Used on early abort. */ cancel(): void { this.partialSteps.clear(); - this.observationByStep.clear(); - this.screenshotsByStep.clear(); this.finalAnswerEvent = undefined; } @@ -270,8 +261,9 @@ export class TrajectoryRecorder { p.toolOutput === undefined || p.finishedAt === undefined ) { - // CUA emits screenshot-only entries between actions; skip them here - // and let writeTrajectoryDir record them via the probe channel. + // Provider-only screenshot refreshes are transport evidence for the + // next CUA action. If no action arrives for this index, there is no + // completed trajectory step to persist. continue; } out.push({ @@ -289,109 +281,3 @@ export class TrajectoryRecorder { return out; } } - -const REDACTED_INLINE_IMAGE = "[redacted inline image payload]"; -const INLINE_IMAGE_KEYS = new Set(["screenshotBase64"]); - -function shouldRedactBase64Key(key: string, actionName?: string): boolean { - return ( - INLINE_IMAGE_KEYS.has(key) || - (actionName === "screenshot" && key === "base64") - ); -} - -function collectInlineImagePayloads( - value: unknown, - actionName?: string, - out: string[] = [], -): string[] { - if (!value || typeof value !== "object") return out; - if (Buffer.isBuffer(value)) return out; - - if (Array.isArray(value)) { - for (const item of value) { - collectInlineImagePayloads(item, actionName, out); - } - return out; - } - - for (const [key, nested] of Object.entries(value)) { - if (shouldRedactBase64Key(key, actionName) && typeof nested === "string") { - out.push(nested); - continue; - } - collectInlineImagePayloads(nested, actionName, out); - } - return out; -} - -function redactInlineImagePayloads( - value: unknown, - actionName?: string, -): unknown { - if (!value || typeof value !== "object") return value; - if (Buffer.isBuffer(value)) return value; - - if (Array.isArray(value)) { - return 
value.map((item) => redactInlineImagePayloads(item, actionName)); - } - - const out: Record = {}; - for (const [key, nested] of Object.entries(value)) { - out[key] = - shouldRedactBase64Key(key, actionName) && typeof nested === "string" - ? REDACTED_INLINE_IMAGE - : redactInlineImagePayloads(nested, actionName); - } - return out; -} - -function mergeAgentEvidence( - ...parts: Array -): AgentEvidence { - return { - modalities: parts.flatMap((p) => p?.modalities ?? []), - }; -} - -function buildAgentEvidence(e: AgentStepFinishedEvent): AgentEvidence { - const modalities: AgentEvidence["modalities"] = []; - if (e.reasoning) { - modalities.push({ type: "text", content: e.reasoning }); - } - const result = e.toolOutput.result; - if (result === undefined || result === null) { - return { modalities }; - } - if (typeof result === "string") { - modalities.push({ type: "text", content: result }); - } else if (Buffer.isBuffer(result)) { - modalities.push({ - type: "image", - bytes: result, - mediaType: "image/png", - }); - } else if (typeof result === "object") { - // Vision tools embed screenshot bytes alongside JSON; lift those bytes to - // image modalities and redact the inline payloads from persisted text/json. - for (const imageBase64 of collectInlineImagePayloads( - result, - e.actionName, - )) { - try { - modalities.push({ - type: "image", - bytes: Buffer.from(imageBase64, "base64"), - mediaType: "image/png", - }); - } catch { - // Malformed base64; skip the image and keep the JSON modality. 
- } - } - modalities.push({ - type: "json", - content: redactInlineImagePayloads(result, e.actionName), - }); - } - return { modalities }; -} diff --git a/packages/evals/tests/framework/trajectoryRecorder.test.ts b/packages/evals/tests/framework/trajectoryRecorder.test.ts index 5f72dadab4..38443c5dc0 100644 --- a/packages/evals/tests/framework/trajectoryRecorder.test.ts +++ b/packages/evals/tests/framework/trajectoryRecorder.test.ts @@ -56,7 +56,14 @@ describe("TrajectoryRecorder", () => { stepIndex: 0, screenshot, url: "https://example.com/search", - evidenceRole: "agent_and_probe", + evidenceRole: "agent", + }); + recorder.record({ + type: "screenshot", + stepIndex: 0, + screenshot, + url: "https://example.com/search", + evidenceRole: "probe", }); recorder.record({ type: "step_finished", @@ -128,7 +135,14 @@ describe("TrajectoryRecorder", () => { stepIndex: 0, screenshot, url: "https://example.com/search", - evidenceRole: "agent_and_probe", + evidenceRole: "agent", + }); + recorder.record({ + type: "screenshot", + stepIndex: 0, + screenshot, + url: "https://example.com/search", + evidenceRole: "probe", }); recorder.record({ type: "step_finished", From 3dfa861419c405f8c0b7bb8d03425c8a6ef5be8c Mon Sep 17 00:00:00 2001 From: miguel Date: Fri, 22 May 2026 16:42:25 -0700 Subject: [PATCH 17/27] fix(agent): make onEvidence non-fatal by wrapping at boundary User-supplied onEvidence callbacks must never abort the agent loop. Wrap the callback once where each handler receives it; internal emit sites keep calling it as a plain await. Also unify CUA step_finished.toolOutput construction behind a shared inferCuaToolOutput helper alongside the existing inferToolOutput. 
--- .../lib/v3/agent/utils/toolOutputEvidence.ts | 11 ++++++ .../v3/agent/utils/wrapEvidenceCallback.ts | 27 +++++++++++++++ .../core/lib/v3/handlers/v3AgentHandler.ts | 5 +-- .../core/lib/v3/handlers/v3CuaAgentHandler.ts | 34 +++++-------------- 4 files changed, 50 insertions(+), 27 deletions(-) create mode 100644 packages/core/lib/v3/agent/utils/wrapEvidenceCallback.ts diff --git a/packages/core/lib/v3/agent/utils/toolOutputEvidence.ts b/packages/core/lib/v3/agent/utils/toolOutputEvidence.ts index c8806334c4..b4be376757 100644 --- a/packages/core/lib/v3/agent/utils/toolOutputEvidence.ts +++ b/packages/core/lib/v3/agent/utils/toolOutputEvidence.ts @@ -1,4 +1,5 @@ import type { AgentStepFinishedEvent } from "../../types/public/agentEvidenceEvents.js"; +import type { ActionExecutionResult } from "../../types/public/agent.js"; export function inferToolOutput( toolResult: unknown, @@ -23,3 +24,13 @@ export function inferToolOutput( error, }; } + +export function inferCuaToolOutput( + result: ActionExecutionResult | undefined, +): AgentStepFinishedEvent["toolOutput"] { + return { + ok: result?.success !== false, + result: result ?? { success: true }, + error: result?.error, + }; +} diff --git a/packages/core/lib/v3/agent/utils/wrapEvidenceCallback.ts b/packages/core/lib/v3/agent/utils/wrapEvidenceCallback.ts new file mode 100644 index 0000000000..1b35bc04c9 --- /dev/null +++ b/packages/core/lib/v3/agent/utils/wrapEvidenceCallback.ts @@ -0,0 +1,27 @@ +import type { AgentEvidenceCallback } from "../../types/public/agentEvidenceEvents.js"; +import type { LogLine } from "../../types/public/logs.js"; + +// onEvidence is a user-supplied observability hook (trajectory recording, +// verifier capture, etc.). Wrap it once at the boundary where the handler +// receives it so a throwing user callback can never abort the agent loop — +// internal emit sites can then call the wrapped callback directly without +// per-site try/catch. 
+export function wrapEvidenceCallback( + callback: AgentEvidenceCallback | undefined, + logger: (message: LogLine) => void, +): AgentEvidenceCallback | undefined { + if (!callback) return undefined; + return async (event) => { + try { + await callback(event); + } catch (e) { + logger({ + category: "agent", + message: `Warning: onEvidence callback failed for ${event.type}: ${ + e instanceof Error ? e.message : String(e) + }`, + level: 1, + }); + } + }; +} diff --git a/packages/core/lib/v3/handlers/v3AgentHandler.ts b/packages/core/lib/v3/handlers/v3AgentHandler.ts index 20a9c16a74..5281c1a70f 100644 --- a/packages/core/lib/v3/handlers/v3AgentHandler.ts +++ b/packages/core/lib/v3/handlers/v3AgentHandler.ts @@ -43,6 +43,7 @@ import { } from "../types/public/sdkErrors.js"; import { handleDoneToolCall } from "../agent/utils/handleDoneToolCall.js"; import { emitPostStepProbeEvidence } from "../agent/utils/postStepProbeEvidence.js"; +import { wrapEvidenceCallback } from "../agent/utils/wrapEvidenceCallback.js"; import { inferToolOutput } from "../agent/utils/toolOutputEvidence.js"; import { CaptchaSolver, @@ -438,7 +439,7 @@ export class V3AgentHandler { onStepFinish: this.createStepHandler( state, callbacks?.onStepFinish, - callbacks?.onEvidence, + wrapEvidenceCallback(callbacks?.onEvidence, this.logger), ), abortSignal: preparedOptions.signal, providerOptions: { @@ -578,7 +579,7 @@ export class V3AgentHandler { onStepFinish: this.createStepHandler( state, callbacks?.onStepFinish, - callbacks?.onEvidence, + wrapEvidenceCallback(callbacks?.onEvidence, this.logger), ), onError: (event) => { captchaSolver?.dispose(); diff --git a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts index ac552b5eb1..bcb92f9a00 100644 --- a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts +++ b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts @@ -8,6 +8,8 @@ import { OpenAICUAClient } from "../agent/OpenAICUAClient.js"; import { 
mapKeyToPlaywright } from "../agent/utils/cuaKeyMapping.js"; import { ensureXPath } from "../agent/utils/xpath.js"; import { emitPostStepProbeEvidence } from "../agent/utils/postStepProbeEvidence.js"; +import { wrapEvidenceCallback } from "../agent/utils/wrapEvidenceCallback.js"; +import { inferCuaToolOutput } from "../agent/utils/toolOutputEvidence.js"; import { CuaEvidenceStepTracker } from "../agent/utils/cuaEvidenceStepTracker.js"; import { ActionExecutionResult, @@ -82,7 +84,7 @@ export class V3CuaAgentHandler { const page = await this.v3.context.awaitActivePage(); const screenshotBuffer = await page.screenshot({ fullPage: false }); - await this.emitCuaScreenshotNonFatal(screenshotBuffer, page.url()); + await this.emitCuaScreenshot(screenshotBuffer, page.url()); return screenshotBuffer.toString("base64"); // base64 png }); @@ -196,7 +198,10 @@ export class V3CuaAgentHandler { : optionsOrInstruction; this.setSafetyConfirmationHandler(options.callbacks?.onSafetyConfirmation); - this.evidenceCallback = options.callbacks?.onEvidence; + this.evidenceCallback = wrapEvidenceCallback( + options.callbacks?.onEvidence, + this.logger, + ); this.cuaEvidenceSteps.reset(); this.highlightCursor = options.highlightCursor !== false; @@ -680,7 +685,7 @@ export class V3CuaAgentHandler { const currentUrl = page.url(); // Mirror the same buffer the CUA client receives as agent evidence. - await this.emitCuaScreenshotNonFatal(screenshotBuffer, currentUrl); + await this.emitCuaScreenshot(screenshotBuffer, currentUrl); return await this.agentClient.captureScreenshot({ base64Image: screenshotBuffer.toString("base64"), @@ -807,23 +812,6 @@ export class V3CuaAgentHandler { ); } - private async emitCuaScreenshotNonFatal( - screenshot: Buffer, - url: string, - ): Promise { - try { - await this.emitCuaScreenshot(screenshot, url); - } catch (e) { - this.logger({ - category: "agent", - message: `Warning: CUA screenshot evidence callback failed: ${ - e instanceof Error ? 
e.message : String(e) - }`, - level: 1, - }); - } - } - private async emitCuaActionStep( action: AgentAction, result: ActionExecutionResult | undefined, @@ -858,11 +846,7 @@ export class V3CuaAgentHandler { actionName: String(action.type), actionArgs, reasoning, - toolOutput: { - ok: result?.success !== false, - result: result ?? { success: true }, - error: result?.error, - }, + toolOutput: inferCuaToolOutput(result), finishedAt: new Date().toISOString(), }); From 4d203ca28f09618af8e06290e353261ee48f36a9 Mon Sep 17 00:00:00 2001 From: miguel Date: Fri, 22 May 2026 18:49:02 -0700 Subject: [PATCH 18/27] test(agent): update warning-message assertion to generic onEvidence label The non-fatal wrapper now logs `onEvidence callback failed for ` from a single boundary helper rather than the per-site `CUA screenshot evidence callback failed`. Update the assertion to match. --- packages/core/tests/unit/agent-captcha-hooks.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/core/tests/unit/agent-captcha-hooks.test.ts b/packages/core/tests/unit/agent-captcha-hooks.test.ts index 9cb626cf39..4789fb5c63 100644 --- a/packages/core/tests/unit/agent-captcha-hooks.test.ts +++ b/packages/core/tests/unit/agent-captcha-hooks.test.ts @@ -553,7 +553,7 @@ describe("v3 cua handler screenshot behavior", () => { ); expect( logs.some((line) => - line.message.includes("CUA screenshot evidence callback failed"), + line.message.includes("onEvidence callback failed for screenshot"), ), ).toBe(true); }); From 2418db391edd981c3adbdd57e97ad941001c3cc2 Mon Sep 17 00:00:00 2001 From: miguel Date: Fri, 22 May 2026 16:04:33 -0700 Subject: [PATCH 19/27] fix(verifier): preserve final evidence observations --- .../v3/agent/utils/postStepProbeEvidence.ts | 60 +++++--- .../lib/v3/agent/utils/toolOutputEvidence.ts | 79 ++++++++-- .../core/lib/v3/handlers/v3AgentHandler.ts | 136 +++++++++++++----- .../core/lib/v3/handlers/v3CuaAgentHandler.ts | 39 +++-- 
.../v3/types/public/agentEvidenceEvents.ts | 21 +++ .../lib/v3/verifier/evidenceNormalization.ts | 6 + packages/core/lib/v3/verifier/trajectory.ts | 32 ++++- packages/core/lib/v3/verifier/types.ts | 4 +- packages/core/lib/v3Evaluator.ts | 6 + .../tests/unit/tool-output-evidence.test.ts | 58 ++++++++ packages/core/tests/unit/v3-evaluator.test.ts | 33 +++++ .../verifier-evidence-normalization.test.ts | 20 +++ .../tests/unit/verifier-trajectory.test.ts | 17 +++ .../evals/framework/trajectoryRecorder.ts | 24 +++- .../framework/trajectoryRecorder.test.ts | 53 +++++++ 15 files changed, 507 insertions(+), 81 deletions(-) create mode 100644 packages/core/tests/unit/tool-output-evidence.test.ts create mode 100644 packages/core/tests/unit/verifier-evidence-normalization.test.ts diff --git a/packages/core/lib/v3/agent/utils/postStepProbeEvidence.ts b/packages/core/lib/v3/agent/utils/postStepProbeEvidence.ts index de9cd9d044..10889d6cf8 100644 --- a/packages/core/lib/v3/agent/utils/postStepProbeEvidence.ts +++ b/packages/core/lib/v3/agent/utils/postStepProbeEvidence.ts @@ -3,32 +3,32 @@ import type { LogLine } from "../../types/public/logs.js"; import type { V3 } from "../../v3.js"; import { captureAriaTreeProbe } from "./captureAriaTreeProbe.js"; -interface EmitPostStepProbeEvidenceOptions { +interface CaptureProbeEvidenceOptions { v3: V3; - stepIndices: number | number[]; url: string; - evidenceCallback?: AgentEvidenceCallback; logger: (message: LogLine) => void; warningMessage: string; } +interface EmitPostStepProbeEvidenceOptions extends CaptureProbeEvidenceOptions { + stepIndices: number | number[]; + evidenceCallback?: AgentEvidenceCallback; +} + function errorMessage(error: unknown): string { return error instanceof Error ? 
error.message : String(error); } -export async function emitPostStepProbeEvidence({ +export async function captureProbeEvidence({ v3, - stepIndices, url, - evidenceCallback, logger, warningMessage, -}: EmitPostStepProbeEvidenceOptions): Promise { - if (!evidenceCallback) return; - - const indices = Array.isArray(stepIndices) ? stepIndices : [stepIndices]; - if (indices.length === 0) return; - +}: CaptureProbeEvidenceOptions): Promise<{ + url: string; + screenshot?: Buffer; + ariaTree?: string; +}> { let probeUrl = url; let screenshot: Buffer | undefined; try { @@ -44,21 +44,47 @@ export async function emitPostStepProbeEvidence({ } const ariaTree = await captureAriaTreeProbe(v3); + return { + url: probeUrl, + ...(screenshot ? { screenshot } : {}), + ...(ariaTree !== undefined ? { ariaTree } : {}), + }; +} + +export async function emitPostStepProbeEvidence({ + v3, + stepIndices, + url, + evidenceCallback, + logger, + warningMessage, +}: EmitPostStepProbeEvidenceOptions): Promise { + if (!evidenceCallback) return; + + const indices = Array.isArray(stepIndices) ? 
stepIndices : [stepIndices]; + if (indices.length === 0) return; + + const probe = await captureProbeEvidence({ + v3, + url, + logger, + warningMessage, + }); for (const stepIndex of indices) { - if (screenshot) { + if (probe.screenshot) { await evidenceCallback({ type: "screenshot", stepIndex, - screenshot, - url: probeUrl, + screenshot: probe.screenshot, + url: probe.url, evidenceRole: "probe", }); } await evidenceCallback({ type: "step_observed", stepIndex, - url: probeUrl, - ariaTree, + url: probe.url, + ariaTree: probe.ariaTree, }); } } diff --git a/packages/core/lib/v3/agent/utils/toolOutputEvidence.ts b/packages/core/lib/v3/agent/utils/toolOutputEvidence.ts index b4be376757..9718181479 100644 --- a/packages/core/lib/v3/agent/utils/toolOutputEvidence.ts +++ b/packages/core/lib/v3/agent/utils/toolOutputEvidence.ts @@ -1,25 +1,76 @@ import type { AgentStepFinishedEvent } from "../../types/public/agentEvidenceEvents.js"; import type { ActionExecutionResult } from "../../types/public/agent.js"; +const ERROR_STRING_LIMIT = 1000; + +function isRecord(value: unknown): value is Record { + return value !== null && typeof value === "object" && !Array.isArray(value); +} + +function hasOwn(value: Record, key: string): boolean { + return Object.prototype.hasOwnProperty.call(value, key); +} + +function normalizeError(value: unknown): string | undefined { + if (value === undefined || value === null || value === false) { + return undefined; + } + if (value instanceof Error) { + return value.message; + } + if (typeof value === "string") { + return value; + } + if ( + typeof value === "number" || + typeof value === "boolean" || + typeof value === "bigint" + ) { + return String(value); + } + + let serialized: string; + try { + serialized = JSON.stringify(value) ?? String(value); + } catch { + serialized = String(value); + } + if (serialized.length <= ERROR_STRING_LIMIT) { + return serialized; + } + return `${serialized.slice(0, ERROR_STRING_LIMIT)}... 
[truncated]`; +} + +function statusCandidates(toolResult: unknown): Record[] { + if (!isRecord(toolResult)) { + return []; + } + + const candidates = [toolResult]; + const output = toolResult.output; + if (isRecord(output)) { + candidates.push(output); + } + return candidates; +} + export function inferToolOutput( toolResult: unknown, ): AgentStepFinishedEvent["toolOutput"] { - const error = - toolResult && - typeof toolResult === "object" && - "error" in toolResult && - typeof (toolResult as { error?: unknown }).error === "string" - ? (toolResult as { error: string }).error - : undefined; - - const isError = - toolResult && - typeof toolResult === "object" && - "isError" in toolResult && - Boolean((toolResult as { isError?: unknown }).isError); + const candidates = statusCandidates(toolResult); + const error = candidates + .map((candidate) => + hasOwn(candidate, "error") ? normalizeError(candidate.error) : undefined, + ) + .find((message): message is string => message !== undefined); + + const successFalse = candidates.some( + (candidate) => candidate.success === false, + ); + const isError = candidates.some((candidate) => Boolean(candidate.isError)); return { - ok: error === undefined && !isError, + ok: error === undefined && !isError && !successFalse, result: toolResult, error, }; diff --git a/packages/core/lib/v3/handlers/v3AgentHandler.ts b/packages/core/lib/v3/handlers/v3AgentHandler.ts index 5281c1a70f..4481c3dc68 100644 --- a/packages/core/lib/v3/handlers/v3AgentHandler.ts +++ b/packages/core/lib/v3/handlers/v3AgentHandler.ts @@ -42,7 +42,10 @@ import { AgentAbortError, } from "../types/public/sdkErrors.js"; import { handleDoneToolCall } from "../agent/utils/handleDoneToolCall.js"; -import { emitPostStepProbeEvidence } from "../agent/utils/postStepProbeEvidence.js"; +import { + captureProbeEvidence, + emitPostStepProbeEvidence, +} from "../agent/utils/postStepProbeEvidence.js"; import { wrapEvidenceCallback } from "../agent/utils/wrapEvidenceCallback.js"; 
import { inferToolOutput } from "../agent/utils/toolOutputEvidence.js"; import { @@ -55,6 +58,19 @@ function getErrorMessage(error: unknown): string { return error instanceof Error ? error.message : String(error); } +type FinalAnswerDraft = { + message: string; + output?: Record; +}; + +interface StepHandlerOptions { + userCallback?: + | GenerateTextOnStepFinishCallback + | StreamTextOnStepFinishCallback; + evidenceCallback?: AgentEvidenceCallback; + onFinalAnswer?: (answer: FinalAnswerDraft) => void; +} + /** * Prepends a system message with cache control to the messages array. * The cache control providerOptions are used by Anthropic and ignored by other providers. @@ -248,10 +264,7 @@ export class V3AgentHandler { private createStepHandler( state: AgentState, - userCallback?: - | GenerateTextOnStepFinishCallback - | StreamTextOnStepFinishCallback, - evidenceCallback?: AgentEvidenceCallback, + { userCallback, evidenceCallback, onFinalAnswer }: StepHandlerOptions, ) { // Monotonic step counter scoped to this execute() call. Each tool call in // the agent loop becomes one trajectory step. The counter feeds stepIndex @@ -265,9 +278,7 @@ export class V3AgentHandler { }); const stepIndicesInTurn: number[] = []; - let lastFinalAnswer: - | { message: string; output?: Record } - | undefined; + let lastFinalAnswer: FinalAnswerDraft | undefined; if (event.toolCalls && event.toolCalls.length > 0) { for (let i = 0; i < event.toolCalls.length; i++) { @@ -316,6 +327,7 @@ export class V3AgentHandler { const stepIndex = stepCounter++; stepIndicesInTurn.push(stepIndex); + const finishedAt = new Date().toISOString(); await evidenceCallback?.({ type: "step_finished", stepIndex, @@ -326,7 +338,7 @@ export class V3AgentHandler { : {}, reasoning: event.text ?? 
"", toolOutput: inferToolOutput(toolResult), - finishedAt: new Date().toISOString(), + finishedAt, }); } state.currentPageUrl = (await this.v3.context.awaitActivePage()).url(); @@ -347,10 +359,7 @@ export class V3AgentHandler { } if (lastFinalAnswer) { - await evidenceCallback?.({ - type: "final_answer", - ...lastFinalAnswer, - }); + onFinalAnswer?.(lastFinalAnswer); } if (userCallback) { @@ -378,6 +387,7 @@ export class V3AgentHandler { completed: false, currentPageUrl: "", }; + let finalAnswerFromDoneTool: FinalAnswerDraft | undefined; let messages: ModelMessage[] = []; let captchaSolver: CaptchaSolver | undefined; @@ -425,6 +435,11 @@ export class V3AgentHandler { } } + const evidenceCallback = wrapEvidenceCallback( + callbacks?.onEvidence, + this.logger, + ); + const result = await this.llmClient.generateText({ model: wrappedModel, messages: prependSystemMessage(systemPrompt, messages), @@ -436,11 +451,13 @@ export class V3AgentHandler { callbacks?.prepareStep, captchaSolver, ), - onStepFinish: this.createStepHandler( - state, - callbacks?.onStepFinish, - wrapEvidenceCallback(callbacks?.onEvidence, this.logger), - ), + onStepFinish: this.createStepHandler(state, { + userCallback: callbacks?.onStepFinish, + evidenceCallback, + onFinalAnswer: (answer) => { + finalAnswerFromDoneTool = answer; + }, + }), abortSignal: preparedOptions.signal, providerOptions: { google: { mediaResolution: "MEDIA_RESOLUTION_HIGH" }, @@ -457,6 +474,15 @@ export class V3AgentHandler { preparedOptions.output, this.logger, ); + const output = doneResult.output ?? 
finalAnswerFromDoneTool?.output; + await this.emitFinalEvidence( + state, + { + message: state.finalMessage, + output, + }, + evidenceCallback, + ); return this.consolidateMetricsAndResult( startTime, @@ -464,7 +490,7 @@ export class V3AgentHandler { doneResult.messages, result, maxSteps, - doneResult.output, + output, ); } catch (error) { // Re-throw validation errors that should propagate to the caller @@ -510,6 +536,7 @@ export class V3AgentHandler { // Highlight cursor defaults to true for hybrid mode, can be overridden const shouldHighlightCursor = streamOptions?.highlightCursor ?? this.mode === "hybrid"; + let finalAnswerFromDoneTool: FinalAnswerDraft | undefined; const { options, @@ -564,6 +591,11 @@ export class V3AgentHandler { rejectResult(error); }; + const evidenceCallback = wrapEvidenceCallback( + callbacks?.onEvidence, + this.logger, + ); + let streamResult: ReturnType; try { streamResult = this.llmClient.streamText({ @@ -576,11 +608,13 @@ export class V3AgentHandler { callbacks?.prepareStep, captchaSolver, ), - onStepFinish: this.createStepHandler( - state, - callbacks?.onStepFinish, - wrapEvidenceCallback(callbacks?.onEvidence, this.logger), - ), + onStepFinish: this.createStepHandler(state, { + userCallback: callbacks?.onStepFinish, + evidenceCallback, + onFinalAnswer: (answer) => { + finalAnswerFromDoneTool = answer; + }, + }), onError: (event) => { captchaSolver?.dispose(); if (callbacks?.onError) { @@ -606,17 +640,29 @@ export class V3AgentHandler { options.instruction, options.output, this.logger, - ).then((doneResult) => { - const result = this.consolidateMetricsAndResult( - startTime, - state, - doneResult.messages, - event, - maxSteps, - doneResult.output, - ); - resolveResult(result); - }); + ) + .then(async (doneResult) => { + const output = + doneResult.output ?? 
finalAnswerFromDoneTool?.output; + await this.emitFinalEvidence( + state, + { + message: state.finalMessage, + output, + }, + evidenceCallback, + ); + const result = this.consolidateMetricsAndResult( + startTime, + state, + doneResult.messages, + event, + maxSteps, + output, + ); + resolveResult(result); + }) + .catch(handleError); }, onAbort: (event) => { captchaSolver?.dispose(); @@ -645,6 +691,26 @@ export class V3AgentHandler { return agentStreamResult; } + private async emitFinalEvidence( + state: AgentState, + finalAnswer: { message: string; output?: Record }, + evidenceCallback?: AgentEvidenceCallback, + ): Promise { + if (!evidenceCallback) return; + + const observation = await captureProbeEvidence({ + v3: this.v3, + url: state.currentPageUrl, + logger: this.logger, + warningMessage: "Warning: final harness probe failed", + }); + await evidenceCallback({ + type: "final_answer", + ...finalAnswer, + observation, + }); + } + private consolidateMetricsAndResult( startTime: number, state: AgentState, diff --git a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts index bcb92f9a00..a41aa7fd07 100644 --- a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts +++ b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts @@ -7,10 +7,13 @@ import { GoogleCUAClient } from "../agent/GoogleCUAClient.js"; import { OpenAICUAClient } from "../agent/OpenAICUAClient.js"; import { mapKeyToPlaywright } from "../agent/utils/cuaKeyMapping.js"; import { ensureXPath } from "../agent/utils/xpath.js"; -import { emitPostStepProbeEvidence } from "../agent/utils/postStepProbeEvidence.js"; +import { + captureProbeEvidence, + emitPostStepProbeEvidence, +} from "../agent/utils/postStepProbeEvidence.js"; import { wrapEvidenceCallback } from "../agent/utils/wrapEvidenceCallback.js"; -import { inferCuaToolOutput } from "../agent/utils/toolOutputEvidence.js"; import { CuaEvidenceStepTracker } from "../agent/utils/cuaEvidenceStepTracker.js"; 
+import { inferToolOutput } from "../agent/utils/toolOutputEvidence.js"; import { ActionExecutionResult, AgentAction, @@ -131,6 +134,7 @@ export class V3CuaAgentHandler { defaultDelay; try { let executionResult: ActionExecutionResult | undefined; + const startedAt = new Date().toISOString(); // Try to inject cursor before each action if enabled if (this.highlightCursor) { try { @@ -161,7 +165,7 @@ export class V3CuaAgentHandler { action.timestamp = Date.now(); if (shouldLog) { - await this.emitCuaActionStep(action, executionResult); + await this.emitCuaActionStep(action, executionResult, startedAt); } await new Promise((r) => setTimeout(r, waitBetween)); @@ -258,11 +262,26 @@ export class V3CuaAgentHandler { let result: AgentResult; try { result = await this.agent.execute({ options, logger: this.logger }); - await this.evidenceCallback?.({ - type: "final_answer", - message: result.message, - output: result.output, - }); + if (this.evidenceCallback) { + let finalUrl = ""; + try { + finalUrl = (await this.v3.context.awaitActivePage()).url(); + } catch { + finalUrl = this.cuaEvidenceSteps.latestScreenshotUrl ?? ""; + } + const observation = await captureProbeEvidence({ + v3: this.v3, + url: finalUrl, + logger: this.logger, + warningMessage: "Warning: CUA final probe failed", + }); + await this.evidenceCallback({ + type: "final_answer", + message: result.message, + output: result.output, + observation, + }); + } } finally { this.evidenceCallback = undefined; this.captchaSolver?.dispose(); @@ -815,6 +834,7 @@ export class V3CuaAgentHandler { private async emitCuaActionStep( action: AgentAction, result: ActionExecutionResult | undefined, + startedAt: string, ): Promise { let pageUrl = typeof action.pageUrl === "string" @@ -846,7 +866,8 @@ export class V3CuaAgentHandler { actionName: String(action.type), actionArgs, reasoning, - toolOutput: inferCuaToolOutput(result), + toolOutput: inferToolOutput(result ?? 
{ success: true }), + startedAt, finishedAt: new Date().toISOString(), }); diff --git a/packages/core/lib/v3/types/public/agentEvidenceEvents.ts b/packages/core/lib/v3/types/public/agentEvidenceEvents.ts index cf8e560779..23f90a5ef2 100644 --- a/packages/core/lib/v3/types/public/agentEvidenceEvents.ts +++ b/packages/core/lib/v3/types/public/agentEvidenceEvents.ts @@ -52,6 +52,8 @@ export interface AgentStepFinishedEvent { result: unknown; error?: string; }; + /** ISO 8601 timestamp at which the step's tool execution started, when available. */ + startedAt?: string; /** ISO 8601 timestamp at which the step finished. */ finishedAt: string; } @@ -70,6 +72,17 @@ export interface AgentStepObservedEvent { scroll?: { top: number; pageHeight: number }; } +export interface AgentFinalObservation { + /** Page URL at the time of terminal capture. */ + url: string; + /** PNG bytes from page.screenshot(), when capture succeeds. */ + screenshot?: Buffer; + /** Accessibility tree snapshot, when captured. */ + ariaTree?: string; + /** Viewport scroll context, when captured. */ + scroll?: { top: number; pageHeight: number }; +} + /** Final answer emitted by the agent, when available. */ export interface AgentFinalAnswerEvent { type: "final_answer"; @@ -77,6 +90,14 @@ export interface AgentFinalAnswerEvent { message: string; /** Optional structured output if the agent's output schema was set. */ output?: Record; + /** + * Independent terminal browser observation captured after the agent finishes. + * + * This preserves the legacy verifier behavior of evaluating against a final + * page screenshot even when the last agent output is a final answer rather + * than a browser action. 
+ */ + observation?: AgentFinalObservation; } export type AgentEvidenceCallback = ( diff --git a/packages/core/lib/v3/verifier/evidenceNormalization.ts b/packages/core/lib/v3/verifier/evidenceNormalization.ts index 0012e84d6e..486ca68e63 100644 --- a/packages/core/lib/v3/verifier/evidenceNormalization.ts +++ b/packages/core/lib/v3/verifier/evidenceNormalization.ts @@ -81,6 +81,12 @@ export function buildAgentEvidenceFromStepFinished( if (typeof result === "string") { modalities.push({ type: "text", content: result }); + } else if ( + typeof result === "number" || + typeof result === "boolean" || + typeof result === "bigint" + ) { + modalities.push({ type: "text", content: String(result) }); } else if (Buffer.isBuffer(result)) { modalities.push({ type: "image", diff --git a/packages/core/lib/v3/verifier/trajectory.ts b/packages/core/lib/v3/verifier/trajectory.ts index 75e1372bbe..413ecc15fd 100644 --- a/packages/core/lib/v3/verifier/trajectory.ts +++ b/packages/core/lib/v3/verifier/trajectory.ts @@ -21,6 +21,10 @@ type RawRubric = { items?: unknown; }; +type PersistedProbeEvidence = ProbeEvidence & { + screenshotPath?: string; +}; + /** * Convert dataset or generated rubric JSON into the public Stagehand shape. 
* Snake-case dataset fields are accepted here so serialized quirks do not leak @@ -109,6 +113,7 @@ export async function loadTrajectoryFromDisk(dir: string): Promise { const trajectoryPath = path.join(trajectoryDir, "trajectory.json"); const raw = await fs.readFile(trajectoryPath, "utf8"); const parsed = JSON.parse(raw) as Trajectory & { + finalObservation?: PersistedProbeEvidence; steps: Array< TrajectoryStep & { agentEvidence: { @@ -126,7 +131,7 @@ export async function loadTrajectoryFromDisk(dir: string): Promise { | { type: "json"; content: unknown } >; }; - probeEvidence: ProbeEvidence; + probeEvidence: PersistedProbeEvidence; } >; }; @@ -151,9 +156,9 @@ export async function loadTrajectoryFromDisk(dir: string): Promise { return resolved; }; - for (const step of parsed.steps) { - // Rehydrate tier-2 probe screenshot from its on-disk file reference. - const probe = step.probeEvidence; + const hydrateProbeScreenshot = async ( + probe: PersistedProbeEvidence | undefined, + ): Promise => { if (probe?.screenshotPath && !probe.screenshot) { const resolved = resolveWithinTrajectoryDir(probe.screenshotPath); try { @@ -163,6 +168,11 @@ export async function loadTrajectoryFromDisk(dir: string): Promise { // evidence_insufficient path will handle it. } } + }; + + for (const step of parsed.steps) { + // Rehydrate tier-2 probe screenshot from its on-disk file reference. + await hydrateProbeScreenshot(step.probeEvidence); // Decode image modalities from disk references back to Buffer. if (step.agentEvidence?.modalities) { @@ -205,6 +215,8 @@ export async function loadTrajectoryFromDisk(dir: string): Promise { } } + await hydrateProbeScreenshot(parsed.finalObservation); + return parsed; } @@ -308,11 +320,23 @@ export async function writeTrajectoryDir( }); } + const finalObservation: ProbeEvidence | undefined = + trajectory.finalObservation === undefined + ? 
undefined + : { ...trajectory.finalObservation }; + if (finalObservation?.screenshot) { + const relPath = "screenshots/probe/final.png"; + await fs.writeFile(path.join(dir, relPath), finalObservation.screenshot); + finalObservation.screenshotPath = relPath; + delete finalObservation.screenshot; + } + // Image modalities carry imagePath instead of raw bytes on disk; cast // through unknown rather than widen Trajectory's type contract. const serialized = { ...trajectory, steps: serializableSteps, + ...(finalObservation ? { finalObservation } : {}), } as unknown; await fs.writeFile( diff --git a/packages/core/lib/v3/verifier/types.ts b/packages/core/lib/v3/verifier/types.ts index 04addaf9a4..c51ea2d47e 100644 --- a/packages/core/lib/v3/verifier/types.ts +++ b/packages/core/lib/v3/verifier/types.ts @@ -136,7 +136,7 @@ export type TrajectoryStatus = "complete" | "aborted" | "stalled" | "error"; * .trajectories/// * ├── task_data.json — TaskSpec + result metadata * ├── trajectory.json — this object, with screenshotPath instead of bytes - * ├── screenshot_1.png — probeEvidence.screenshot for step 1, etc. + * ├── screenshots/ — step probe/agent images plus final observation * ├── scores/ * │ └── result.json — Result from V3Evaluator.verify() * ├── core.log — captured action log @@ -146,6 +146,8 @@ export interface Trajectory { task: TaskSpec; steps: TrajectoryStep[]; finalAnswer?: string; + /** Terminal page observation captured after the agent finishes. 
*/ + finalObservation?: ProbeEvidence; status: TrajectoryStatus; usage: TrajectoryUsage; timing: { startedAt: string; endedAt: string }; diff --git a/packages/core/lib/v3Evaluator.ts b/packages/core/lib/v3Evaluator.ts index 5294f6c508..ee1bbc6d35 100644 --- a/packages/core/lib/v3Evaluator.ts +++ b/packages/core/lib/v3Evaluator.ts @@ -227,6 +227,10 @@ function collectLegacyScreenshots(trajectory: Trajectory): Buffer[] { } } + if (Buffer.isBuffer(trajectory.finalObservation?.screenshot)) { + screenshots.push(trajectory.finalObservation.screenshot); + } + return screenshots; } @@ -234,12 +238,14 @@ function renderLegacyAgentReasoning( trajectory: Trajectory, ): string | undefined { const stepLines = (trajectory.steps ?? []).map((step) => { + const status = step.toolOutput?.ok === false ? "Tool status: failed" : ""; const output = step.toolOutput?.error ? `Tool error: ${step.toolOutput.error}` : `Tool output: ${stringifyForPrompt(step.toolOutput?.result)}`; return [ `Step ${step.index}: ${step.actionName}`, step.reasoning ? 
`Reasoning: ${step.reasoning}` : undefined, + status || undefined, output, ] .filter(Boolean) diff --git a/packages/core/tests/unit/tool-output-evidence.test.ts b/packages/core/tests/unit/tool-output-evidence.test.ts new file mode 100644 index 0000000000..87b01d7529 --- /dev/null +++ b/packages/core/tests/unit/tool-output-evidence.test.ts @@ -0,0 +1,58 @@ +import { describe, expect, it } from "vitest"; + +import { inferToolOutput } from "../../lib/v3/agent/utils/toolOutputEvidence.js"; + +describe("inferToolOutput", () => { + it("preserves raw results while normalizing top-level failure status", () => { + const result = { success: false }; + + expect(inferToolOutput(result)).toEqual({ + ok: false, + result, + error: undefined, + }); + }); + + it("normalizes one-level AI SDK output wrappers", () => { + const result = { + toolCallId: "call-1", + output: { success: false, error: { message: "not found" } }, + }; + + expect(inferToolOutput(result)).toEqual({ + ok: false, + result, + error: '{"message":"not found"}', + }); + }); + + it("handles isError and non-string errors", () => { + const result = { isError: true, error: new Error("bad input") }; + + expect(inferToolOutput(result)).toEqual({ + ok: false, + result, + error: "bad input", + }); + }); + + it("normalizes non-json error values", () => { + const result = { error: Symbol("bad input") }; + + expect(inferToolOutput(result)).toEqual({ + ok: false, + result, + error: "Symbol(bad input)", + }); + }); + + it("does not recursively treat page data as tool status", () => { + const result = { data: { success: false, error: "page field" } }; + + expect(inferToolOutput(result)).toEqual({ + ok: true, + result, + error: undefined, + }); + }); +}); diff --git a/packages/core/tests/unit/v3-evaluator.test.ts b/packages/core/tests/unit/v3-evaluator.test.ts index 1e9e3a0f19..2f488e5a8d 100644 --- a/packages/core/tests/unit/v3-evaluator.test.ts +++ b/packages/core/tests/unit/v3-evaluator.test.ts @@ -82,6 +82,39 @@ 
describe("V3Evaluator verifier facade", () => { expect(result.perCriterion).toBeUndefined(); }); + it("passes final observation screenshots to the legacy verifier adapter", async () => { + const taskSpec: TaskSpec = { + id: "final-observation", + instruction: "Complete the task", + }; + const finalScreenshot = Buffer.from("final screenshot"); + const trajectory = { + ...makeTrajectory(taskSpec), + finalObservation: { + url: "https://example.com/done", + screenshot: finalScreenshot, + }, + }; + const ask = vi.fn().mockResolvedValue({ + evaluation: "YES", + reasoning: "The final screenshot shows completion.", + }); + const evaluator = new V3Evaluator({} as V3, { + backend: "legacy", + }); + Object.defineProperty(evaluator, "legacyEvaluator", { + value: { ask }, + }); + + await evaluator.verify(trajectory); + + expect(ask).toHaveBeenCalledWith( + expect.objectContaining({ + screenshot: [finalScreenshot], + }), + ); + }); + it("keeps legacy tool output detail until the overall reasoning budget is reached", async () => { const taskSpec: TaskSpec = { id: "reasoning-budget", diff --git a/packages/core/tests/unit/verifier-evidence-normalization.test.ts b/packages/core/tests/unit/verifier-evidence-normalization.test.ts new file mode 100644 index 0000000000..7bf0d59b5a --- /dev/null +++ b/packages/core/tests/unit/verifier-evidence-normalization.test.ts @@ -0,0 +1,20 @@ +import { describe, expect, it } from "vitest"; + +import { buildAgentEvidenceFromStepFinished } from "../../lib/v3/verifier/evidenceNormalization.js"; + +describe("buildAgentEvidenceFromStepFinished", () => { + it("captures primitive tool results as text evidence", () => { + const evidence = buildAgentEvidenceFromStepFinished({ + type: "step_finished", + stepIndex: 0, + actionName: "check", + actionArgs: {}, + reasoning: "", + toolOutput: { ok: true, result: false }, + startedAt: new Date(0).toISOString(), + finishedAt: new Date(1).toISOString(), + }); + + expect(evidence.modalities).toEqual([{ type: "text", 
content: "false" }]); + }); +}); diff --git a/packages/core/tests/unit/verifier-trajectory.test.ts b/packages/core/tests/unit/verifier-trajectory.test.ts index 7c9351d135..752d01beed 100644 --- a/packages/core/tests/unit/verifier-trajectory.test.ts +++ b/packages/core/tests/unit/verifier-trajectory.test.ts @@ -64,8 +64,10 @@ describe("verifier trajectory utilities", () => { it("loads trajectory screenshots and image modalities from disk", async () => { const dir = await mkdtemp(path.join(tmpdir(), "stagehand-verifier-")); const screenshot = Buffer.from("probe screenshot"); + const finalScreenshot = Buffer.from("final screenshot"); const agentImage = Buffer.from("agent image"); await writeFile(path.join(dir, "screenshot_1.png"), screenshot); + await writeFile(path.join(dir, "final.png"), finalScreenshot); await mkdir(path.join(dir, "screenshots", "agent"), { recursive: true }); await writeFile( path.join(dir, "screenshots", "agent", "1.png"), @@ -102,6 +104,10 @@ describe("verifier trajectory utilities", () => { finishedAt: new Date(0).toISOString(), }, ], + finalObservation: { + url: "https://example.com/done", + screenshotPath: "final.png", + }, }), ); @@ -109,6 +115,7 @@ describe("verifier trajectory utilities", () => { const modality = trajectory.steps[0].agentEvidence.modalities[0]; expect(trajectory.steps[0].probeEvidence.screenshot).toEqual(screenshot); + expect(trajectory.finalObservation?.screenshot).toEqual(finalScreenshot); expect(modality.type).toBe("image"); if (modality.type === "image") { expect(modality.bytes).toEqual(agentImage); @@ -174,6 +181,10 @@ describe("verifier trajectory utilities", () => { startedAt: new Date(0).toISOString(), endedAt: new Date(0).toISOString(), }, + finalObservation: { + url: "https://example.com/done", + screenshot: Buffer.from("final screenshot"), + }, steps: [ { index: 0, @@ -220,6 +231,12 @@ describe("verifier trajectory utilities", () => { expect(trajectory.steps[0].toolOutput.result.output.screenshotBase64).toBe( 
"[redacted inline image payload]", ); + expect(trajectory.finalObservation.screenshotPath).toBe( + "screenshots/probe/final.png", + ); + await expect( + readFile(path.join(dir, "screenshots", "probe", "final.png")), + ).resolves.toEqual(Buffer.from("final screenshot")); }); it("rejects screenshot paths outside the trajectory directory", async () => { diff --git a/packages/evals/framework/trajectoryRecorder.ts b/packages/evals/framework/trajectoryRecorder.ts index da2b9b5da8..c48450908a 100644 --- a/packages/evals/framework/trajectoryRecorder.ts +++ b/packages/evals/framework/trajectoryRecorder.ts @@ -31,6 +31,7 @@ interface PartialStep { agentEvidence: AgentEvidence; probeEvidence: ProbeEvidence; toolOutput: { ok: boolean; result: unknown; error?: string }; + startedAt: string; finishedAt: string; } @@ -72,6 +73,7 @@ export class TrajectoryRecorder { // fire in one microtask. private readonly partialSteps = new Map>(); private finalAnswerEvent?: AgentFinalAnswerEvent; + private finalObservation?: ProbeEvidence; private startedAt = ""; private endedAt = ""; @@ -115,6 +117,7 @@ export class TrajectoryRecorder { ...e.toolOutput, result: redactInlineImagePayloads(e.toolOutput.result, e.actionName), }; + partial.startedAt = e.startedAt ?? e.finishedAt; partial.finishedAt = e.finishedAt; partial.agentEvidence = mergeAgentEvidence( partial.agentEvidence, @@ -133,6 +136,20 @@ export class TrajectoryRecorder { private onFinalAnswer(e: AgentFinalAnswerEvent): void { this.finalAnswerEvent = e; + if (e.observation) { + this.finalObservation = { + url: e.observation.url, + ...(e.observation.screenshot + ? { screenshot: e.observation.screenshot } + : {}), + ...(e.observation.ariaTree !== undefined + ? { ariaTree: e.observation.ariaTree } + : {}), + ...(e.observation.scroll !== undefined + ? 
{ scroll: e.observation.scroll } + : {}), + }; + } } constructor(opts: TrajectoryRecorderOptions) { @@ -181,6 +198,9 @@ export class TrajectoryRecorder { task: this.taskSpec, steps, finalAnswer: opts.finalAnswer ?? this.finalAnswerEvent?.message, + ...(this.finalObservation + ? { finalObservation: this.finalObservation } + : {}), status: opts.status, usage: { ...ZERO_USAGE, ...(opts.usage ?? {}) }, timing: { startedAt: this.startedAt, endedAt: this.endedAt }, @@ -197,6 +217,7 @@ export class TrajectoryRecorder { cancel(): void { this.partialSteps.clear(); this.finalAnswerEvent = undefined; + this.finalObservation = undefined; } /** Where the trajectory dir lives (whether or not it was persisted). */ @@ -259,6 +280,7 @@ export class TrajectoryRecorder { if ( p.actionName === undefined || p.toolOutput === undefined || + p.startedAt === undefined || p.finishedAt === undefined ) { // Provider-only screenshot refreshes are transport evidence for the @@ -274,7 +296,7 @@ export class TrajectoryRecorder { agentEvidence: p.agentEvidence ?? { modalities: [] }, probeEvidence: p.probeEvidence ?? 
{}, toolOutput: p.toolOutput, - startedAt: this.startedAt, + startedAt: p.startedAt, finishedAt: p.finishedAt, }); } diff --git a/packages/evals/tests/framework/trajectoryRecorder.test.ts b/packages/evals/tests/framework/trajectoryRecorder.test.ts index 38443c5dc0..743d3b4ecd 100644 --- a/packages/evals/tests/framework/trajectoryRecorder.test.ts +++ b/packages/evals/tests/framework/trajectoryRecorder.test.ts @@ -75,6 +75,7 @@ describe("TrajectoryRecorder", () => { ok: true, result: { economy: "$100", business: "$250" }, }, + startedAt: new Date(0).toISOString(), finishedAt: new Date(0).toISOString(), }); recorder.record({ @@ -86,6 +87,11 @@ describe("TrajectoryRecorder", () => { recorder.record({ type: "final_answer", message: "Business is $150 more than economy.", + observation: { + url: "https://example.com/checkout", + screenshot: Buffer.from("final-screen"), + ariaTree: "RootWebArea\nStaticText: Complete", + }, }); const trajectory = await recorder.finish({ @@ -109,6 +115,7 @@ describe("TrajectoryRecorder", () => { }, }); expect(trajectory.steps[0].probeEvidence.screenshot).toEqual(screenshot); + expect(trajectory.steps[0].startedAt).toBe(new Date(0).toISOString()); expect(trajectory.steps[0].agentEvidence.modalities).toEqual( expect.arrayContaining([ { type: "image", bytes: screenshot, mediaType: "image/png" }, @@ -117,6 +124,13 @@ describe("TrajectoryRecorder", () => { ]), ); expect(trajectory.finalAnswer).toBe("Business is $150 more than economy."); + expect(trajectory.finalObservation).toMatchObject({ + url: "https://example.com/checkout", + ariaTree: "RootWebArea\nStaticText: Complete", + }); + expect(trajectory.finalObservation?.screenshot).toEqual( + Buffer.from("final-screen"), + ); }); it("persists trajectory files and evaluator results", async () => { @@ -151,6 +165,7 @@ describe("TrajectoryRecorder", () => { actionArgs: { instruction: "Search fares" }, reasoning: "Search for fares.", toolOutput: { ok: true, result: "done" }, + startedAt: new 
Date(0).toISOString(), finishedAt: new Date(0).toISOString(), }); recorder.record({ @@ -158,6 +173,14 @@ describe("TrajectoryRecorder", () => { stepIndex: 0, url: "https://example.com/search", }); + recorder.record({ + type: "final_answer", + message: "Complete.", + observation: { + url: "https://example.com/complete", + screenshot: Buffer.from("final-screen"), + }, + }); await recorder.finish({ status: "complete" }); await recorder.persistResult({ @@ -179,6 +202,9 @@ describe("TrajectoryRecorder", () => { await expect( fs.readFile(path.join(taskDir, "screenshots", "probe", "1.png")), ).resolves.toEqual(screenshot); + await expect( + fs.readFile(path.join(taskDir, "screenshots", "probe", "final.png")), + ).resolves.toEqual(Buffer.from("final-screen")); await expect( fs.readFile(path.join(taskDir, "screenshots", "agent", "1.png")), ).resolves.toEqual(screenshot); @@ -192,6 +218,9 @@ describe("TrajectoryRecorder", () => { expect(trajectory.steps[0].probeEvidence.screenshotPath).toBe( "screenshots/probe/1.png", ); + expect(trajectory.finalObservation.screenshotPath).toBe( + "screenshots/probe/final.png", + ); expect(trajectory.steps[0].agentEvidence.modalities).toContainEqual({ type: "image", imagePath: "screenshots/agent/1.png", @@ -207,6 +236,29 @@ describe("TrajectoryRecorder", () => { }); }); + it("normalizes missing step startedAt to finishedAt", async () => { + const recorder = new TrajectoryRecorder({ + taskSpec: makeTaskSpec(), + persist: false, + }); + const finishedAt = new Date(1).toISOString(); + + recorder.record({ + type: "step_finished", + stepIndex: 0, + actionName: "extract", + actionArgs: { instruction: "Read fares" }, + reasoning: "", + toolOutput: { ok: true, result: false }, + finishedAt, + }); + + const trajectory = await recorder.finish({ status: "complete" }); + + expect(trajectory.steps[0].startedAt).toBe(finishedAt); + expect(trajectory.steps[0].finishedAt).toBe(finishedAt); + }); + it("lifts inline screenshot payloads into image evidence and 
redacts JSON", async () => { const inlineScreenshot = Buffer.from("inline screenshot").toString("base64"); @@ -231,6 +283,7 @@ describe("TrajectoryRecorder", () => { }, }, }, + startedAt: new Date(0).toISOString(), finishedAt: new Date(0).toISOString(), }); From 125246268e9b79b66aea483b9d3e6c97ab6c9d4d Mon Sep 17 00:00:00 2001 From: miguel Date: Fri, 22 May 2026 20:10:39 -0700 Subject: [PATCH 20/27] Remove verifier trajectory timestamps --- .../core/lib/v3/handlers/v3AgentHandler.ts | 2 -- .../core/lib/v3/handlers/v3CuaAgentHandler.ts | 6 +--- .../v3/types/public/agentEvidenceEvents.ts | 4 --- packages/core/lib/v3/verifier/trajectory.ts | 16 ---------- packages/core/lib/v3/verifier/types.ts | 8 +---- packages/core/tests/unit/v3-evaluator.test.ts | 6 ---- .../verifier-evidence-normalization.test.ts | 2 -- .../tests/unit/verifier-trajectory.test.ts | 24 -------------- .../evals/framework/trajectoryRecorder.ts | 25 ++------------- .../framework/trajectoryRecorder.test.ts | 31 ------------------- 10 files changed, 5 insertions(+), 119 deletions(-) diff --git a/packages/core/lib/v3/handlers/v3AgentHandler.ts b/packages/core/lib/v3/handlers/v3AgentHandler.ts index 4481c3dc68..68858f41be 100644 --- a/packages/core/lib/v3/handlers/v3AgentHandler.ts +++ b/packages/core/lib/v3/handlers/v3AgentHandler.ts @@ -327,7 +327,6 @@ export class V3AgentHandler { const stepIndex = stepCounter++; stepIndicesInTurn.push(stepIndex); - const finishedAt = new Date().toISOString(); await evidenceCallback?.({ type: "step_finished", stepIndex, @@ -338,7 +337,6 @@ export class V3AgentHandler { : {}, reasoning: event.text ?? 
"", toolOutput: inferToolOutput(toolResult), - finishedAt, }); } state.currentPageUrl = (await this.v3.context.awaitActivePage()).url(); diff --git a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts index a41aa7fd07..5c495b5c2f 100644 --- a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts +++ b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts @@ -134,7 +134,6 @@ export class V3CuaAgentHandler { defaultDelay; try { let executionResult: ActionExecutionResult | undefined; - const startedAt = new Date().toISOString(); // Try to inject cursor before each action if enabled if (this.highlightCursor) { try { @@ -165,7 +164,7 @@ export class V3CuaAgentHandler { action.timestamp = Date.now(); if (shouldLog) { - await this.emitCuaActionStep(action, executionResult, startedAt); + await this.emitCuaActionStep(action, executionResult); } await new Promise((r) => setTimeout(r, waitBetween)); @@ -834,7 +833,6 @@ export class V3CuaAgentHandler { private async emitCuaActionStep( action: AgentAction, result: ActionExecutionResult | undefined, - startedAt: string, ): Promise { let pageUrl = typeof action.pageUrl === "string" @@ -867,8 +865,6 @@ export class V3CuaAgentHandler { actionArgs, reasoning, toolOutput: inferToolOutput(result ?? { success: true }), - startedAt, - finishedAt: new Date().toISOString(), }); // Post-action tier-2 probe. The pre-action screenshot from diff --git a/packages/core/lib/v3/types/public/agentEvidenceEvents.ts b/packages/core/lib/v3/types/public/agentEvidenceEvents.ts index 23f90a5ef2..dcd0e89e70 100644 --- a/packages/core/lib/v3/types/public/agentEvidenceEvents.ts +++ b/packages/core/lib/v3/types/public/agentEvidenceEvents.ts @@ -52,10 +52,6 @@ export interface AgentStepFinishedEvent { result: unknown; error?: string; }; - /** ISO 8601 timestamp at which the step's tool execution started, when available. */ - startedAt?: string; - /** ISO 8601 timestamp at which the step finished. 
*/ - finishedAt: string; } /** diff --git a/packages/core/lib/v3/verifier/trajectory.ts b/packages/core/lib/v3/verifier/trajectory.ts index 413ecc15fd..b8722a64df 100644 --- a/packages/core/lib/v3/verifier/trajectory.ts +++ b/packages/core/lib/v3/verifier/trajectory.ts @@ -253,7 +253,6 @@ export function shouldPersistTrajectory( * ├── screenshots/ * │ ├── probe/.png * │ └── agent/[_M].png - * ├── times.json * ├── scores/ (empty; populated separately) * └── core.log * @@ -357,19 +356,6 @@ export async function writeTrajectoryDir( ), ); - await fs.writeFile( - path.join(dir, "times.json"), - JSON.stringify( - { - timing: trajectory.timing, - usage: trajectory.usage, - stepCount: trajectory.steps.length, - }, - null, - 2, - ), - ); - await fs.mkdir(path.join(dir, "scores"), { recursive: true }); await fs.writeFile(path.join(dir, "core.log"), coreLog(trajectory)); } @@ -384,8 +370,6 @@ function coreLog(trajectory: Trajectory): string { url: step.probeEvidence.url ?? null, ok: step.toolOutput.ok, reasoning: step.reasoning || undefined, - startedAt: step.startedAt, - finishedAt: step.finishedAt, }), ) .join("\n") + "\n" diff --git a/packages/core/lib/v3/verifier/types.ts b/packages/core/lib/v3/verifier/types.ts index c51ea2d47e..5431397092 100644 --- a/packages/core/lib/v3/verifier/types.ts +++ b/packages/core/lib/v3/verifier/types.ts @@ -119,10 +119,6 @@ export interface TrajectoryStep { agentEvidence: AgentEvidence; probeEvidence: ProbeEvidence; toolOutput: ToolOutput; - /** ISO 8601 timestamp when the step's tool execution started. */ - startedAt: string; - /** ISO 8601 timestamp when the step's tool execution finished. */ - finishedAt: string; } /** Terminal status of the agent run. 
*/ @@ -139,8 +135,7 @@ export type TrajectoryStatus = "complete" | "aborted" | "stalled" | "error"; * ├── screenshots/ — step probe/agent images plus final observation * ├── scores/ * │ └── result.json — Result from V3Evaluator.verify() - * ├── core.log — captured action log - * └── times.json — step timing + token usage + * └── core.log — captured action log */ export interface Trajectory { task: TaskSpec; @@ -150,7 +145,6 @@ export interface Trajectory { finalObservation?: ProbeEvidence; status: TrajectoryStatus; usage: TrajectoryUsage; - timing: { startedAt: string; endedAt: string }; } /** Score for a single rubric criterion after evidence analysis + rescoring. */ diff --git a/packages/core/tests/unit/v3-evaluator.test.ts b/packages/core/tests/unit/v3-evaluator.test.ts index 2f488e5a8d..b18650d8ea 100644 --- a/packages/core/tests/unit/v3-evaluator.test.ts +++ b/packages/core/tests/unit/v3-evaluator.test.ts @@ -197,10 +197,6 @@ function makeEmptyTrajectory(taskSpec: TaskSpec): Trajectory { input_tokens: 0, output_tokens: 0, }, - timing: { - startedAt: new Date(0).toISOString(), - endedAt: new Date(0).toISOString(), - }, }; } @@ -228,8 +224,6 @@ function makeTrajectory( ok: true, result: options.toolResult ?? 
"done", }, - startedAt: new Date(0).toISOString(), - finishedAt: new Date(0).toISOString(), }, ], finalAnswer: options.finalAnswer, diff --git a/packages/core/tests/unit/verifier-evidence-normalization.test.ts b/packages/core/tests/unit/verifier-evidence-normalization.test.ts index 7bf0d59b5a..5b6ee249b2 100644 --- a/packages/core/tests/unit/verifier-evidence-normalization.test.ts +++ b/packages/core/tests/unit/verifier-evidence-normalization.test.ts @@ -11,8 +11,6 @@ describe("buildAgentEvidenceFromStepFinished", () => { actionArgs: {}, reasoning: "", toolOutput: { ok: true, result: false }, - startedAt: new Date(0).toISOString(), - finishedAt: new Date(1).toISOString(), }); expect(evidence.modalities).toEqual([{ type: "text", content: "false" }]); diff --git a/packages/core/tests/unit/verifier-trajectory.test.ts b/packages/core/tests/unit/verifier-trajectory.test.ts index 752d01beed..e57f2bb3c4 100644 --- a/packages/core/tests/unit/verifier-trajectory.test.ts +++ b/packages/core/tests/unit/verifier-trajectory.test.ts @@ -79,10 +79,6 @@ describe("verifier trajectory utilities", () => { task: { id: "task", instruction: "Do the task" }, status: "complete", usage: { input_tokens: 0, output_tokens: 0 }, - timing: { - startedAt: new Date(0).toISOString(), - endedAt: new Date(0).toISOString(), - }, steps: [ { index: 0, @@ -100,8 +96,6 @@ describe("verifier trajectory utilities", () => { }, probeEvidence: { screenshotPath: "screenshot_1.png" }, toolOutput: { ok: true, result: null }, - startedAt: new Date(0).toISOString(), - finishedAt: new Date(0).toISOString(), }, ], finalObservation: { @@ -131,10 +125,6 @@ describe("verifier trajectory utilities", () => { task: { id: "task", instruction: "Do the task" }, status: "complete", usage: { input_tokens: 0, output_tokens: 0 }, - timing: { - startedAt: new Date(0).toISOString(), - endedAt: new Date(0).toISOString(), - }, steps: [ { index: 0, @@ -152,8 +142,6 @@ describe("verifier trajectory utilities", () => { }, 
probeEvidence: {}, toolOutput: { ok: true, result: null }, - startedAt: new Date(0).toISOString(), - finishedAt: new Date(0).toISOString(), }, ], }), @@ -177,10 +165,6 @@ describe("verifier trajectory utilities", () => { task: { id: "task", instruction: "Do the task" }, status: "complete", usage: { input_tokens: 0, output_tokens: 0 }, - timing: { - startedAt: new Date(0).toISOString(), - endedAt: new Date(0).toISOString(), - }, finalObservation: { url: "https://example.com/done", screenshot: Buffer.from("final screenshot"), @@ -214,8 +198,6 @@ describe("verifier trajectory utilities", () => { }, }, }, - startedAt: new Date(0).toISOString(), - finishedAt: new Date(0).toISOString(), }, ], }); @@ -247,10 +229,6 @@ describe("verifier trajectory utilities", () => { task: { id: "task", instruction: "Do the task" }, status: "complete", usage: { input_tokens: 0, output_tokens: 0 }, - timing: { - startedAt: new Date(0).toISOString(), - endedAt: new Date(0).toISOString(), - }, steps: [ { index: 0, @@ -260,8 +238,6 @@ describe("verifier trajectory utilities", () => { agentEvidence: { modalities: [] }, probeEvidence: { screenshotPath: "../../../etc/passwd" }, toolOutput: { ok: true, result: null }, - startedAt: new Date(0).toISOString(), - finishedAt: new Date(0).toISOString(), }, ], }), diff --git a/packages/evals/framework/trajectoryRecorder.ts b/packages/evals/framework/trajectoryRecorder.ts index c48450908a..84b57b43a4 100644 --- a/packages/evals/framework/trajectoryRecorder.ts +++ b/packages/evals/framework/trajectoryRecorder.ts @@ -31,8 +31,6 @@ interface PartialStep { agentEvidence: AgentEvidence; probeEvidence: ProbeEvidence; toolOutput: { ok: boolean; result: unknown; error?: string }; - startedAt: string; - finishedAt: string; } export interface TrajectoryRecorderOptions { @@ -74,8 +72,6 @@ export class TrajectoryRecorder { private readonly partialSteps = new Map>(); private finalAnswerEvent?: AgentFinalAnswerEvent; private finalObservation?: ProbeEvidence; - private 
startedAt = ""; - private endedAt = ""; private onScreenshot(e: AgentScreenshotEvidenceEvent): void { const partial = this.ensurePartial(e.stepIndex); @@ -117,8 +113,6 @@ export class TrajectoryRecorder { ...e.toolOutput, result: redactInlineImagePayloads(e.toolOutput.result, e.actionName), }; - partial.startedAt = e.startedAt ?? e.finishedAt; - partial.finishedAt = e.finishedAt; partial.agentEvidence = mergeAgentEvidence( partial.agentEvidence, buildAgentEvidenceFromStepFinished(e), @@ -160,15 +154,13 @@ export class TrajectoryRecorder { this.persistEnabled = shouldPersistTrajectory(opts.persist); } - /** Mark the beginning of collection. Call once before agent.execute(). */ + /** Mark the beginning of collection. Retained as a no-op for compatibility. */ start(): void { - if (this.startedAt) return; - this.startedAt = new Date().toISOString(); + return; } /** Ingest an evidence callback event from agent.execute(). */ record(event: AgentEvidenceEvent): void { - if (!this.startedAt) this.start(); switch (event.type) { case "screenshot": this.onScreenshot(event); @@ -190,9 +182,6 @@ export class TrajectoryRecorder { * write the on-disk layout. Idempotent. */ async finish(opts: TrajectoryFinishOptions): Promise { - if (!this.startedAt) this.start(); - this.endedAt = new Date().toISOString(); - const steps = this.assembleSteps(); const trajectory: Trajectory = { task: this.taskSpec, @@ -203,7 +192,6 @@ export class TrajectoryRecorder { : {}), status: opts.status, usage: { ...ZERO_USAGE, ...(opts.usage ?? 
{}) }, - timing: { startedAt: this.startedAt, endedAt: this.endedAt }, }; if (this.persistEnabled) { @@ -277,12 +265,7 @@ export class TrajectoryRecorder { const indices = [...this.partialSteps.keys()].sort((a, b) => a - b); for (const i of indices) { const p = this.partialSteps.get(i)!; - if ( - p.actionName === undefined || - p.toolOutput === undefined || - p.startedAt === undefined || - p.finishedAt === undefined - ) { + if (p.actionName === undefined || p.toolOutput === undefined) { // Provider-only screenshot refreshes are transport evidence for the // next CUA action. If no action arrives for this index, there is no // completed trajectory step to persist. @@ -296,8 +279,6 @@ export class TrajectoryRecorder { agentEvidence: p.agentEvidence ?? { modalities: [] }, probeEvidence: p.probeEvidence ?? {}, toolOutput: p.toolOutput, - startedAt: p.startedAt, - finishedAt: p.finishedAt, }); } return out; diff --git a/packages/evals/tests/framework/trajectoryRecorder.test.ts b/packages/evals/tests/framework/trajectoryRecorder.test.ts index 743d3b4ecd..57f4c93c55 100644 --- a/packages/evals/tests/framework/trajectoryRecorder.test.ts +++ b/packages/evals/tests/framework/trajectoryRecorder.test.ts @@ -75,8 +75,6 @@ describe("TrajectoryRecorder", () => { ok: true, result: { economy: "$100", business: "$250" }, }, - startedAt: new Date(0).toISOString(), - finishedAt: new Date(0).toISOString(), }); recorder.record({ type: "step_observed", @@ -115,7 +113,6 @@ describe("TrajectoryRecorder", () => { }, }); expect(trajectory.steps[0].probeEvidence.screenshot).toEqual(screenshot); - expect(trajectory.steps[0].startedAt).toBe(new Date(0).toISOString()); expect(trajectory.steps[0].agentEvidence.modalities).toEqual( expect.arrayContaining([ { type: "image", bytes: screenshot, mediaType: "image/png" }, @@ -165,8 +162,6 @@ describe("TrajectoryRecorder", () => { actionArgs: { instruction: "Search fares" }, reasoning: "Search for fares.", toolOutput: { ok: true, result: "done" }, - 
startedAt: new Date(0).toISOString(), - finishedAt: new Date(0).toISOString(), }); recorder.record({ type: "step_observed", @@ -195,7 +190,6 @@ describe("TrajectoryRecorder", () => { "scores", "screenshots", "task_data.json", - "times.json", "trajectory.json", ]), ); @@ -236,29 +230,6 @@ describe("TrajectoryRecorder", () => { }); }); - it("normalizes missing step startedAt to finishedAt", async () => { - const recorder = new TrajectoryRecorder({ - taskSpec: makeTaskSpec(), - persist: false, - }); - const finishedAt = new Date(1).toISOString(); - - recorder.record({ - type: "step_finished", - stepIndex: 0, - actionName: "extract", - actionArgs: { instruction: "Read fares" }, - reasoning: "", - toolOutput: { ok: true, result: false }, - finishedAt, - }); - - const trajectory = await recorder.finish({ status: "complete" }); - - expect(trajectory.steps[0].startedAt).toBe(finishedAt); - expect(trajectory.steps[0].finishedAt).toBe(finishedAt); - }); - it("lifts inline screenshot payloads into image evidence and redacts JSON", async () => { const inlineScreenshot = Buffer.from("inline screenshot").toString("base64"); @@ -283,8 +254,6 @@ describe("TrajectoryRecorder", () => { }, }, }, - startedAt: new Date(0).toISOString(), - finishedAt: new Date(0).toISOString(), }); const trajectory = await recorder.finish({ status: "complete" }); From b4a1537e4faca2eaa361fe4ac190469042029521 Mon Sep 17 00:00:00 2001 From: miguel Date: Sat, 23 May 2026 19:06:36 -0700 Subject: [PATCH 21/27] refactor(verifier): simplify evidence event sequencing --- .../v3/agent/utils/cuaEvidenceStepTracker.ts | 55 ----- .../v3/agent/utils/postStepProbeEvidence.ts | 27 +-- .../core/lib/v3/handlers/v3AgentHandler.ts | 18 +- .../core/lib/v3/handlers/v3CuaAgentHandler.ts | 25 +- .../v3/types/public/agentEvidenceEvents.ts | 14 +- .../unit/cua-evidence-step-tracker.test.ts | 55 ----- .../tests/unit/tool-output-evidence.test.ts | 85 +++---- .../verifier-evidence-normalization.test.ts | 49 +++- 
.../evals/framework/trajectoryRecorder.ts | 150 ++++++------ .../framework/trajectoryRecorder.test.ts | 214 +++++++----------- 10 files changed, 260 insertions(+), 432 deletions(-) delete mode 100644 packages/core/lib/v3/agent/utils/cuaEvidenceStepTracker.ts delete mode 100644 packages/core/tests/unit/cua-evidence-step-tracker.test.ts diff --git a/packages/core/lib/v3/agent/utils/cuaEvidenceStepTracker.ts b/packages/core/lib/v3/agent/utils/cuaEvidenceStepTracker.ts deleted file mode 100644 index 356cc6a98c..0000000000 --- a/packages/core/lib/v3/agent/utils/cuaEvidenceStepTracker.ts +++ /dev/null @@ -1,55 +0,0 @@ -import type { AgentScreenshotEvidenceEvent } from "../../types/public/agentEvidenceEvents.js"; - -export interface PairedCuaActionStep { - stepIndex: number; - replayScreenshot?: AgentScreenshotEvidenceEvent; -} - -export class CuaEvidenceStepTracker { - private nextStepIndex = 0; - private latestScreenshot?: AgentScreenshotEvidenceEvent; - private latestScreenshotConsumed = true; - - reset(): void { - this.nextStepIndex = 0; - this.latestScreenshot = undefined; - this.latestScreenshotConsumed = true; - } - - recordScreenshot( - screenshot: Buffer, - url: string, - ): AgentScreenshotEvidenceEvent { - const event: AgentScreenshotEvidenceEvent = { - type: "screenshot", - stepIndex: this.nextStepIndex++, - screenshot, - url, - evidenceRole: "agent", - }; - this.latestScreenshot = event; - this.latestScreenshotConsumed = false; - return event; - } - - pairAction(): PairedCuaActionStep { - if (this.latestScreenshot && !this.latestScreenshotConsumed) { - this.latestScreenshotConsumed = true; - return { stepIndex: this.latestScreenshot.stepIndex }; - } - - const stepIndex = this.nextStepIndex++; - if (this.latestScreenshot) { - return { - stepIndex, - replayScreenshot: { ...this.latestScreenshot, stepIndex }, - }; - } - - return { stepIndex }; - } - - get latestScreenshotUrl(): string | undefined { - return this.latestScreenshot?.url; - } -} diff --git 
a/packages/core/lib/v3/agent/utils/postStepProbeEvidence.ts b/packages/core/lib/v3/agent/utils/postStepProbeEvidence.ts index 10889d6cf8..f68315dbad 100644 --- a/packages/core/lib/v3/agent/utils/postStepProbeEvidence.ts +++ b/packages/core/lib/v3/agent/utils/postStepProbeEvidence.ts @@ -11,7 +11,6 @@ interface CaptureProbeEvidenceOptions { } interface EmitPostStepProbeEvidenceOptions extends CaptureProbeEvidenceOptions { - stepIndices: number | number[]; evidenceCallback?: AgentEvidenceCallback; } @@ -53,7 +52,6 @@ export async function captureProbeEvidence({ export async function emitPostStepProbeEvidence({ v3, - stepIndices, url, evidenceCallback, logger, @@ -61,30 +59,23 @@ export async function emitPostStepProbeEvidence({ }: EmitPostStepProbeEvidenceOptions): Promise { if (!evidenceCallback) return; - const indices = Array.isArray(stepIndices) ? stepIndices : [stepIndices]; - if (indices.length === 0) return; - const probe = await captureProbeEvidence({ v3, url, logger, warningMessage, }); - for (const stepIndex of indices) { - if (probe.screenshot) { - await evidenceCallback({ - type: "screenshot", - stepIndex, - screenshot: probe.screenshot, - url: probe.url, - evidenceRole: "probe", - }); - } + if (probe.screenshot) { await evidenceCallback({ - type: "step_observed", - stepIndex, + type: "screenshot", + screenshot: probe.screenshot, url: probe.url, - ariaTree: probe.ariaTree, + evidenceRole: "probe", }); } + await evidenceCallback({ + type: "step_observed", + url: probe.url, + ariaTree: probe.ariaTree, + }); } diff --git a/packages/core/lib/v3/handlers/v3AgentHandler.ts b/packages/core/lib/v3/handlers/v3AgentHandler.ts index 68858f41be..c3b6a5892e 100644 --- a/packages/core/lib/v3/handlers/v3AgentHandler.ts +++ b/packages/core/lib/v3/handlers/v3AgentHandler.ts @@ -266,10 +266,6 @@ export class V3AgentHandler { state: AgentState, { userCallback, evidenceCallback, onFinalAnswer }: StepHandlerOptions, ) { - // Monotonic step counter scoped to this execute() 
call. Each tool call in - // the agent loop becomes one trajectory step. The counter feeds stepIndex - // on evidence callback events. - let stepCounter = 0; return async (event: StepResult) => { this.logger({ category: "agent", @@ -277,7 +273,6 @@ export class V3AgentHandler { level: 2, }); - const stepIndicesInTurn: number[] = []; let lastFinalAnswer: FinalAnswerDraft | undefined; if (event.toolCalls && event.toolCalls.length > 0) { @@ -325,11 +320,8 @@ export class V3AgentHandler { state.actions.push(action); } - const stepIndex = stepCounter++; - stepIndicesInTurn.push(stepIndex); await evidenceCallback?.({ type: "step_finished", - stepIndex, actionName: toolCall.toolName, actionArgs: typeof args === "object" && args !== null @@ -341,14 +333,12 @@ export class V3AgentHandler { } state.currentPageUrl = (await this.v3.context.awaitActivePage()).url(); - // Harness probe — take a single screenshot / a11y snapshot per AI SDK - // step and attach it to every tool call in that turn. The observation - // reflects the settled page state after the batch of tool calls; this - // is more faithful than dropping probe evidence for all but the last - // tool call, while still avoiding per-tool screenshot overhead. + // Harness probe — one screenshot / a11y snapshot per AI SDK step. + // The recorder applies the probe to every step_finished received + // since the previous probe, so a multi-tool turn shares the same + // post-turn observation. 
await emitPostStepProbeEvidence({ v3: this.v3, - stepIndices: stepIndicesInTurn, url: state.currentPageUrl, evidenceCallback, logger: this.logger, diff --git a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts index 5c495b5c2f..901dce71da 100644 --- a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts +++ b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts @@ -12,7 +12,6 @@ import { emitPostStepProbeEvidence, } from "../agent/utils/postStepProbeEvidence.js"; import { wrapEvidenceCallback } from "../agent/utils/wrapEvidenceCallback.js"; -import { CuaEvidenceStepTracker } from "../agent/utils/cuaEvidenceStepTracker.js"; import { inferToolOutput } from "../agent/utils/toolOutputEvidence.js"; import { ActionExecutionResult, @@ -45,7 +44,7 @@ export class V3CuaAgentHandler { private captchaSolver: CaptchaSolver | null = null; private captchaClickGuardRemaining = 0; private currentInstruction = ""; - private readonly cuaEvidenceSteps = new CuaEvidenceStepTracker(); + private lastAgentScreenshotUrl?: string; private evidenceCallback?: AgentEvidenceCallback; constructor( @@ -205,7 +204,7 @@ export class V3CuaAgentHandler { options.callbacks?.onEvidence, this.logger, ); - this.cuaEvidenceSteps.reset(); + this.lastAgentScreenshotUrl = undefined; this.highlightCursor = options.highlightCursor !== false; this.currentInstruction = options.instruction; @@ -266,7 +265,7 @@ export class V3CuaAgentHandler { try { finalUrl = (await this.v3.context.awaitActivePage()).url(); } catch { - finalUrl = this.cuaEvidenceSteps.latestScreenshotUrl ?? ""; + finalUrl = this.lastAgentScreenshotUrl ?? 
""; } const observation = await captureProbeEvidence({ v3: this.v3, @@ -825,9 +824,13 @@ export class V3CuaAgentHandler { screenshot: Buffer, url: string, ): Promise { - await this.evidenceCallback?.( - this.cuaEvidenceSteps.recordScreenshot(screenshot, url), - ); + this.lastAgentScreenshotUrl = url; + await this.evidenceCallback?.({ + type: "screenshot", + screenshot, + url, + evidenceRole: "agent", + }); } private async emitCuaActionStep( @@ -837,16 +840,12 @@ export class V3CuaAgentHandler { let pageUrl = typeof action.pageUrl === "string" ? action.pageUrl - : (this.cuaEvidenceSteps.latestScreenshotUrl ?? ""); + : (this.lastAgentScreenshotUrl ?? ""); try { pageUrl = (await this.v3.context.awaitActivePage()).url(); } catch { // Keep the best pre-action URL fallback. } - const { stepIndex, replayScreenshot } = this.cuaEvidenceSteps.pairAction(); - if (replayScreenshot) { - await this.evidenceCallback?.(replayScreenshot); - } const actionArgs = Object.fromEntries( Object.entries(action).filter(([key]) => key !== "screenshot"), @@ -860,7 +859,6 @@ export class V3CuaAgentHandler { await this.evidenceCallback?.({ type: "step_finished", - stepIndex, actionName: String(action.type), actionArgs, reasoning, @@ -874,7 +872,6 @@ export class V3CuaAgentHandler { // has to trust the action history alone. await emitPostStepProbeEvidence({ v3: this.v3, - stepIndices: stepIndex, url: pageUrl, evidenceCallback: this.evidenceCallback, logger: this.logger, diff --git a/packages/core/lib/v3/types/public/agentEvidenceEvents.ts b/packages/core/lib/v3/types/public/agentEvidenceEvents.ts index dcd0e89e70..25e2bd51f4 100644 --- a/packages/core/lib/v3/types/public/agentEvidenceEvents.ts +++ b/packages/core/lib/v3/types/public/agentEvidenceEvents.ts @@ -2,8 +2,11 @@ * Evidence events emitted through AgentExecuteOptions.callbacks.onEvidence. * * These events describe observations made by Stagehand during an agent run. 
- * They are intentionally transport-level callback payloads; verifier-specific - * storage and normalization live in the evals/verifier layers. + * They are emitted in temporal order; consumers should treat the stream as + * sequential (pair an agent-role screenshot with the next step_finished, + * apply a step_observed/probe to all steps_finished since the last probe). + * Verifier-specific storage and normalization live in the evals/verifier + * layers. */ export type AgentEvidenceRole = "probe" | "agent"; @@ -23,8 +26,6 @@ export type AgentEvidenceEvent = */ export interface AgentScreenshotEvidenceEvent { type: "screenshot"; - /** Zero-based index of the step this screenshot corresponds to. */ - stepIndex: number; /** PNG bytes from page.screenshot(). */ screenshot: Buffer; /** Page URL at the time of capture. */ @@ -38,7 +39,6 @@ export interface AgentScreenshotEvidenceEvent { */ export interface AgentStepFinishedEvent { type: "step_finished"; - stepIndex: number; /** Name of the tool/action that ran, e.g. "act", "extract", "click". */ actionName: string; /** Arguments passed to the tool/action. */ @@ -55,11 +55,11 @@ export interface AgentStepFinishedEvent { } /** - * Independent post-step browser observation. + * Independent post-step browser observation. Emitted once per agent turn; + * consumers apply it to every step_finished received since the previous probe. */ export interface AgentStepObservedEvent { type: "step_observed"; - stepIndex: number; /** Page URL after the step's tool/action execution. */ url: string; /** Accessibility tree snapshot, when captured. 
*/ diff --git a/packages/core/tests/unit/cua-evidence-step-tracker.test.ts b/packages/core/tests/unit/cua-evidence-step-tracker.test.ts deleted file mode 100644 index 112c820a97..0000000000 --- a/packages/core/tests/unit/cua-evidence-step-tracker.test.ts +++ /dev/null @@ -1,55 +0,0 @@ -import { describe, expect, it } from "vitest"; - -import { CuaEvidenceStepTracker } from "../../lib/v3/agent/utils/cuaEvidenceStepTracker.js"; - -describe("CuaEvidenceStepTracker", () => { - it("pairs a fresh provider screenshot with the next action", () => { - const tracker = new CuaEvidenceStepTracker(); - const screenshot = Buffer.from("screen"); - - const event = tracker.recordScreenshot(screenshot, "https://example.com"); - const paired = tracker.pairAction(); - - expect(event).toMatchObject({ - type: "screenshot", - stepIndex: 0, - evidenceRole: "agent", - url: "https://example.com", - }); - expect(paired).toEqual({ stepIndex: 0 }); - }); - - it("allocates an action step without screenshot evidence", () => { - const tracker = new CuaEvidenceStepTracker(); - - expect(tracker.pairAction()).toEqual({ stepIndex: 0 }); - }); - - it("replays the latest consumed screenshot for later actions", () => { - const tracker = new CuaEvidenceStepTracker(); - const screenshot = Buffer.from("screen"); - - tracker.recordScreenshot(screenshot, "https://example.com/start"); - tracker.pairAction(); - const paired = tracker.pairAction(); - - expect(paired.stepIndex).toBe(1); - expect(paired.replayScreenshot).toMatchObject({ - type: "screenshot", - stepIndex: 1, - evidenceRole: "agent", - url: "https://example.com/start", - }); - expect(paired.replayScreenshot?.screenshot).toEqual(screenshot); - }); - - it("resets step allocation and pending screenshot state", () => { - const tracker = new CuaEvidenceStepTracker(); - - tracker.recordScreenshot(Buffer.from("screen"), "https://example.com"); - tracker.reset(); - - expect(tracker.pairAction()).toEqual({ stepIndex: 0 }); - 
expect(tracker.latestScreenshotUrl).toBeUndefined(); - }); -}); diff --git a/packages/core/tests/unit/tool-output-evidence.test.ts b/packages/core/tests/unit/tool-output-evidence.test.ts index 87b01d7529..fd7c2aabde 100644 --- a/packages/core/tests/unit/tool-output-evidence.test.ts +++ b/packages/core/tests/unit/tool-output-evidence.test.ts @@ -3,56 +3,45 @@ import { describe, expect, it } from "vitest"; import { inferToolOutput } from "../../lib/v3/agent/utils/toolOutputEvidence.js"; describe("inferToolOutput", () => { - it("preserves raw results while normalizing top-level failure status", () => { - const result = { success: false }; - - expect(inferToolOutput(result)).toEqual({ - ok: false, - result, - error: undefined, - }); - }); - - it("normalizes one-level AI SDK output wrappers", () => { - const result = { - toolCallId: "call-1", - output: { success: false, error: { message: "not found" } }, - }; - - expect(inferToolOutput(result)).toEqual({ - ok: false, - result, - error: '{"message":"not found"}', - }); - }); - - it("handles isError and non-string errors", () => { - const result = { isError: true, error: new Error("bad input") }; - - expect(inferToolOutput(result)).toEqual({ - ok: false, - result, - error: "bad input", - }); - }); - - it("normalizes non-json error values", () => { - const result = { error: Symbol("bad input") }; - - expect(inferToolOutput(result)).toEqual({ - ok: false, - result, - error: "Symbol(bad input)", - }); - }); - - it("does not recursively treat page data as tool status", () => { - const result = { data: { success: false, error: "page field" } }; - + it.each<[string, unknown, boolean, string | undefined]>([ + [ + "preserves raw results while normalizing top-level failure status", + { success: false }, + false, + undefined, + ], + [ + "normalizes one-level AI SDK output wrappers", + { + toolCallId: "call-1", + output: { success: false, error: { message: "not found" } }, + }, + false, + '{"message":"not found"}', + ], + [ + 
"handles isError and non-string errors", + { isError: true, error: new Error("bad input") }, + false, + "bad input", + ], + [ + "normalizes non-json error values", + { error: Symbol("bad input") }, + false, + "Symbol(bad input)", + ], + [ + "does not recursively treat page data as tool status", + { data: { success: false, error: "page field" } }, + true, + undefined, + ], + ])("%s", (_, result, ok, error) => { expect(inferToolOutput(result)).toEqual({ - ok: true, + ok, result, - error: undefined, + error, }); }); }); diff --git a/packages/core/tests/unit/verifier-evidence-normalization.test.ts b/packages/core/tests/unit/verifier-evidence-normalization.test.ts index 5b6ee249b2..174b0a87f5 100644 --- a/packages/core/tests/unit/verifier-evidence-normalization.test.ts +++ b/packages/core/tests/unit/verifier-evidence-normalization.test.ts @@ -1,12 +1,14 @@ import { describe, expect, it } from "vitest"; -import { buildAgentEvidenceFromStepFinished } from "../../lib/v3/verifier/evidenceNormalization.js"; +import { + buildAgentEvidenceFromStepFinished, + REDACTED_INLINE_IMAGE, +} from "../../lib/v3/verifier/evidenceNormalization.js"; describe("buildAgentEvidenceFromStepFinished", () => { it("captures primitive tool results as text evidence", () => { const evidence = buildAgentEvidenceFromStepFinished({ type: "step_finished", - stepIndex: 0, actionName: "check", actionArgs: {}, reasoning: "", @@ -15,4 +17,47 @@ describe("buildAgentEvidenceFromStepFinished", () => { expect(evidence.modalities).toEqual([{ type: "text", content: "false" }]); }); + + it("lifts inline screenshot payloads into image evidence and redacts JSON", () => { + const inlineScreenshot = + Buffer.from("inline screenshot").toString("base64"); + + const evidence = buildAgentEvidenceFromStepFinished({ + type: "step_finished", + actionName: "click", + actionArgs: { describe: "Open fare details" }, + reasoning: "", + toolOutput: { + ok: true, + result: { + output: { + success: true, + describe: "Open fare 
details", + screenshotBase64: inlineScreenshot, + }, + }, + }, + }); + + const [imageModality, jsonModality] = evidence.modalities; + + expect(JSON.stringify(evidence)).not.toContain(inlineScreenshot); + expect(jsonModality).toMatchObject({ + type: "json", + content: { + output: { + screenshotBase64: REDACTED_INLINE_IMAGE, + }, + }, + }); + expect(imageModality).toMatchObject({ + type: "image", + mediaType: "image/png", + }); + if (imageModality?.type === "image") { + expect(imageModality.bytes).toEqual( + Buffer.from(inlineScreenshot, "base64"), + ); + } + }); }); diff --git a/packages/evals/framework/trajectoryRecorder.ts b/packages/evals/framework/trajectoryRecorder.ts index 84b57b43a4..5785ca4388 100644 --- a/packages/evals/framework/trajectoryRecorder.ts +++ b/packages/evals/framework/trajectoryRecorder.ts @@ -23,14 +23,9 @@ import type { EvaluationResult, } from "@browserbasehq/stagehand"; -interface PartialStep { - index: number; - actionName: string; - actionArgs: Record; - reasoning: string; - agentEvidence: AgentEvidence; - probeEvidence: ProbeEvidence; - toolOutput: { ok: boolean; result: unknown; error?: string }; +interface PendingScreenshot { + screenshot: Buffer; + url: string; } export interface TrajectoryRecorderOptions { @@ -67,65 +62,79 @@ export class TrajectoryRecorder { private readonly outputDir: string; private readonly persistEnabled: boolean; - // Events can arrive out-of-order across step indices; same-step events all - // fire in one microtask. - private readonly partialSteps = new Map>(); + // Steps are appended in arrival order on each step_finished event. + private readonly steps: TrajectoryStep[] = []; + // The most recent agent-role screenshot is held until the next step_finished + // consumes it. A second agent-role screenshot before any step_finished + // overwrites the first — that's the desired behavior when a turn is skipped + // (e.g., captcha guard short-circuits before emitting step_finished). 
+ private pendingAgentScreenshot?: PendingScreenshot; + // The most recent probe-role screenshot waits for the matching step_observed. + private pendingProbeScreenshot?: PendingScreenshot; + // Steps that haven't yet had a probe attached. The next step_observed fans + // out to all of them (one probe per agent turn, N tool calls per turn). + private stepsAwaitingProbe: number[] = []; private finalAnswerEvent?: AgentFinalAnswerEvent; private finalObservation?: ProbeEvidence; private onScreenshot(e: AgentScreenshotEvidenceEvent): void { - const partial = this.ensurePartial(e.stepIndex); - - // Default to probe when the emit site doesn't tag a role: matches - // v3AgentHandler's post-step screenshot. For CUA the pre-action shot is - // NOT a probe — emitCuaActionStep fills that role post-action. const role = e.evidenceRole ?? "probe"; - - if (role === "probe") { - const probe: ProbeEvidence = { ...(partial.probeEvidence ?? {}) }; - probe.screenshot = e.screenshot; - probe.url = e.url; - partial.probeEvidence = probe; - } else if (!partial.probeEvidence?.url) { - // Capture URL even for tier-1-only events; a later post-action URL - // can still overwrite it. - partial.probeEvidence = { - ...(partial.probeEvidence ?? 
{}), - url: e.url, - }; - } - if (role === "agent") { - partial.agentEvidence = mergeAgentEvidence(partial.agentEvidence, { - modalities: [ - { type: "image", bytes: e.screenshot, mediaType: "image/png" }, - ], - }); + this.pendingAgentScreenshot = { screenshot: e.screenshot, url: e.url }; + } else { + this.pendingProbeScreenshot = { screenshot: e.screenshot, url: e.url }; } } private onStepFinished(e: AgentStepFinishedEvent): void { - const partial = this.ensurePartial(e.stepIndex); - partial.actionName = e.actionName; - partial.actionArgs = e.actionArgs; - partial.reasoning = e.reasoning; - partial.toolOutput = { - ...e.toolOutput, - result: redactInlineImagePayloads(e.toolOutput.result, e.actionName), - }; - partial.agentEvidence = mergeAgentEvidence( - partial.agentEvidence, + const agentEvidence: AgentEvidence = this.pendingAgentScreenshot + ? mergeAgentEvidence( + { modalities: [] }, + { + modalities: [ + { + type: "image", + bytes: this.pendingAgentScreenshot.screenshot, + mediaType: "image/png", + }, + ], + }, + ) + : { modalities: [] }; + const merged = mergeAgentEvidence( + agentEvidence, buildAgentEvidenceFromStepFinished(e), ); + + const step: TrajectoryStep = { + index: this.steps.length, + actionName: e.actionName, + actionArgs: e.actionArgs, + reasoning: e.reasoning, + agentEvidence: merged, + probeEvidence: {}, + toolOutput: { + ...e.toolOutput, + result: redactInlineImagePayloads(e.toolOutput.result, e.actionName), + }, + }; + this.pendingAgentScreenshot = undefined; + this.steps.push(step); + this.stepsAwaitingProbe.push(step.index); } private onStepObserved(e: AgentStepObservedEvent): void { - const partial = this.ensurePartial(e.stepIndex); - const probe: ProbeEvidence = { ...(partial.probeEvidence ?? 
{}) }; - probe.url = e.url; + if (this.stepsAwaitingProbe.length === 0) return; + const probe: ProbeEvidence = { url: e.url }; + if (this.pendingProbeScreenshot) + probe.screenshot = this.pendingProbeScreenshot.screenshot; if (e.ariaTree !== undefined) probe.ariaTree = e.ariaTree; if (e.scroll !== undefined) probe.scroll = e.scroll; - partial.probeEvidence = probe; + for (const idx of this.stepsAwaitingProbe) { + this.steps[idx].probeEvidence = probe; + } + this.stepsAwaitingProbe = []; + this.pendingProbeScreenshot = undefined; } private onFinalAnswer(e: AgentFinalAnswerEvent): void { @@ -182,10 +191,9 @@ export class TrajectoryRecorder { * write the on-disk layout. Idempotent. */ async finish(opts: TrajectoryFinishOptions): Promise { - const steps = this.assembleSteps(); const trajectory: Trajectory = { task: this.taskSpec, - steps, + steps: this.steps, finalAnswer: opts.finalAnswer ?? this.finalAnswerEvent?.message, ...(this.finalObservation ? { finalObservation: this.finalObservation } @@ -203,7 +211,10 @@ export class TrajectoryRecorder { /** Throw away in-memory state without writing to disk. Used on early abort. 
*/ cancel(): void { - this.partialSteps.clear(); + this.steps.length = 0; + this.pendingAgentScreenshot = undefined; + this.pendingProbeScreenshot = undefined; + this.stepsAwaitingProbe = []; this.finalAnswerEvent = undefined; this.finalObservation = undefined; } @@ -250,37 +261,4 @@ export class TrajectoryRecorder { JSON.stringify({ ...taskData, result }, null, 2), ); } - - private ensurePartial(stepIndex: number): Partial { - let p = this.partialSteps.get(stepIndex); - if (!p) { - p = { index: stepIndex }; - this.partialSteps.set(stepIndex, p); - } - return p; - } - - private assembleSteps(): TrajectoryStep[] { - const out: TrajectoryStep[] = []; - const indices = [...this.partialSteps.keys()].sort((a, b) => a - b); - for (const i of indices) { - const p = this.partialSteps.get(i)!; - if (p.actionName === undefined || p.toolOutput === undefined) { - // Provider-only screenshot refreshes are transport evidence for the - // next CUA action. If no action arrives for this index, there is no - // completed trajectory step to persist. - continue; - } - out.push({ - index: i, - actionName: p.actionName, - actionArgs: p.actionArgs ?? {}, - reasoning: p.reasoning ?? "", - agentEvidence: p.agentEvidence ?? { modalities: [] }, - probeEvidence: p.probeEvidence ?? 
{}, - toolOutput: p.toolOutput, - }); - } - return out; - } } diff --git a/packages/evals/tests/framework/trajectoryRecorder.test.ts b/packages/evals/tests/framework/trajectoryRecorder.test.ts index 57f4c93c55..0623d88b89 100644 --- a/packages/evals/tests/framework/trajectoryRecorder.test.ts +++ b/packages/evals/tests/framework/trajectoryRecorder.test.ts @@ -30,44 +30,82 @@ function makeTaskSpec(): TaskSpec { id: "recorder-task", instruction: "Compare economy and business fares.", initUrl: "https://example.com", - precomputedRubric: { - items: [ - { - criterion: "Report fare delta", - description: "Report the difference between two fares.", - maxPoints: 1, - }, - ], - }, }; } +function recordSimpleStep(recorder: TrajectoryRecorder, screenshot: Buffer) { + recorder.record({ + type: "screenshot", + screenshot, + url: "https://example.com/search", + evidenceRole: "agent", + }); + recorder.record({ + type: "step_finished", + actionName: "act", + actionArgs: { instruction: "Search fares" }, + reasoning: "Search for fares.", + toolOutput: { ok: true, result: "done" }, + }); + recorder.record({ + type: "screenshot", + screenshot, + url: "https://example.com/search", + evidenceRole: "probe", + }); + recorder.record({ + type: "step_observed", + url: "https://example.com/search", + }); +} + +function recordFinalAnswer( + recorder: TrajectoryRecorder, + opts: { message: string; screenshot: Buffer; ariaTree?: string }, +): void { + recorder.record({ + type: "final_answer", + message: opts.message, + observation: { + url: "https://example.com/complete", + screenshot: opts.screenshot, + ...(opts.ariaTree !== undefined ? 
{ ariaTree: opts.ariaTree } : {}), + }, + }); +} + describe("TrajectoryRecorder", () => { - it("assembles trajectory evidence from callback events", async () => { + it("assembles ordered callback events into trajectory steps", async () => { const recorder = new TrajectoryRecorder({ taskSpec: makeTaskSpec(), persist: false, }); const screenshot = Buffer.from("screen-1"); + const staleScreenshot = Buffer.from("stale-screen"); + const probeScreenshot = Buffer.from("probe-screen"); recorder.start(); recorder.record({ type: "screenshot", - stepIndex: 0, - screenshot, - url: "https://example.com/search", + screenshot: staleScreenshot, + url: "https://example.com/stale", evidenceRole: "agent", }); recorder.record({ type: "screenshot", - stepIndex: 0, screenshot, url: "https://example.com/search", - evidenceRole: "probe", + evidenceRole: "agent", + }); + recorder.record({ + type: "step_finished", + actionName: "click", + actionArgs: { describe: "Open fares" }, + reasoning: "Open fare details.", + toolOutput: { ok: true, result: "opened" }, }); recorder.record({ type: "step_finished", - stepIndex: 0, actionName: "extract", actionArgs: { instruction: "Read fares" }, reasoning: "Read visible fare cells.", @@ -76,20 +114,21 @@ describe("TrajectoryRecorder", () => { result: { economy: "$100", business: "$250" }, }, }); + recorder.record({ + type: "screenshot", + screenshot: probeScreenshot, + url: "https://example.com/search", + evidenceRole: "probe", + }); recorder.record({ type: "step_observed", - stepIndex: 0, url: "https://example.com/search", ariaTree: "RootWebArea\nStaticText: Economy $100", }); - recorder.record({ - type: "final_answer", + recordFinalAnswer(recorder, { message: "Business is $150 more than economy.", - observation: { - url: "https://example.com/checkout", - screenshot: Buffer.from("final-screen"), - ariaTree: "RootWebArea\nStaticText: Complete", - }, + screenshot: Buffer.from("final-screen"), + ariaTree: "RootWebArea\nStaticText: Complete", }); const 
trajectory = await recorder.finish({ @@ -97,32 +136,35 @@ describe("TrajectoryRecorder", () => { usage: { input_tokens: 10, output_tokens: 5 }, }); - expect(trajectory.steps).toHaveLength(1); + expect(trajectory.steps).toHaveLength(2); expect(trajectory.steps[0]).toMatchObject({ index: 0, - actionName: "extract", - actionArgs: { instruction: "Read fares" }, - reasoning: "Read visible fare cells.", - toolOutput: { - ok: true, - result: { economy: "$100", business: "$250" }, - }, + actionName: "click", probeEvidence: { url: "https://example.com/search", ariaTree: "RootWebArea\nStaticText: Economy $100", }, }); - expect(trajectory.steps[0].probeEvidence.screenshot).toEqual(screenshot); + expect(trajectory.steps[1]).toMatchObject({ + index: 1, + actionName: "extract", + toolOutput: { ok: true, result: { economy: "$100", business: "$250" } }, + }); + expect(trajectory.steps[0].probeEvidence.screenshot).toEqual( + probeScreenshot, + ); + expect(trajectory.steps[1].probeEvidence.screenshot).toEqual( + probeScreenshot, + ); expect(trajectory.steps[0].agentEvidence.modalities).toEqual( expect.arrayContaining([ { type: "image", bytes: screenshot, mediaType: "image/png" }, - { type: "text", content: "Read visible fare cells." }, - { type: "json", content: { economy: "$100", business: "$250" } }, + { type: "text", content: "Open fare details." 
}, ]), ); expect(trajectory.finalAnswer).toBe("Business is $150 more than economy."); expect(trajectory.finalObservation).toMatchObject({ - url: "https://example.com/checkout", + url: "https://example.com/complete", ariaTree: "RootWebArea\nStaticText: Complete", }); expect(trajectory.finalObservation?.screenshot).toEqual( @@ -141,40 +183,10 @@ describe("TrajectoryRecorder", () => { const screenshot = Buffer.from("screen-1"); recorder.start(); - recorder.record({ - type: "screenshot", - stepIndex: 0, - screenshot, - url: "https://example.com/search", - evidenceRole: "agent", - }); - recorder.record({ - type: "screenshot", - stepIndex: 0, - screenshot, - url: "https://example.com/search", - evidenceRole: "probe", - }); - recorder.record({ - type: "step_finished", - stepIndex: 0, - actionName: "act", - actionArgs: { instruction: "Search fares" }, - reasoning: "Search for fares.", - toolOutput: { ok: true, result: "done" }, - }); - recorder.record({ - type: "step_observed", - stepIndex: 0, - url: "https://example.com/search", - }); - recorder.record({ - type: "final_answer", + recordSimpleStep(recorder, screenshot); + recordFinalAnswer(recorder, { message: "Complete.", - observation: { - url: "https://example.com/complete", - screenshot: Buffer.from("final-screen"), - }, + screenshot: Buffer.from("final-screen"), }); await recorder.finish({ status: "complete" }); @@ -229,68 +241,4 @@ describe("TrajectoryRecorder", () => { explanation: "The task was completed.", }); }); - - it("lifts inline screenshot payloads into image evidence and redacts JSON", async () => { - const inlineScreenshot = - Buffer.from("inline screenshot").toString("base64"); - const recorder = new TrajectoryRecorder({ - taskSpec: makeTaskSpec(), - persist: false, - }); - - recorder.record({ - type: "step_finished", - stepIndex: 0, - actionName: "click", - actionArgs: { describe: "Open fare details" }, - reasoning: "Click the fare details button.", - toolOutput: { - ok: true, - result: { - output: { - 
success: true, - describe: "Open fare details", - screenshotBase64: inlineScreenshot, - }, - }, - }, - }); - - const trajectory = await recorder.finish({ status: "complete" }); - const step = trajectory.steps[0]; - const rawTrajectory = JSON.stringify(trajectory); - const imageModalities = step.agentEvidence.modalities.filter( - (m) => m.type === "image", - ); - const jsonModality = step.agentEvidence.modalities.find( - (m) => m.type === "json", - ); - - expect(rawTrajectory).not.toContain(inlineScreenshot); - expect(step.toolOutput.result).toMatchObject({ - output: { - success: true, - describe: "Open fare details", - screenshotBase64: "[redacted inline image payload]", - }, - }); - expect(jsonModality).toMatchObject({ - type: "json", - content: { - output: { - screenshotBase64: "[redacted inline image payload]", - }, - }, - }); - expect(imageModalities).toHaveLength(1); - expect(imageModalities[0]).toMatchObject({ - type: "image", - mediaType: "image/png", - }); - if (imageModalities[0].type === "image") { - expect(imageModalities[0].bytes).toEqual( - Buffer.from(inlineScreenshot, "base64"), - ); - } - }); }); From d6fb72b8880feae4fbfd6c7450bd8c20c4299d64 Mon Sep 17 00:00:00 2001 From: miguel Date: Sun, 24 May 2026 10:23:59 -0700 Subject: [PATCH 22/27] refactor(verifier): tighten evidence event types and recorder MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow-up cleanup on the sequential-recorder refactor: - Drop step.index from TrajectoryStep; array position is the canonical index. Trajectory writer and v3Evaluator use entries()/map index. - Drop unused scroll field from AgentStepObservedEvent, AgentFinalObservation, and ProbeEvidence — no producer ever set it. - Require evidenceRole on AgentScreenshotEvidenceEvent; the role routes the event into different recorder slots, so a missing role can't silently misroute. - Flatten the identity mergeAgentEvidence in onStepFinished. 
- Drop unused url field from the recorder's pending screenshot slots. - Remove the no-op TrajectoryRecorder.start() method and test call sites. - Remove the dead early-return guard in onStepObserved. --- .../v3/types/public/agentEvidenceEvents.ts | 6 +- packages/core/lib/v3/verifier/trajectory.ts | 10 +-- packages/core/lib/v3/verifier/types.ts | 3 - packages/core/lib/v3Evaluator.ts | 4 +- packages/core/tests/unit/v3-evaluator.test.ts | 1 - .../tests/unit/verifier-trajectory.test.ts | 4 -- .../evals/framework/trajectoryRecorder.ts | 62 ++++++------------- .../framework/trajectoryRecorder.test.ts | 4 -- 8 files changed, 27 insertions(+), 67 deletions(-) diff --git a/packages/core/lib/v3/types/public/agentEvidenceEvents.ts b/packages/core/lib/v3/types/public/agentEvidenceEvents.ts index 25e2bd51f4..d74b56a67c 100644 --- a/packages/core/lib/v3/types/public/agentEvidenceEvents.ts +++ b/packages/core/lib/v3/types/public/agentEvidenceEvents.ts @@ -31,7 +31,7 @@ export interface AgentScreenshotEvidenceEvent { /** Page URL at the time of capture. */ url: string; /** Role this screenshot plays in downstream evidence collection. */ - evidenceRole?: AgentEvidenceRole; + evidenceRole: AgentEvidenceRole; } /** @@ -64,8 +64,6 @@ export interface AgentStepObservedEvent { url: string; /** Accessibility tree snapshot, when captured. */ ariaTree?: string; - /** Viewport scroll context, when captured. */ - scroll?: { top: number; pageHeight: number }; } export interface AgentFinalObservation { @@ -75,8 +73,6 @@ export interface AgentFinalObservation { screenshot?: Buffer; /** Accessibility tree snapshot, when captured. */ ariaTree?: string; - /** Viewport scroll context, when captured. */ - scroll?: { top: number; pageHeight: number }; } /** Final answer emitted by the agent, when available. 
*/ diff --git a/packages/core/lib/v3/verifier/trajectory.ts b/packages/core/lib/v3/verifier/trajectory.ts index b8722a64df..223e4b1b92 100644 --- a/packages/core/lib/v3/verifier/trajectory.ts +++ b/packages/core/lib/v3/verifier/trajectory.ts @@ -268,10 +268,10 @@ export async function writeTrajectoryDir( await fs.mkdir(path.join(dir, "screenshots", "agent"), { recursive: true }); const serializableSteps: unknown[] = []; - for (const step of trajectory.steps) { + for (const [i, step] of trajectory.steps.entries()) { const probe: ProbeEvidence = { ...step.probeEvidence }; if (probe.screenshot) { - const relPath = `screenshots/probe/${step.index + 1}.png`; + const relPath = `screenshots/probe/${i + 1}.png`; await fs.writeFile(path.join(dir, relPath), probe.screenshot); probe.screenshotPath = relPath; delete probe.screenshot; @@ -296,7 +296,7 @@ export async function writeTrajectoryDir( continue; } const suffix = multipleImages ? `_${imageSeq}` : ""; - const relPath = `screenshots/agent/${step.index + 1}${suffix}.png`; + const relPath = `screenshots/agent/${i + 1}${suffix}.png`; await fs.writeFile(path.join(dir, relPath), m.bytes); modalities.push({ type: "image", @@ -363,9 +363,9 @@ export async function writeTrajectoryDir( function coreLog(trajectory: Trajectory): string { return ( trajectory.steps - .map((step) => + .map((step, i) => JSON.stringify({ - step: step.index, + step: i, action: step.actionName, url: step.probeEvidence.url ?? null, ok: step.toolOutput.ok, diff --git a/packages/core/lib/v3/verifier/types.ts b/packages/core/lib/v3/verifier/types.ts index 5431397092..4aa76ea6f1 100644 --- a/packages/core/lib/v3/verifier/types.ts +++ b/packages/core/lib/v3/verifier/types.ts @@ -89,8 +89,6 @@ export interface ProbeEvidence { screenshot?: Buffer; /** Reference to the persisted screenshot file under the trajectory dir. */ screenshotPath?: string; - /** Viewport scroll context. Lets the verifier reason about whether the agent saw the full page. 
*/ - scroll?: { top: number; pageHeight: number }; /** Accessibility tree snapshot. */ ariaTree?: string; /** Verifier-requested probes, keyed by criterion id. */ @@ -111,7 +109,6 @@ export interface ToolOutput { /** One step in a trajectory: action + reasoning + evidence + outcome. */ export interface TrajectoryStep { - index: number; actionName: string; actionArgs: Record; /** From AgentAction.reasoning. May be empty for tools that don't surface reasoning. */ diff --git a/packages/core/lib/v3Evaluator.ts b/packages/core/lib/v3Evaluator.ts index ee1bbc6d35..5e6e5ee92f 100644 --- a/packages/core/lib/v3Evaluator.ts +++ b/packages/core/lib/v3Evaluator.ts @@ -237,13 +237,13 @@ function collectLegacyScreenshots(trajectory: Trajectory): Buffer[] { function renderLegacyAgentReasoning( trajectory: Trajectory, ): string | undefined { - const stepLines = (trajectory.steps ?? []).map((step) => { + const stepLines = (trajectory.steps ?? []).map((step, i) => { const status = step.toolOutput?.ok === false ? "Tool status: failed" : ""; const output = step.toolOutput?.error ? `Tool error: ${step.toolOutput.error}` : `Tool output: ${stringifyForPrompt(step.toolOutput?.result)}`; return [ - `Step ${step.index}: ${step.actionName}`, + `Step ${i}: ${step.actionName}`, step.reasoning ? 
`Reasoning: ${step.reasoning}` : undefined, status || undefined, output, diff --git a/packages/core/tests/unit/v3-evaluator.test.ts b/packages/core/tests/unit/v3-evaluator.test.ts index b18650d8ea..e6f61d54a8 100644 --- a/packages/core/tests/unit/v3-evaluator.test.ts +++ b/packages/core/tests/unit/v3-evaluator.test.ts @@ -212,7 +212,6 @@ function makeTrajectory( ...makeEmptyTrajectory(taskSpec), steps: [ { - index: 0, actionName: "act", actionArgs: {}, reasoning: "I completed the task.", diff --git a/packages/core/tests/unit/verifier-trajectory.test.ts b/packages/core/tests/unit/verifier-trajectory.test.ts index e57f2bb3c4..7cd38662df 100644 --- a/packages/core/tests/unit/verifier-trajectory.test.ts +++ b/packages/core/tests/unit/verifier-trajectory.test.ts @@ -81,7 +81,6 @@ describe("verifier trajectory utilities", () => { usage: { input_tokens: 0, output_tokens: 0 }, steps: [ { - index: 0, actionName: "act", actionArgs: {}, reasoning: "", @@ -127,7 +126,6 @@ describe("verifier trajectory utilities", () => { usage: { input_tokens: 0, output_tokens: 0 }, steps: [ { - index: 0, actionName: "act", actionArgs: {}, reasoning: "", @@ -171,7 +169,6 @@ describe("verifier trajectory utilities", () => { }, steps: [ { - index: 0, actionName: "click", actionArgs: {}, reasoning: "", @@ -231,7 +228,6 @@ describe("verifier trajectory utilities", () => { usage: { input_tokens: 0, output_tokens: 0 }, steps: [ { - index: 0, actionName: "act", actionArgs: {}, reasoning: "", diff --git a/packages/evals/framework/trajectoryRecorder.ts b/packages/evals/framework/trajectoryRecorder.ts index 5785ca4388..d24b53ae9e 100644 --- a/packages/evals/framework/trajectoryRecorder.ts +++ b/packages/evals/framework/trajectoryRecorder.ts @@ -23,11 +23,6 @@ import type { EvaluationResult, } from "@browserbasehq/stagehand"; -interface PendingScreenshot { - screenshot: Buffer; - url: string; -} - export interface TrajectoryRecorderOptions { taskSpec: TaskSpec; /** @@ -68,9 +63,9 @@ export class 
TrajectoryRecorder { // consumes it. A second agent-role screenshot before any step_finished // overwrites the first — that's the desired behavior when a turn is skipped // (e.g., captcha guard short-circuits before emitting step_finished). - private pendingAgentScreenshot?: PendingScreenshot; + private pendingAgentScreenshot?: Buffer; // The most recent probe-role screenshot waits for the matching step_observed. - private pendingProbeScreenshot?: PendingScreenshot; + private pendingProbeScreenshot?: Buffer; // Steps that haven't yet had a probe attached. The next step_observed fans // out to all of them (one probe per agent turn, N tool calls per turn). private stepsAwaitingProbe: number[] = []; @@ -78,36 +73,30 @@ export class TrajectoryRecorder { private finalObservation?: ProbeEvidence; private onScreenshot(e: AgentScreenshotEvidenceEvent): void { - const role = e.evidenceRole ?? "probe"; - if (role === "agent") { - this.pendingAgentScreenshot = { screenshot: e.screenshot, url: e.url }; + if (e.evidenceRole === "agent") { + this.pendingAgentScreenshot = e.screenshot; } else { - this.pendingProbeScreenshot = { screenshot: e.screenshot, url: e.url }; + this.pendingProbeScreenshot = e.screenshot; } } private onStepFinished(e: AgentStepFinishedEvent): void { - const agentEvidence: AgentEvidence = this.pendingAgentScreenshot - ? 
mergeAgentEvidence( - { modalities: [] }, - { - modalities: [ - { - type: "image", - bytes: this.pendingAgentScreenshot.screenshot, - mediaType: "image/png", - }, - ], - }, - ) - : { modalities: [] }; + const modalities: AgentEvidence["modalities"] = []; + if (this.pendingAgentScreenshot) { + modalities.push({ + type: "image", + bytes: this.pendingAgentScreenshot, + mediaType: "image/png", + }); + } const merged = mergeAgentEvidence( - agentEvidence, + { modalities }, buildAgentEvidenceFromStepFinished(e), ); - const step: TrajectoryStep = { - index: this.steps.length, + this.pendingAgentScreenshot = undefined; + this.stepsAwaitingProbe.push(this.steps.length); + this.steps.push({ actionName: e.actionName, actionArgs: e.actionArgs, reasoning: e.reasoning, @@ -117,19 +106,14 @@ export class TrajectoryRecorder { ...e.toolOutput, result: redactInlineImagePayloads(e.toolOutput.result, e.actionName), }, - }; - this.pendingAgentScreenshot = undefined; - this.steps.push(step); - this.stepsAwaitingProbe.push(step.index); + }); } private onStepObserved(e: AgentStepObservedEvent): void { - if (this.stepsAwaitingProbe.length === 0) return; const probe: ProbeEvidence = { url: e.url }; if (this.pendingProbeScreenshot) - probe.screenshot = this.pendingProbeScreenshot.screenshot; + probe.screenshot = this.pendingProbeScreenshot; if (e.ariaTree !== undefined) probe.ariaTree = e.ariaTree; - if (e.scroll !== undefined) probe.scroll = e.scroll; for (const idx of this.stepsAwaitingProbe) { this.steps[idx].probeEvidence = probe; } @@ -148,9 +132,6 @@ export class TrajectoryRecorder { ...(e.observation.ariaTree !== undefined ? { ariaTree: e.observation.ariaTree } : {}), - ...(e.observation.scroll !== undefined - ? { scroll: e.observation.scroll } - : {}), }; } } @@ -163,11 +144,6 @@ export class TrajectoryRecorder { this.persistEnabled = shouldPersistTrajectory(opts.persist); } - /** Mark the beginning of collection. Retained as a no-op for compatibility. 
*/ - start(): void { - return; - } - /** Ingest an evidence callback event from agent.execute(). */ record(event: AgentEvidenceEvent): void { switch (event.type) { diff --git a/packages/evals/tests/framework/trajectoryRecorder.test.ts b/packages/evals/tests/framework/trajectoryRecorder.test.ts index 0623d88b89..4dcf379e86 100644 --- a/packages/evals/tests/framework/trajectoryRecorder.test.ts +++ b/packages/evals/tests/framework/trajectoryRecorder.test.ts @@ -84,7 +84,6 @@ describe("TrajectoryRecorder", () => { const staleScreenshot = Buffer.from("stale-screen"); const probeScreenshot = Buffer.from("probe-screen"); - recorder.start(); recorder.record({ type: "screenshot", screenshot: staleScreenshot, @@ -138,7 +137,6 @@ describe("TrajectoryRecorder", () => { expect(trajectory.steps).toHaveLength(2); expect(trajectory.steps[0]).toMatchObject({ - index: 0, actionName: "click", probeEvidence: { url: "https://example.com/search", @@ -146,7 +144,6 @@ describe("TrajectoryRecorder", () => { }, }); expect(trajectory.steps[1]).toMatchObject({ - index: 1, actionName: "extract", toolOutput: { ok: true, result: { economy: "$100", business: "$250" } }, }); @@ -182,7 +179,6 @@ describe("TrajectoryRecorder", () => { }); const screenshot = Buffer.from("screen-1"); - recorder.start(); recordSimpleStep(recorder, screenshot); recordFinalAnswer(recorder, { message: "Complete.", From 754a54b0f55a7ddac7f964c3efb0438fae6bfe58 Mon Sep 17 00:00:00 2001 From: miguel Date: Sun, 24 May 2026 11:36:27 -0700 Subject: [PATCH 23/27] refactor(verifier): drop unused inferCuaToolOutput The CUA handler calls inferToolOutput directly now that the general helper handles the {success: boolean, error?: ...} shape via normalizeError. 
--- .../core/lib/v3/agent/utils/toolOutputEvidence.ts | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/packages/core/lib/v3/agent/utils/toolOutputEvidence.ts b/packages/core/lib/v3/agent/utils/toolOutputEvidence.ts index 9718181479..50426b0d12 100644 --- a/packages/core/lib/v3/agent/utils/toolOutputEvidence.ts +++ b/packages/core/lib/v3/agent/utils/toolOutputEvidence.ts @@ -1,5 +1,4 @@ import type { AgentStepFinishedEvent } from "../../types/public/agentEvidenceEvents.js"; -import type { ActionExecutionResult } from "../../types/public/agent.js"; const ERROR_STRING_LIMIT = 1000; @@ -75,13 +74,3 @@ export function inferToolOutput( error, }; } - -export function inferCuaToolOutput( - result: ActionExecutionResult | undefined, -): AgentStepFinishedEvent["toolOutput"] { - return { - ok: result?.success !== false, - result: result ?? { success: true }, - error: result?.error, - }; -} From a748399434bf0d83b8559a43a3cc11ab0c423382 Mon Sep 17 00:00:00 2001 From: miguel Date: Sun, 24 May 2026 11:45:41 -0700 Subject: [PATCH 24/27] only emit step when evidenceCallback is provided --- packages/core/lib/v3/handlers/v3CuaAgentHandler.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts index 901dce71da..73e86a9171 100644 --- a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts +++ b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts @@ -162,7 +162,7 @@ export class V3CuaAgentHandler { } action.timestamp = Date.now(); - if (shouldLog) { + if (shouldLog && this.evidenceCallback) { await this.emitCuaActionStep(action, executionResult); } From 61160d9b86ad0acf55efa1314c3abc9db0fa0616 Mon Sep 17 00:00:00 2001 From: miguel Date: Sun, 24 May 2026 11:57:41 -0700 Subject: [PATCH 25/27] perf(verifier): dedupe shared probe screenshots in writeTrajectoryDir A single post-turn probe is fanned across every step of a multi-tool turn, so those steps share the same 
screenshot Buffer by reference. writeTrajectoryDir was writing an identical PNG per step (probe/1.png, probe/2.png, ...). Dedupe by Buffer identity: write the PNG once and point every sharing step's screenshotPath at the same file. Behavior-preserving for single-probe steps. Co-Authored-By: Claude Opus 4.7 --- packages/core/lib/v3/verifier/trajectory.ts | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/packages/core/lib/v3/verifier/trajectory.ts b/packages/core/lib/v3/verifier/trajectory.ts index 223e4b1b92..1465d363e2 100644 --- a/packages/core/lib/v3/verifier/trajectory.ts +++ b/packages/core/lib/v3/verifier/trajectory.ts @@ -268,11 +268,19 @@ export async function writeTrajectoryDir( await fs.mkdir(path.join(dir, "screenshots", "agent"), { recursive: true }); const serializableSteps: unknown[] = []; + // A single post-turn probe is fanned across every step of a multi-tool turn, + // so the same screenshot Buffer is shared by reference. Dedupe by identity: + // write the PNG once and point every sharing step's screenshotPath at it. + const probePathByBuffer = new Map(); for (const [i, step] of trajectory.steps.entries()) { const probe: ProbeEvidence = { ...step.probeEvidence }; if (probe.screenshot) { - const relPath = `screenshots/probe/${i + 1}.png`; - await fs.writeFile(path.join(dir, relPath), probe.screenshot); + let relPath = probePathByBuffer.get(probe.screenshot); + if (!relPath) { + relPath = `screenshots/probe/${i + 1}.png`; + await fs.writeFile(path.join(dir, relPath), probe.screenshot); + probePathByBuffer.set(probe.screenshot, relPath); + } probe.screenshotPath = relPath; delete probe.screenshot; } From 98bd986e699fc978d6dc108b79a6e241601ece8b Mon Sep 17 00:00:00 2001 From: miguel Date: Mon, 25 May 2026 08:16:12 -0700 Subject: [PATCH 26/27] fix(cua,verifier): record failed actions + share agent screenshot across batched steps Two trajectory-fidelity gaps in CUA runs: 1. Failed actions were dropped. 
emitCuaActionStep only ran after a successful executeAction; a throwing action jumped to catch and rethrew, so no step_finished was recorded. Now the catch emits a step_finished {ok:false, error} (with a best-effort post-failure probe) before rethrowing, in a nested try/catch so evidence emission never masks the original error. emitCuaActionStep now takes an explicit toolOutput instead of deriving it from `result ?? {success:true}`. 2. Batched actions lost the agent screenshot. A CUA provider can choose several actions from one screenshot, but the recorder cleared the pending agent screenshot after the first step_finished, so later steps got no tier-1 frame. Renamed to latestAgentScreenshot; it now applies to every step until a newer agent screenshot replaces it (wiped on cancel()). writeTrajectoryDir dedupes the now-shared agent Buffer by identity so it isn't written once per step. Public onEvidence contract doc updated to describe the replay semantics. Tests: failed-action emits step_finished{ok:false} and rethrows; batched two-action turn shares the agent screenshot across both steps. 
Co-Authored-By: Claude Opus 4.7 --- .../core/lib/v3/handlers/v3CuaAgentHandler.ts | 49 ++++++++++++++--- .../v3/types/public/agentEvidenceEvents.ts | 10 ++-- packages/core/lib/v3/verifier/trajectory.ts | 16 ++++-- .../tests/unit/agent-captcha-hooks.test.ts | 55 +++++++++++++++++++ .../evals/framework/trajectoryRecorder.ts | 23 ++++---- .../framework/trajectoryRecorder.test.ts | 7 +++ 6 files changed, 133 insertions(+), 27 deletions(-) diff --git a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts index 73e86a9171..d4b359f8e3 100644 --- a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts +++ b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts @@ -22,7 +22,10 @@ import { SafetyConfirmationHandler, } from "../types/public/agent.js"; import { LogLine } from "../types/public/logs.js"; -import type { AgentEvidenceCallback } from "../types/public/agentEvidenceEvents.js"; +import type { + AgentEvidenceCallback, + AgentStepFinishedEvent, +} from "../types/public/agentEvidenceEvents.js"; import { type Action, V3FunctionName } from "../types/public/methods.js"; import { FlowLogger } from "../flowlogger/FlowLogger.js"; import { toTitleCase } from "../../utils.js"; @@ -131,6 +134,10 @@ export class V3CuaAgentHandler { const waitBetween = (this.options.clientOptions?.waitBetweenActions as number) || defaultDelay; + // Skip logging for screenshot actions - they're no-ops; the CUA client + // takes its own screenshot via screenshotProvider between API turns. + // Computed outside the try so the catch can still record a failed step. 
+ const shouldLog = action.type !== "screenshot"; try { let executionResult: ActionExecutionResult | undefined; // Try to inject cursor before each action if enabled @@ -142,9 +149,6 @@ export class V3CuaAgentHandler { } } await new Promise((r) => setTimeout(r, 300)); - // Skip logging for screenshot actions - they're no-ops; the CUA client - // takes its own screenshot via screenshotProvider between API turns. - const shouldLog = action.type !== "screenshot"; if (shouldLog) { executionResult = await FlowLogger.runWithLogging( { @@ -163,7 +167,10 @@ export class V3CuaAgentHandler { action.timestamp = Date.now(); if (shouldLog && this.evidenceCallback) { - await this.emitCuaActionStep(action, executionResult); + await this.emitCuaActionStep( + action, + inferToolOutput(executionResult ?? { success: true }), + ); } await new Promise((r) => setTimeout(r, waitBetween)); @@ -174,6 +181,30 @@ export class V3CuaAgentHandler { message: `Error executing action ${action.type}: ${msg}`, level: 0, }); + // Record the failed action as an ok:false step (with a best-effort + // post-failure probe, since a throwing action can still partially + // mutate the page) before rethrowing — otherwise the failure is + // dropped from the persisted trajectory. Evidence emission must never + // mask the original action error. + if (shouldLog && this.evidenceCallback) { + try { + await this.emitCuaActionStep(action, { + ok: false, + result: undefined, + error: msg, + }); + } catch (evidenceError) { + this.logger({ + category: "agent", + message: `Failed to record failed-action evidence: ${ + evidenceError instanceof Error + ? 
evidenceError.message + : String(evidenceError) + }`, + level: 1, + }); + } + } throw error; } }); @@ -835,7 +866,7 @@ export class V3CuaAgentHandler { private async emitCuaActionStep( action: AgentAction, - result: ActionExecutionResult | undefined, + toolOutput: AgentStepFinishedEvent["toolOutput"], ): Promise { let pageUrl = typeof action.pageUrl === "string" @@ -848,7 +879,9 @@ export class V3CuaAgentHandler { } const actionArgs = Object.fromEntries( - Object.entries(action).filter(([key]) => key !== "screenshot"), + Object.entries(action).filter( + ([key]) => key !== "screenshot" && key !== "timestamp", + ), ); const reasoning = typeof action.reasoning === "string" @@ -862,7 +895,7 @@ export class V3CuaAgentHandler { actionName: String(action.type), actionArgs, reasoning, - toolOutput: inferToolOutput(result ?? { success: true }), + toolOutput, }); // Post-action tier-2 probe. The pre-action screenshot from diff --git a/packages/core/lib/v3/types/public/agentEvidenceEvents.ts b/packages/core/lib/v3/types/public/agentEvidenceEvents.ts index d74b56a67c..30c4c2f2a2 100644 --- a/packages/core/lib/v3/types/public/agentEvidenceEvents.ts +++ b/packages/core/lib/v3/types/public/agentEvidenceEvents.ts @@ -3,10 +3,12 @@ * * These events describe observations made by Stagehand during an agent run. * They are emitted in temporal order; consumers should treat the stream as - * sequential (pair an agent-role screenshot with the next step_finished, - * apply a step_observed/probe to all steps_finished since the last probe). - * Verifier-specific storage and normalization live in the evals/verifier - * layers. + * sequential. An agent-role screenshot applies to every subsequent + * step_finished until a newer agent-role screenshot replaces it — a CUA + * provider may choose multiple actions from a single screenshot, so each of + * those steps shares that frame. A step_observed/probe applies to all + * step_finished events received since the last probe. 
Verifier-specific + * storage and normalization live in the evals/verifier layers. */ export type AgentEvidenceRole = "probe" | "agent"; diff --git a/packages/core/lib/v3/verifier/trajectory.ts b/packages/core/lib/v3/verifier/trajectory.ts index 1465d363e2..4902f1f63c 100644 --- a/packages/core/lib/v3/verifier/trajectory.ts +++ b/packages/core/lib/v3/verifier/trajectory.ts @@ -269,9 +269,11 @@ export async function writeTrajectoryDir( const serializableSteps: unknown[] = []; // A single post-turn probe is fanned across every step of a multi-tool turn, - // so the same screenshot Buffer is shared by reference. Dedupe by identity: - // write the PNG once and point every sharing step's screenshotPath at it. + // and a single agent screenshot is shared across every action a CUA provider + // chose from it, so the same Buffer is shared by reference. Dedupe by + // identity: write the PNG once and point every sharing step at the same file. const probePathByBuffer = new Map(); + const agentPathByBuffer = new Map(); for (const [i, step] of trajectory.steps.entries()) { const probe: ProbeEvidence = { ...step.probeEvidence }; if (probe.screenshot) { @@ -303,9 +305,13 @@ export async function writeTrajectoryDir( ); continue; } - const suffix = multipleImages ? `_${imageSeq}` : ""; - const relPath = `screenshots/agent/${i + 1}${suffix}.png`; - await fs.writeFile(path.join(dir, relPath), m.bytes); + let relPath = agentPathByBuffer.get(m.bytes); + if (!relPath) { + const suffix = multipleImages ? 
`_${imageSeq}` : ""; + relPath = `screenshots/agent/${i + 1}${suffix}.png`; + await fs.writeFile(path.join(dir, relPath), m.bytes); + agentPathByBuffer.set(m.bytes, relPath); + } modalities.push({ type: "image", imagePath: relPath, diff --git a/packages/core/tests/unit/agent-captcha-hooks.test.ts b/packages/core/tests/unit/agent-captcha-hooks.test.ts index 4789fb5c63..62b8d38246 100644 --- a/packages/core/tests/unit/agent-captcha-hooks.test.ts +++ b/packages/core/tests/unit/agent-captcha-hooks.test.ts @@ -557,4 +557,59 @@ describe("v3 cua handler screenshot behavior", () => { ), ).toBe(true); }); + + it("records a failed action as step_finished {ok:false} and rethrows the original error", async () => { + const events: Array<{ type: string; [k: string]: unknown }> = []; + const onEvidence = vi.fn(async (event: { type: string }) => { + events.push(event as { type: string }); + }); + + fakeCuaClient.executeImpl = vi.fn(async () => { + await fakeCuaClient.actionHandler?.({ + type: "click", + button: "left", + x: 5, + y: 9, + }); + return { success: true, message: "ok", actions: [], completed: true }; + }); + + const handler = new V3CuaAgentHandler( + { + context: { + awaitActivePage: async () => page, + }, + isCaptchaAutoSolveEnabled: false, + isAdvancedStealth: false, + configuredViewport: { width: 1288, height: 711 }, + isAgentReplayActive: () => false, + updateMetrics: vi.fn(), + } as never, + logger, + { + modelName: "openai/gpt-5.4", + clientOptions: { waitBetweenActions: 1 }, + } as never, + ); + vi.spyOn( + handler as unknown as { + executeAction: (action: Record) => Promise; + }, + "executeAction", + ).mockRejectedValue(new Error("click failed")); + + await expect( + handler.execute({ + instruction: "click the thing", + highlightCursor: false, + callbacks: { onEvidence }, + }), + ).rejects.toThrow("click failed"); + + const stepFinished = events.find((e) => e.type === "step_finished"); + expect(stepFinished).toMatchObject({ + actionName: "click", + toolOutput: 
{ ok: false, error: "click failed" }, + }); + }); }); diff --git a/packages/evals/framework/trajectoryRecorder.ts b/packages/evals/framework/trajectoryRecorder.ts index d24b53ae9e..789bb72f4b 100644 --- a/packages/evals/framework/trajectoryRecorder.ts +++ b/packages/evals/framework/trajectoryRecorder.ts @@ -59,11 +59,12 @@ export class TrajectoryRecorder { // Steps are appended in arrival order on each step_finished event. private readonly steps: TrajectoryStep[] = []; - // The most recent agent-role screenshot is held until the next step_finished - // consumes it. A second agent-role screenshot before any step_finished - // overwrites the first — that's the desired behavior when a turn is skipped - // (e.g., captcha guard short-circuits before emitting step_finished). - private pendingAgentScreenshot?: Buffer; + // The most recent agent-role screenshot. It applies to every step_finished + // until a newer agent-role screenshot replaces it — a CUA provider can pick + // multiple actions from one screenshot, so each of those steps must carry + // that same tier-1 frame. (It is NOT cleared on consume; it is only replaced + // by a newer screenshot, or wiped on cancel().) + private latestAgentScreenshot?: Buffer; // The most recent probe-role screenshot waits for the matching step_observed. private pendingProbeScreenshot?: Buffer; // Steps that haven't yet had a probe attached. 
The next step_observed fans @@ -74,7 +75,7 @@ export class TrajectoryRecorder { private onScreenshot(e: AgentScreenshotEvidenceEvent): void { if (e.evidenceRole === "agent") { - this.pendingAgentScreenshot = e.screenshot; + this.latestAgentScreenshot = e.screenshot; } else { this.pendingProbeScreenshot = e.screenshot; } @@ -82,10 +83,10 @@ export class TrajectoryRecorder { private onStepFinished(e: AgentStepFinishedEvent): void { const modalities: AgentEvidence["modalities"] = []; - if (this.pendingAgentScreenshot) { + if (this.latestAgentScreenshot) { modalities.push({ type: "image", - bytes: this.pendingAgentScreenshot, + bytes: this.latestAgentScreenshot, mediaType: "image/png", }); } @@ -94,7 +95,9 @@ export class TrajectoryRecorder { buildAgentEvidenceFromStepFinished(e), ); - this.pendingAgentScreenshot = undefined; + // Intentionally not cleared here: the same agent screenshot applies to + // every step in a batched CUA turn. It's replaced when a newer agent + // screenshot arrives (onScreenshot) or wiped on cancel(). this.stepsAwaitingProbe.push(this.steps.length); this.steps.push({ actionName: e.actionName, @@ -188,7 +191,7 @@ export class TrajectoryRecorder { /** Throw away in-memory state without writing to disk. Used on early abort. */ cancel(): void { this.steps.length = 0; - this.pendingAgentScreenshot = undefined; + this.latestAgentScreenshot = undefined; this.pendingProbeScreenshot = undefined; this.stepsAwaitingProbe = []; this.finalAnswerEvent = undefined; diff --git a/packages/evals/tests/framework/trajectoryRecorder.test.ts b/packages/evals/tests/framework/trajectoryRecorder.test.ts index 4dcf379e86..320c4a5259 100644 --- a/packages/evals/tests/framework/trajectoryRecorder.test.ts +++ b/packages/evals/tests/framework/trajectoryRecorder.test.ts @@ -159,6 +159,13 @@ describe("TrajectoryRecorder", () => { { type: "text", content: "Open fare details." 
}, ]), ); + // Both actions were chosen from the same agent screenshot (one screenshot, + // two step_finished), so the second step must carry that frame too. + expect(trajectory.steps[1].agentEvidence.modalities).toEqual( + expect.arrayContaining([ + { type: "image", bytes: screenshot, mediaType: "image/png" }, + ]), + ); expect(trajectory.finalAnswer).toBe("Business is $150 more than economy."); expect(trajectory.finalObservation).toMatchObject({ url: "https://example.com/complete", From 2c95836395410e258e18de6cb1597cea6cfee8b7 Mon Sep 17 00:00:00 2001 From: miguel Date: Mon, 25 May 2026 08:41:23 -0700 Subject: [PATCH 27/27] perf(cua): gate emitCuaScreenshot on evidenceCallback screenshotProvider/captureAndSendScreenshot call emitCuaScreenshot unconditionally; early-return when no recorder is attached so a plain CUA run does no extra work (the lastAgentScreenshotUrl bookkeeping is only read by evidence-gated code). Mirrors the emitCuaActionStep call-site gating. Co-Authored-By: Claude Opus 4.7 --- packages/core/lib/v3/handlers/v3CuaAgentHandler.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts index d4b359f8e3..22513339c0 100644 --- a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts +++ b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts @@ -855,8 +855,9 @@ export class V3CuaAgentHandler { screenshot: Buffer, url: string, ): Promise<void> { + if (!this.evidenceCallback) return; this.lastAgentScreenshotUrl = url; - await this.evidenceCallback?.({ + await this.evidenceCallback({ type: "screenshot", screenshot, url,