diff --git a/.changeset/verifier-trajectory-events.md b/.changeset/verifier-trajectory-events.md new file mode 100644 index 0000000000..4a4ee2e32e --- /dev/null +++ b/.changeset/verifier-trajectory-events.md @@ -0,0 +1,5 @@ +--- +"@browserbasehq/stagehand": patch +--- + +Capture verifier trajectory evidence from agent evidence callbacks for offline scoring. diff --git a/packages/core/lib/v3/agent/utils/captureAriaTreeProbe.ts b/packages/core/lib/v3/agent/utils/captureAriaTreeProbe.ts new file mode 100644 index 0000000000..b68663eb04 --- /dev/null +++ b/packages/core/lib/v3/agent/utils/captureAriaTreeProbe.ts @@ -0,0 +1,73 @@ +/** + * captureAriaTreeProbe — capture a truncated accessibility tree of the active + * page for use as tier-2 evidence in the trajectory recorder. + * + * Shared by v3AgentHandler and v3CuaAgentHandler. Listener-gated by the + * callers so ordinary agent runs (no TrajectoryRecorder attached) don't pay + * the cost. + * + * The a11y tree is the same payload the agent's `ariaTree` tool sees, but + * captured by the harness (not the agent) so the verifier has independent + * textual ground truth for grounding non-visual claims — prices, names, + * dates, list contents — without OCR'ing screenshots. + * + * Budget: defaults to ~8000 tokens (32k chars). Per-step a11y captures + * across a ~30-step trajectory at that cap sum to ~240k tokens total, + * which the verifier handles via per-criterion top-K selection. The cap + * is configurable via VERIFIER_ARIATREE_TOKEN_BUDGET so consumers can + * trade RAM/disk for fidelity. Truncated content is marked explicitly so + * the verifier knows it was clipped. + */ +import type { V3 } from "../../v3.js"; + +const APPROX_CHARS_PER_TOKEN = 4; +const DEFAULT_TOKEN_BUDGET = 8_000; +const DEFAULT_TIMEOUT_MS = 5_000; + +interface CaptureAriaTreeOptions { + /** Soft cap on token count (chars/4 approximation). Default 8000. */ + tokenBudget?: number; + /** Hard timeout on the capture. Default 5s. 
/**
 * Capture the active page's accessibility tree as best-effort probe evidence.
 *
 * Budget resolution order: a usable `opts.tokenBudget`, then a usable
 * `VERIFIER_ARIATREE_TOKEN_BUDGET` environment value, then the default. A
 * budget is usable only when it is a finite number greater than zero, so an
 * explicit `0`, negative, or `NaN` budget falls through to the next source
 * instead of producing a bogus character cap.
 *
 * @param v3 Stagehand instance; its schema-less `extract()` result is read for
 *   a string `pageText`.
 * @param opts Optional token budget and timeout overrides.
 * @returns The page text, clipped to the budget with an explicit truncation
 *   marker when it was too long, or `undefined` when extraction throws or
 *   yields no text. Never throws.
 */
export async function captureAriaTreeProbe(
  v3: V3,
  opts: CaptureAriaTreeOptions = {},
): Promise<string | undefined> {
  // Guard shared by both budget sources. Without it a negative budget gives a
  // negative slice end (which trims from the tail rather than capping) and a
  // NaN budget silently disables the cap.
  const isUsableBudget = (candidate: number | undefined): candidate is number =>
    typeof candidate === "number" &&
    Number.isFinite(candidate) &&
    candidate > 0;

  const envBudget = parseInt(
    process.env.VERIFIER_ARIATREE_TOKEN_BUDGET ?? "",
    10,
  );

  let tokenBudget = DEFAULT_TOKEN_BUDGET;
  if (isUsableBudget(opts.tokenBudget)) {
    tokenBudget = opts.tokenBudget;
  } else if (isUsableBudget(envBudget)) {
    tokenBudget = envBudget;
  }

  const timeoutMs = opts.timeoutMs ?? DEFAULT_TIMEOUT_MS;
  const maxChars = tokenBudget * APPROX_CHARS_PER_TOKEN;

  try {
    // v3.extract() without a schema returns { pageText } where pageText is the
    // rendered accessibility tree — same path the agent's ariaTree tool uses.
    const result = await v3.extract({ timeout: timeoutMs });
    const pageText = result?.pageText;
    if (typeof pageText !== "string" || pageText.length === 0) return undefined;

    if (pageText.length <= maxChars) return pageText;

    // Mark the clip explicitly so downstream readers know content is missing.
    return (
      pageText.slice(0, maxChars) +
      `\n\n[CONTENT TRUNCATED at ~${tokenBudget} tokens — set VERIFIER_ARIATREE_TOKEN_BUDGET to raise]`
    );
  } catch {
    // Best-effort evidence: a failed capture is reported as "no tree".
    return undefined;
  }
}
/**
 * Take one harness-side observation of the active page.
 *
 * The URL starts as the caller-supplied fallback and is replaced by the active
 * page's URL when the page can be resolved. A screenshot failure (or a failure
 * to resolve the page) is logged at level 1 with `warningMessage` as prefix and
 * does not abort the probe; the accessibility tree is still attempted.
 *
 * @returns An object that always carries `url`, plus `screenshot` and
 *   `ariaTree` only when each was actually captured.
 */
export async function captureProbeEvidence(
  options: CaptureProbeEvidenceOptions,
): Promise<{
  url: string;
  screenshot?: Buffer;
  ariaTree?: string;
}> {
  const { v3, url, logger, warningMessage } = options;
  const evidence: { url: string; screenshot?: Buffer; ariaTree?: string } = {
    url,
  };

  try {
    const activePage = await v3.context.awaitActivePage();
    // Record the live URL first so it survives a subsequent screenshot failure.
    evidence.url = activePage.url();
    const png = await activePage.screenshot({ fullPage: false });
    if (png) {
      evidence.screenshot = png;
    }
  } catch (cause) {
    logger({
      category: "agent",
      message: `${warningMessage}: ${errorMessage(cause)}`,
      level: 1,
    });
  }

  const tree = await captureAriaTreeProbe(v3);
  if (tree !== undefined) {
    evidence.ariaTree = tree;
  }
  return evidence;
}
/**
 * Convert a tool result's `error` field into a bounded, printable message.
 *
 * "No error" sentinels — `undefined`, `null`, `false`, and the empty string —
 * yield `undefined` so callers do not report a failure with a blank message.
 * Every other value yields a string: `Error` instances use their message
 * (falling back to the error name when the message is empty), primitives are
 * stringified, and anything else is JSON-serialized with `String()` as the
 * fallback when serialization throws or produces nothing.
 *
 * All text results are capped at ERROR_STRING_LIMIT characters and suffixed
 * with `... [truncated]` when clipped, regardless of where the text came from.
 *
 * @param value Raw `error` value taken from a tool result.
 * @returns The normalized message, or `undefined` when there is no error.
 */
function normalizeError(value: unknown): string | undefined {
  // Single cap applied to strings, Error messages, and serialized objects alike.
  const clip = (text: string): string =>
    text.length <= ERROR_STRING_LIMIT
      ? text
      : `${text.slice(0, ERROR_STRING_LIMIT)}... [truncated]`;

  if (
    value === undefined ||
    value === null ||
    value === false ||
    value === ""
  ) {
    return undefined;
  }
  if (value instanceof Error) {
    // An Error object always signals failure, even with an empty message.
    return clip(value.message || value.name);
  }
  if (typeof value === "string") {
    return clip(value);
  }
  if (
    typeof value === "number" ||
    typeof value === "boolean" ||
    typeof value === "bigint"
  ) {
    return String(value);
  }

  let serialized: string;
  try {
    // JSON.stringify returns undefined for functions/symbols; fall back then too.
    serialized = JSON.stringify(value) ?? String(value);
  } catch {
    // Circular structures and BigInt-bearing objects cannot be serialized.
    serialized = String(value);
  }
  return clip(serialized);
}
// onEvidence is supplied by the caller as an observability hook (for example
// trajectory recording or verifier capture). It is wrapped exactly once, at
// the point the handler receives it, so that neither a synchronous throw nor
// a rejected promise from user code can interrupt the agent loop. Emit sites
// may therefore await the wrapped callback without their own try/catch.
export function wrapEvidenceCallback(
  callback: AgentEvidenceCallback | undefined,
  logger: (message: LogLine) => void,
): AgentEvidenceCallback | undefined {
  if (callback) {
    const guarded: AgentEvidenceCallback = async (event) => {
      try {
        await callback(event);
      } catch (failure) {
        // Report and continue: evidence delivery is never fatal.
        const reason =
          failure instanceof Error ? failure.message : String(failure);
        logger({
          category: "agent",
          message: `Warning: onEvidence callback failed for ${event.type}: ${reason}`,
          level: 1,
        });
      }
    };
    return guarded;
  }
  return undefined;
}
@@ -244,9 +264,7 @@ export class V3AgentHandler { private createStepHandler( state: AgentState, - userCallback?: - | GenerateTextOnStepFinishCallback - | StreamTextOnStepFinishCallback, + { userCallback, evidenceCallback, onFinalAnswer }: StepHandlerOptions, ) { return async (event: StepResult) => { this.logger({ @@ -255,6 +273,8 @@ export class V3AgentHandler { level: 2, }); + let lastFinalAnswer: FinalAnswerDraft | undefined; + if (event.toolCalls && event.toolCalls.length > 0) { for (let i = 0; i < event.toolCalls.length; i++) { const toolCall = event.toolCalls[i]; @@ -279,6 +299,13 @@ export class V3AgentHandler { ? `${allReasoning} ${doneReasoning}`.trim() : allReasoning || "Task completed successfully"; } + lastFinalAnswer = { + message: state.finalMessage, + output: + typeof args?.output === "object" && args?.output !== null + ? (args.output as Record) + : undefined, + }; } const mappedActions = mapToolResultToActions({ toolCallName: toolCall.toolName, @@ -292,8 +319,35 @@ export class V3AgentHandler { action.timestamp = Date.now(); state.actions.push(action); } + + await evidenceCallback?.({ + type: "step_finished", + actionName: toolCall.toolName, + actionArgs: + typeof args === "object" && args !== null + ? (args as Record) + : {}, + reasoning: event.text ?? "", + toolOutput: inferToolOutput(toolResult), + }); } state.currentPageUrl = (await this.v3.context.awaitActivePage()).url(); + + // Harness probe — one screenshot / a11y snapshot per AI SDK step. + // The recorder applies the probe to every step_finished received + // since the previous probe, so a multi-tool turn shares the same + // post-turn observation. 
+ await emitPostStepProbeEvidence({ + v3: this.v3, + url: state.currentPageUrl, + evidenceCallback, + logger: this.logger, + warningMessage: "Warning: harness probe failed", + }); + } + + if (lastFinalAnswer) { + onFinalAnswer?.(lastFinalAnswer); } if (userCallback) { @@ -321,6 +375,7 @@ export class V3AgentHandler { completed: false, currentPageUrl: "", }; + let finalAnswerFromDoneTool: FinalAnswerDraft | undefined; let messages: ModelMessage[] = []; let captchaSolver: CaptchaSolver | undefined; @@ -368,6 +423,11 @@ export class V3AgentHandler { } } + const evidenceCallback = wrapEvidenceCallback( + callbacks?.onEvidence, + this.logger, + ); + const result = await this.llmClient.generateText({ model: wrappedModel, messages: prependSystemMessage(systemPrompt, messages), @@ -379,7 +439,13 @@ export class V3AgentHandler { callbacks?.prepareStep, captchaSolver, ), - onStepFinish: this.createStepHandler(state, callbacks?.onStepFinish), + onStepFinish: this.createStepHandler(state, { + userCallback: callbacks?.onStepFinish, + evidenceCallback, + onFinalAnswer: (answer) => { + finalAnswerFromDoneTool = answer; + }, + }), abortSignal: preparedOptions.signal, providerOptions: { google: { mediaResolution: "MEDIA_RESOLUTION_HIGH" }, @@ -396,6 +462,15 @@ export class V3AgentHandler { preparedOptions.output, this.logger, ); + const output = doneResult.output ?? finalAnswerFromDoneTool?.output; + await this.emitFinalEvidence( + state, + { + message: state.finalMessage, + output, + }, + evidenceCallback, + ); return this.consolidateMetricsAndResult( startTime, @@ -403,7 +478,7 @@ export class V3AgentHandler { doneResult.messages, result, maxSteps, - doneResult.output, + output, ); } catch (error) { // Re-throw validation errors that should propagate to the caller @@ -449,6 +524,7 @@ export class V3AgentHandler { // Highlight cursor defaults to true for hybrid mode, can be overridden const shouldHighlightCursor = streamOptions?.highlightCursor ?? 
this.mode === "hybrid"; + let finalAnswerFromDoneTool: FinalAnswerDraft | undefined; const { options, @@ -503,6 +579,11 @@ export class V3AgentHandler { rejectResult(error); }; + const evidenceCallback = wrapEvidenceCallback( + callbacks?.onEvidence, + this.logger, + ); + let streamResult: ReturnType; try { streamResult = this.llmClient.streamText({ @@ -515,7 +596,13 @@ export class V3AgentHandler { callbacks?.prepareStep, captchaSolver, ), - onStepFinish: this.createStepHandler(state, callbacks?.onStepFinish), + onStepFinish: this.createStepHandler(state, { + userCallback: callbacks?.onStepFinish, + evidenceCallback, + onFinalAnswer: (answer) => { + finalAnswerFromDoneTool = answer; + }, + }), onError: (event) => { captchaSolver?.dispose(); if (callbacks?.onError) { @@ -541,17 +628,29 @@ export class V3AgentHandler { options.instruction, options.output, this.logger, - ).then((doneResult) => { - const result = this.consolidateMetricsAndResult( - startTime, - state, - doneResult.messages, - event, - maxSteps, - doneResult.output, - ); - resolveResult(result); - }); + ) + .then(async (doneResult) => { + const output = + doneResult.output ?? 
/**
 * Emit the terminal `final_answer` evidence event together with one last
 * harness observation of the page.
 *
 * Does nothing when no evidence callback is attached, so runs without a
 * listener skip the final probe entirely. The probe uses the last known page
 * URL from `state` as its fallback URL.
 *
 * @param state Agent run state; only `currentPageUrl` is read here.
 * @param finalAnswer Final message and optional structured output to report.
 * @param evidenceCallback Evidence listener, if any.
 */
private async emitFinalEvidence(
  state: AgentState,
  finalAnswer: { message: string; output?: Record<string, unknown> },
  evidenceCallback?: AgentEvidenceCallback,
): Promise<void> {
  if (evidenceCallback) {
    const terminalProbe = await captureProbeEvidence({
      v3: this.v3,
      url: state.currentPageUrl,
      logger: this.logger,
      warningMessage: "Warning: final harness probe failed",
    });

    await evidenceCallback({
      type: "final_answer",
      ...finalAnswer,
      observation: terminalProbe,
    });
  }
}
"../types/public/logs.js"; +import type { + AgentEvidenceCallback, + AgentStepFinishedEvent, +} from "../types/public/agentEvidenceEvents.js"; import { type Action, V3FunctionName } from "../types/public/methods.js"; import { FlowLogger } from "../flowlogger/FlowLogger.js"; import { toTitleCase } from "../../utils.js"; @@ -37,6 +47,8 @@ export class V3CuaAgentHandler { private captchaSolver: CaptchaSolver | null = null; private captchaClickGuardRemaining = 0; private currentInstruction = ""; + private lastAgentScreenshotUrl?: string; + private evidenceCallback?: AgentEvidenceCallback; constructor( v3: V3, @@ -76,6 +88,9 @@ export class V3CuaAgentHandler { this.ensureNotClosed(); const page = await this.v3.context.awaitActivePage(); const screenshotBuffer = await page.screenshot({ fullPage: false }); + + await this.emitCuaScreenshot(screenshotBuffer, page.url()); + return screenshotBuffer.toString("base64"); // base64 png }); @@ -119,7 +134,12 @@ export class V3CuaAgentHandler { const waitBetween = (this.options.clientOptions?.waitBetweenActions as number) || defaultDelay; + // Skip logging for screenshot actions - they're no-ops; the CUA client + // takes its own screenshot via screenshotProvider between API turns. + // Computed outside the try so the catch can still record a failed step. + const shouldLog = action.type !== "screenshot"; try { + let executionResult: ActionExecutionResult | undefined; // Try to inject cursor before each action if enabled if (this.highlightCursor) { try { @@ -129,11 +149,8 @@ export class V3CuaAgentHandler { } } await new Promise((r) => setTimeout(r, 300)); - // Skip logging for screenshot actions - they're no-ops; the CUA client - // takes its own screenshot via screenshotProvider between API turns. - const shouldLog = action.type !== "screenshot"; if (shouldLog) { - await FlowLogger.runWithLogging( + executionResult = await FlowLogger.runWithLogging( { eventType: `V3Cua${toTitleCase(action.type)}`, // e.g. 
"V3CuaClick" data: { @@ -145,10 +162,16 @@ export class V3CuaAgentHandler { [action], ); } else { - await this.executeAction(action); + executionResult = await this.executeAction(action); } action.timestamp = Date.now(); + if (shouldLog && this.evidenceCallback) { + await this.emitCuaActionStep( + action, + inferToolOutput(executionResult ?? { success: true }), + ); + } await new Promise((r) => setTimeout(r, waitBetween)); } catch (error) { @@ -158,6 +181,30 @@ export class V3CuaAgentHandler { message: `Error executing action ${action.type}: ${msg}`, level: 0, }); + // Record the failed action as an ok:false step (with a best-effort + // post-failure probe, since a throwing action can still partially + // mutate the page) before rethrowing — otherwise the failure is + // dropped from the persisted trajectory. Evidence emission must never + // mask the original action error. + if (shouldLog && this.evidenceCallback) { + try { + await this.emitCuaActionStep(action, { + ok: false, + result: undefined, + error: msg, + }); + } catch (evidenceError) { + this.logger({ + category: "agent", + message: `Failed to record failed-action evidence: ${ + evidenceError instanceof Error + ? 
evidenceError.message + : String(evidenceError) + }`, + level: 1, + }); + } + } throw error; } }); @@ -184,6 +231,11 @@ export class V3CuaAgentHandler { : optionsOrInstruction; this.setSafetyConfirmationHandler(options.callbacks?.onSafetyConfirmation); + this.evidenceCallback = wrapEvidenceCallback( + options.callbacks?.onEvidence, + this.logger, + ); + this.lastAgentScreenshotUrl = undefined; this.highlightCursor = options.highlightCursor !== false; this.currentInstruction = options.instruction; @@ -239,7 +291,28 @@ export class V3CuaAgentHandler { let result: AgentResult; try { result = await this.agent.execute({ options, logger: this.logger }); + if (this.evidenceCallback) { + let finalUrl = ""; + try { + finalUrl = (await this.v3.context.awaitActivePage()).url(); + } catch { + finalUrl = this.lastAgentScreenshotUrl ?? ""; + } + const observation = await captureProbeEvidence({ + v3: this.v3, + url: finalUrl, + logger: this.logger, + warningMessage: "Warning: CUA final probe failed", + }); + await this.evidenceCallback({ + type: "final_answer", + message: result.message, + output: result.output, + observation, + }); + } } finally { + this.evidenceCallback = undefined; this.captchaSolver?.dispose(); this.captchaSolver = null; } @@ -658,6 +731,10 @@ export class V3CuaAgentHandler { const screenshotBuffer = await page.screenshot({ fullPage: false }); const currentUrl = page.url(); + + // Mirror the same buffer the CUA client receives as agent evidence. + await this.emitCuaScreenshot(screenshotBuffer, currentUrl); + return await this.agentClient.captureScreenshot({ base64Image: screenshotBuffer.toString("base64"), currentUrl, @@ -767,6 +844,75 @@ export class V3CuaAgentHandler { } } + /** + * Emit a pre-action CUA screenshot — the exact buffer the model received + * as input. 
/**
 * Report one executed CUA action as a `step_finished` event, then emit the
 * post-action probe for it.
 *
 * The probe's fallback URL is resolved before anything is emitted: the active
 * page's URL when it can be read, otherwise the action's own `pageUrl` (when
 * it is a string), otherwise the URL of the last agent screenshot, otherwise
 * an empty string.
 *
 * @param action The action that was executed; every field except `screenshot`
 *   and `timestamp` is forwarded as `actionArgs`.
 * @param toolOutput Outcome to attach to the `step_finished` event.
 */
private async emitCuaActionStep(
  action: AgentAction,
  toolOutput: AgentStepFinishedEvent["toolOutput"],
): Promise<void> {
  let pageUrl: string;
  if (typeof action.pageUrl === "string") {
    pageUrl = action.pageUrl;
  } else {
    pageUrl = this.lastAgentScreenshotUrl ?? "";
  }
  try {
    const activePage = await this.v3.context.awaitActivePage();
    pageUrl = activePage.url();
  } catch {
    // Keep the best pre-action URL fallback.
  }

  const forwardedEntries = Object.entries(action).filter(
    ([field]) => !(field === "screenshot" || field === "timestamp"),
  );
  const actionArgs = Object.fromEntries(forwardedEntries);

  // Prefer explicit reasoning text, then the action's description, then none.
  let reasoning = "";
  if (typeof action.reasoning === "string") {
    reasoning = action.reasoning;
  } else if (typeof action.action === "string") {
    reasoning = action.action;
  }

  await this.evidenceCallback?.({
    type: "step_finished",
    actionName: String(action.type),
    actionArgs,
    reasoning,
    toolOutput,
  });

  // Post-action probe: the screenshot handed to the model was taken before the
  // action ran, so this separate capture is what shows the page afterwards.
  await emitPostStepProbeEvidence({
    v3: this.v3,
    url: pageUrl,
    evidenceCallback: this.evidenceCallback,
    logger: this.logger,
    warningMessage: "Warning: CUA post-action probe failed",
  });
}
b/packages/core/lib/v3/types/public/agent.ts @@ -15,6 +15,7 @@ import { import { LogLine } from "./logs.js"; import { ClientOptions } from "./model.js"; import { StagehandZodObject } from "../../zodCompat.js"; +import type { AgentEvidenceCallback } from "./agentEvidenceEvents.js"; // Re-export ModelMessage for consumers who want to use it for conversation continuation export type { ModelMessage } from "ai"; @@ -136,6 +137,11 @@ export interface AgentCallbacks { onStepFinish?: | GenerateTextOnStepFinishCallback | StreamTextOnStepFinishCallback; + /** + * Callback called when Stagehand captures agent-run evidence such as + * screenshots, completed tool/action steps, or post-action observations. + */ + onEvidence?: AgentEvidenceCallback; } /** diff --git a/packages/core/lib/v3/types/public/agentEvidenceEvents.ts b/packages/core/lib/v3/types/public/agentEvidenceEvents.ts new file mode 100644 index 0000000000..30c4c2f2a2 --- /dev/null +++ b/packages/core/lib/v3/types/public/agentEvidenceEvents.ts @@ -0,0 +1,99 @@ +/** + * Evidence events emitted through AgentExecuteOptions.callbacks.onEvidence. + * + * These events describe observations made by Stagehand during an agent run. + * They are emitted in temporal order; consumers should treat the stream as + * sequential. An agent-role screenshot applies to every subsequent + * step_finished until a newer agent-role screenshot replaces it — a CUA + * provider may choose multiple actions from a single screenshot, so each of + * those steps shares that frame. A step_observed/probe applies to all + * step_finished events received since the last probe. Verifier-specific + * storage and normalization live in the evals/verifier layers. + */ + +export type AgentEvidenceRole = "probe" | "agent"; + +export type AgentEvidenceEvent = + | AgentScreenshotEvidenceEvent + | AgentStepFinishedEvent + | AgentStepObservedEvent + | AgentFinalAnswerEvent; + +/** + * Screenshot captured during an agent run. 
+ * + * In DOM/hybrid mode, post-tool screenshots are probe evidence. In CUA mode, + * screenshots captured by the screenshot provider are agent evidence because + * they are the exact bytes sent to the provider. + */ +export interface AgentScreenshotEvidenceEvent { + type: "screenshot"; + /** PNG bytes from page.screenshot(). */ + screenshot: Buffer; + /** Page URL at the time of capture. */ + url: string; + /** Role this screenshot plays in downstream evidence collection. */ + evidenceRole: AgentEvidenceRole; +} + +/** + * One completed agent tool/action step. + */ +export interface AgentStepFinishedEvent { + type: "step_finished"; + /** Name of the tool/action that ran, e.g. "act", "extract", "click". */ + actionName: string; + /** Arguments passed to the tool/action. */ + actionArgs: Record; + /** Agent textual reasoning for the step, when available. */ + reasoning: string; + /** Outcome of the tool/action as seen by Stagehand. */ + toolOutput: { + ok: boolean; + /** Native return value from the tool/action. */ + result: unknown; + error?: string; + }; +} + +/** + * Independent post-step browser observation. Emitted once per agent turn; + * consumers apply it to every step_finished received since the previous probe. + */ +export interface AgentStepObservedEvent { + type: "step_observed"; + /** Page URL after the step's tool/action execution. */ + url: string; + /** Accessibility tree snapshot, when captured. */ + ariaTree?: string; +} + +export interface AgentFinalObservation { + /** Page URL at the time of terminal capture. */ + url: string; + /** PNG bytes from page.screenshot(), when capture succeeds. */ + screenshot?: Buffer; + /** Accessibility tree snapshot, when captured. */ + ariaTree?: string; +} + +/** Final answer emitted by the agent, when available. */ +export interface AgentFinalAnswerEvent { + type: "final_answer"; + /** The agent's final summary message. */ + message: string; + /** Optional structured output if the agent's output schema was set. 
*/ + output?: Record; + /** + * Independent terminal browser observation captured after the agent finishes. + * + * This preserves the legacy verifier behavior of evaluating against a final + * page screenshot even when the last agent output is a final answer rather + * than a browser action. + */ + observation?: AgentFinalObservation; +} + +export type AgentEvidenceCallback = ( + event: AgentEvidenceEvent, +) => PromiseLike | void; diff --git a/packages/core/lib/v3/types/public/index.ts b/packages/core/lib/v3/types/public/index.ts index 9c5df08d01..4fe0fb8a48 100644 --- a/packages/core/lib/v3/types/public/index.ts +++ b/packages/core/lib/v3/types/public/index.ts @@ -1,4 +1,5 @@ export * from "./agent.js"; +export * from "./agentEvidenceEvents.js"; // Export api.ts under namespace to avoid conflicts with methods.ts types export * as Api from "./api.js"; // Also export BrowserbaseRegion directly for convenience diff --git a/packages/core/lib/v3/verifier/evidenceNormalization.ts b/packages/core/lib/v3/verifier/evidenceNormalization.ts new file mode 100644 index 0000000000..486ca68e63 --- /dev/null +++ b/packages/core/lib/v3/verifier/evidenceNormalization.ts @@ -0,0 +1,118 @@ +import type { AgentStepFinishedEvent } from "../types/public/agentEvidenceEvents.js"; +import type { AgentEvidence } from "./types.js"; + +export const REDACTED_INLINE_IMAGE = "[redacted inline image payload]"; + +const INLINE_IMAGE_KEYS = new Set(["screenshotBase64"]); + +function shouldRedactBase64Key(key: string, actionName?: string): boolean { + return ( + INLINE_IMAGE_KEYS.has(key) || + (actionName === "screenshot" && key === "base64") + ); +} + +export function collectInlineImagePayloads( + value: unknown, + actionName?: string, + out: string[] = [], +): string[] { + if (!value || typeof value !== "object") return out; + if (Buffer.isBuffer(value)) return out; + + if (Array.isArray(value)) { + for (const item of value) { + collectInlineImagePayloads(item, actionName, out); + } + return 
/**
 * Decode an inline base64 image payload, rejecting anything that is not
 * well-formed base64.
 *
 * Node's base64 decoder is lenient and does not throw on invalid input, so
 * the payload's shape is checked explicitly (standard or URL-safe alphabet,
 * at most two trailing `=`) and an empty decode is treated as malformed.
 *
 * @returns The decoded bytes, or `undefined` when the payload is malformed.
 */
function decodeInlineImagePayload(payload: string): Buffer | undefined {
  const compact = payload.replace(/\s+/g, "");
  if (!/^[A-Za-z0-9+/_-]+={0,2}$/.test(compact)) return undefined;
  const bytes = Buffer.from(compact, "base64");
  return bytes.length > 0 ? bytes : undefined;
}

/**
 * Translate a `step_finished` event into agent evidence modalities.
 *
 * Non-empty reasoning becomes a text modality. The tool result is then mapped
 * by its runtime type: `undefined`/`null` add nothing; strings, numbers,
 * booleans and bigints become text; a Buffer becomes a PNG image; any other
 * object contributes one image per well-formed inline image payload found in
 * it, followed by a JSON modality holding the redacted object.
 *
 * @param event The completed step to convert.
 * @returns Evidence whose `modalities` are ordered reasoning first, then
 *   result-derived entries.
 */
export function buildAgentEvidenceFromStepFinished(
  event: AgentStepFinishedEvent,
): AgentEvidence {
  const modalities: AgentEvidence["modalities"] = [];
  if (event.reasoning) {
    modalities.push({ type: "text", content: event.reasoning });
  }

  const result = event.toolOutput.result;
  if (result === undefined || result === null) {
    return { modalities };
  }

  if (typeof result === "string") {
    modalities.push({ type: "text", content: result });
  } else if (
    typeof result === "number" ||
    typeof result === "boolean" ||
    typeof result === "bigint"
  ) {
    modalities.push({ type: "text", content: String(result) });
  } else if (Buffer.isBuffer(result)) {
    modalities.push({
      type: "image",
      bytes: result,
      mediaType: "image/png",
    });
  } else if (typeof result === "object") {
    for (const imageBase64 of collectInlineImagePayloads(
      result,
      event.actionName,
    )) {
      const bytes = decodeInlineImagePayload(imageBase64);
      // Malformed base64: skip the image and keep the JSON modality.
      if (bytes === undefined) continue;
      modalities.push({
        type: "image",
        bytes,
        mediaType: "image/png",
      });
    }
    modalities.push({
      type: "json",
      content: redactInlineImagePayloads(result, event.actionName),
    });
  }

  return { modalities };
}
- * - Image modalities in `agentEvidence.modalities` carry `bytesBase64` on - * disk (human-readable JSON) instead of raw Buffer; we decode back. + * - Image modalities in `agentEvidence.modalities` carry `imagePath` on + * disk instead of raw Buffer; legacy `bytesBase64` fixtures are also + * accepted. * * @param dir absolute or cwd-relative path to a `//` directory. */ @@ -105,6 +113,7 @@ export async function loadTrajectoryFromDisk(dir: string): Promise { const trajectoryPath = path.join(trajectoryDir, "trajectory.json"); const raw = await fs.readFile(trajectoryPath, "utf8"); const parsed = JSON.parse(raw) as Trajectory & { + finalObservation?: PersistedProbeEvidence; steps: Array< TrajectoryStep & { agentEvidence: { @@ -113,20 +122,24 @@ export async function loadTrajectoryFromDisk(dir: string): Promise { | { type: "image"; mediaType: string; - // On-disk form (recorder writes base64); accept either to - // tolerate hand-edited fixtures. + // On-disk forms. Current writer externalizes bytes to + // imagePath; bytesBase64 is accepted for older fixtures. bytes?: unknown; bytesBase64?: string; + imagePath?: string; } | { type: "json"; content: unknown } >; }; - probeEvidence: ProbeEvidence; + probeEvidence: PersistedProbeEvidence; } >; }; - const resolveWithinTrajectoryDir = (candidate: string): string => { + const resolveWithinTrajectoryDir = ( + candidate: string, + fieldName = "screenshotPath", + ): string => { const resolved = path.resolve(trajectoryDir, candidate); const relative = path.relative(trajectoryDir, resolved); const outside = @@ -136,16 +149,16 @@ export async function loadTrajectoryFromDisk(dir: string): Promise { if (outside) { throw new Error( - `Trajectory screenshotPath escapes trajectory directory: ${candidate}`, + `Trajectory ${fieldName} escapes trajectory directory: ${candidate}`, ); } return resolved; }; - for (const step of parsed.steps) { - // Rehydrate tier-2 probe screenshot from its on-disk file reference. 
- const probe = step.probeEvidence; + const hydrateProbeScreenshot = async ( + probe: PersistedProbeEvidence | undefined, + ): Promise => { if (probe?.screenshotPath && !probe.screenshot) { const resolved = resolveWithinTrajectoryDir(probe.screenshotPath); try { @@ -155,25 +168,55 @@ export async function loadTrajectoryFromDisk(dir: string): Promise { // evidence_insufficient path will handle it. } } + }; + + for (const step of parsed.steps) { + // Rehydrate tier-2 probe screenshot from its on-disk file reference. + await hydrateProbeScreenshot(step.probeEvidence); - // Decode image modalities from base64 back to Buffer. + // Decode image modalities from disk references back to Buffer. if (step.agentEvidence?.modalities) { - step.agentEvidence.modalities = step.agentEvidence.modalities.map((m) => { - // The on-disk shape carries bytesBase64 instead of bytes, so we look - // through `unknown` here rather than rely on the typed union. - const raw = m as unknown as { bytesBase64?: string }; + const modalities: AgentEvidenceModality[] = []; + for (const m of step.agentEvidence.modalities) { + // The on-disk shape carries imagePath/bytesBase64 instead of bytes, + // so we look through `unknown` rather than rely on the typed union. + const raw = m as unknown as { + bytesBase64?: string; + imagePath?: string; + }; if (m.type === "image" && typeof raw.bytesBase64 === "string") { - return { + modalities.push({ type: "image" as const, bytes: Buffer.from(raw.bytesBase64, "base64"), mediaType: m.mediaType, - }; + }); + continue; } - return m as AgentEvidenceModality; - }); + if (m.type === "image" && typeof raw.imagePath === "string") { + const resolved = resolveWithinTrajectoryDir( + raw.imagePath, + "imagePath", + ); + try { + modalities.push({ + type: "image" as const, + bytes: await fs.readFile(resolved), + mediaType: m.mediaType, + }); + } catch { + // Missing agent image file: omit that image modality. 
/** Build the result file name for a (normalized) verifier label. */
export function nextResultFilename(label?: string): string {
  return `result_${normalizeResultLabel(label)}.json`;
}

/** Placeholder written to JSON wherever raw binary bytes would be inlined. */
const REDACTED_BINARY_PAYLOAD = "[redacted binary payload]";

/**
 * JSON.stringify replacer that keeps raw bytes out of trajectory.json.
 *
 * JSON.stringify calls `toJSON()` before the replacer, so a Buffer reaches
 * this function already expanded to `{ type: "Buffer", data: number[] }`.
 * That shape is swapped for a short placeholder; everything else passes
 * through untouched.
 */
function redactBinaryForJson(_key: string, value: unknown): unknown {
  if (value && typeof value === "object") {
    const candidate = value as { type?: unknown; data?: unknown };
    if (candidate.type === "Buffer" && Array.isArray(candidate.data)) {
      return REDACTED_BINARY_PAYLOAD;
    }
  }
  return value;
}

/**
 * Default persistence policy: explicit override, then env, then "on unless CI".
 *
 * - `override` wins whenever it is not undefined.
 * - `VERIFIER_PERSIST_TRAJECTORIES` = `1`/`true` or `0`/`false`
 *   (case-insensitive, surrounding whitespace ignored) decides next.
 * - Otherwise trajectories are persisted unless `CI` is set to something
 *   other than an empty string, `0` or `false`.
 */
export function shouldPersistTrajectory(
  override: boolean | undefined,
): boolean {
  if (override !== undefined) return override;

  const flag = process.env.VERIFIER_PERSIST_TRAJECTORIES?.trim().toLowerCase();
  if (flag === "1" || flag === "true") return true;
  if (flag === "0" || flag === "false") return false;

  // An explicit CI=false / CI=0 means "not CI"; a bare truthiness check on the
  // raw string would treat those values as running in CI.
  const ci = process.env.CI?.trim().toLowerCase();
  const runningInCi =
    ci !== undefined && ci !== "" && ci !== "0" && ci !== "false";
  return !runningInCi;
}

/**
 * Write the on-disk trajectory layout under `dir`:
 *
 *   <dir>/
 *   ├── task_data.json
 *   ├── trajectory.json   (screenshots referenced by path)
 *   ├── screenshots/
 *   │   ├── probe/<N>.png
 *   │   └── agent/<N>[_M].png
 *   ├── scores/           (empty; populated separately)
 *   └── core.log
 *
 * Image bytes are externalized to PNG files; the in-memory Trajectory is left
 * untouched so callers can keep using it after persistence. Inline base64
 * image payloads in JSON modalities and tool results are redacted, and any
 * Buffer that would otherwise be serialized into trajectory.json is replaced
 * by a placeholder string.
 *
 * @param dir        Target directory; created (recursively) when missing.
 * @param trajectory Trajectory to persist.
 */
export async function writeTrajectoryDir(
  dir: string,
  trajectory: Trajectory,
): Promise<void> {
  await fs.mkdir(dir, { recursive: true });
  await fs.mkdir(path.join(dir, "screenshots", "probe"), { recursive: true });
  await fs.mkdir(path.join(dir, "screenshots", "agent"), { recursive: true });

  const serializableSteps: unknown[] = [];
  // A single post-turn probe is fanned across every step of a multi-tool turn,
  // and a single agent screenshot is shared across every action a CUA provider
  // chose from it, so the same Buffer is shared by reference. Dedupe by
  // identity: write the PNG once and point every sharing step at the same file.
  const probePathByBuffer = new Map<Buffer, string>();
  const agentPathByBuffer = new Map<Buffer, string>();

  for (const [i, step] of trajectory.steps.entries()) {
    // Shallow copy so the caller's step keeps its screenshot bytes.
    const probe: ProbeEvidence = { ...step.probeEvidence };
    if (probe.screenshot) {
      let relPath = probePathByBuffer.get(probe.screenshot);
      if (!relPath) {
        relPath = `screenshots/probe/${i + 1}.png`;
        await fs.writeFile(path.join(dir, relPath), probe.screenshot);
        probePathByBuffer.set(probe.screenshot, relPath);
      }
      probe.screenshotPath = relPath;
      delete probe.screenshot;
    }

    const imageModalities = step.agentEvidence.modalities.filter(
      (m) => m.type === "image",
    );
    // Only steps with several images get the `_M` suffix.
    const multipleImages = imageModalities.length > 1;
    let imageSeq = 0;
    const modalities: unknown[] = [];
    for (const m of step.agentEvidence.modalities) {
      if (m.type !== "image") {
        modalities.push(
          m.type === "json"
            ? {
                ...m,
                content: redactInlineImagePayloads(m.content, step.actionName),
              }
            : m,
        );
        continue;
      }
      let relPath = agentPathByBuffer.get(m.bytes);
      if (!relPath) {
        const suffix = multipleImages ? `_${imageSeq}` : "";
        relPath = `screenshots/agent/${i + 1}${suffix}.png`;
        await fs.writeFile(path.join(dir, relPath), m.bytes);
        agentPathByBuffer.set(m.bytes, relPath);
      }
      modalities.push({
        type: "image",
        imagePath: relPath,
        mediaType: m.mediaType,
      });
      imageSeq += 1;
    }

    serializableSteps.push({
      ...step,
      probeEvidence: probe,
      agentEvidence: { modalities },
      toolOutput: {
        ...step.toolOutput,
        result: redactInlineImagePayloads(
          step.toolOutput.result,
          step.actionName,
        ),
      },
    });
  }

  const finalObservation: ProbeEvidence | undefined =
    trajectory.finalObservation === undefined
      ? undefined
      : { ...trajectory.finalObservation };
  if (finalObservation?.screenshot) {
    const relPath = "screenshots/probe/final.png";
    await fs.writeFile(path.join(dir, relPath), finalObservation.screenshot);
    finalObservation.screenshotPath = relPath;
    delete finalObservation.screenshot;
  }

  // Image modalities carry imagePath instead of raw bytes on disk; cast
  // through unknown rather than widen Trajectory's type contract.
  const serialized = {
    ...trajectory,
    steps: serializableSteps,
    ...(finalObservation ? { finalObservation } : {}),
  } as unknown;

  await fs.writeFile(
    path.join(dir, "trajectory.json"),
    // The replacer catches Buffers that are not image modalities or probe
    // screenshots (e.g. a Buffer-valued toolOutput.result).
    JSON.stringify(serialized, redactBinaryForJson, 2),
  );

  await fs.writeFile(
    path.join(dir, "task_data.json"),
    JSON.stringify(
      {
        task: trajectory.task,
        status: trajectory.status,
        finalAnswer: trajectory.finalAnswer ?? null,
      },
      null,
      2,
    ),
  );

  await fs.mkdir(path.join(dir, "scores"), { recursive: true });
  await fs.writeFile(path.join(dir, "core.log"), coreLog(trajectory));
}

/**
 * Render the action log: one JSON object per step, newline-terminated.
 * `step` is the zero-based position; empty reasoning is omitted.
 */
function coreLog(trajectory: Trajectory): string {
  return (
    trajectory.steps
      .map((step, i) =>
        JSON.stringify({
          step: i,
          action: step.actionName,
          url: step.probeEvidence.url ?? null,
          ok: step.toolOutput.ok,
          reasoning: step.reasoning || undefined,
        }),
      )
      .join("\n") + "\n"
  );
}
*/ @@ -111,7 +109,6 @@ export interface ToolOutput { /** One step in a trajectory: action + reasoning + evidence + outcome. */ export interface TrajectoryStep { - index: number; actionName: string; actionArgs: Record; /** From AgentAction.reasoning. May be empty for tools that don't surface reasoning. */ @@ -119,10 +116,6 @@ export interface TrajectoryStep { agentEvidence: AgentEvidence; probeEvidence: ProbeEvidence; toolOutput: ToolOutput; - /** ISO 8601 timestamp when the step's tool execution started. */ - startedAt: string; - /** ISO 8601 timestamp when the step's tool execution finished. */ - finishedAt: string; } /** Terminal status of the agent run. */ @@ -136,19 +129,19 @@ export type TrajectoryStatus = "complete" | "aborted" | "stalled" | "error"; * .trajectories/// * ├── task_data.json — TaskSpec + result metadata * ├── trajectory.json — this object, with screenshotPath instead of bytes - * ├── screenshot_1.png — probeEvidence.screenshot for step 1, etc. + * ├── screenshots/ — step probe/agent images plus final observation * ├── scores/ * │ └── result.json — Result from V3Evaluator.verify() - * ├── core.log — captured action log - * └── times.json — step timing + token usage + * └── core.log — captured action log */ export interface Trajectory { task: TaskSpec; steps: TrajectoryStep[]; finalAnswer?: string; + /** Terminal page observation captured after the agent finishes. */ + finalObservation?: ProbeEvidence; status: TrajectoryStatus; usage: TrajectoryUsage; - timing: { startedAt: string; endedAt: string }; } /** Score for a single rubric criterion after evidence analysis + rescoring. 
*/ diff --git a/packages/core/lib/v3Evaluator.ts b/packages/core/lib/v3Evaluator.ts index 5294f6c508..5e6e5ee92f 100644 --- a/packages/core/lib/v3Evaluator.ts +++ b/packages/core/lib/v3Evaluator.ts @@ -227,19 +227,25 @@ function collectLegacyScreenshots(trajectory: Trajectory): Buffer[] { } } + if (Buffer.isBuffer(trajectory.finalObservation?.screenshot)) { + screenshots.push(trajectory.finalObservation.screenshot); + } + return screenshots; } function renderLegacyAgentReasoning( trajectory: Trajectory, ): string | undefined { - const stepLines = (trajectory.steps ?? []).map((step) => { + const stepLines = (trajectory.steps ?? []).map((step, i) => { + const status = step.toolOutput?.ok === false ? "Tool status: failed" : ""; const output = step.toolOutput?.error ? `Tool error: ${step.toolOutput.error}` : `Tool output: ${stringifyForPrompt(step.toolOutput?.result)}`; return [ - `Step ${step.index}: ${step.actionName}`, + `Step ${i}: ${step.actionName}`, step.reasoning ? `Reasoning: ${step.reasoning}` : undefined, + status || undefined, output, ] .filter(Boolean) diff --git a/packages/core/tests/unit/agent-captcha-hooks.test.ts b/packages/core/tests/unit/agent-captcha-hooks.test.ts index b3d584c258..62b8d38246 100644 --- a/packages/core/tests/unit/agent-captcha-hooks.test.ts +++ b/packages/core/tests/unit/agent-captcha-hooks.test.ts @@ -60,6 +60,7 @@ class FakeCuaClient { public contextNotes: string[] = []; public preStepHook?: () => Promise; public actionHandler?: (action: Record) => Promise; + public screenshotProvider?: () => Promise; public executeImpl = vi.fn(async (options: unknown) => { void options; return { @@ -72,7 +73,9 @@ class FakeCuaClient { public captureScreenshot = vi.fn(async () => null); public setViewport = vi.fn(); public setCurrentUrl = vi.fn(); - public setScreenshotProvider = vi.fn(); + public setScreenshotProvider = vi.fn((provider: () => Promise) => { + this.screenshotProvider = provider; + }); public setSafetyConfirmationHandler = vi.fn(); 
setActionHandler( @@ -247,7 +250,6 @@ describe("agent captcha hooks", () => { context: { awaitActivePage: async () => page, }, - bus: { emit: vi.fn() }, isCaptchaAutoSolveEnabled: true, isAdvancedStealth: false, configuredViewport: { width: 1288, height: 711 }, @@ -316,7 +318,6 @@ describe("agent captcha hooks", () => { context: { awaitActivePage: async () => page, }, - bus: { emit: vi.fn() }, isCaptchaAutoSolveEnabled: true, isAdvancedStealth: false, configuredViewport: { width: 1288, height: 711 }, @@ -392,7 +393,6 @@ describe("agent captcha hooks", () => { context: { awaitActivePage: async () => page, }, - bus: { emit: vi.fn() }, isCaptchaAutoSolveEnabled: true, isAdvancedStealth: false, configuredViewport: { width: 1288, height: 711 }, @@ -474,7 +474,6 @@ describe("v3 cua handler screenshot behavior", () => { context: { awaitActivePage: async () => page, }, - bus: { emit: vi.fn() }, isCaptchaAutoSolveEnabled: false, isAdvancedStealth: false, configuredViewport: { width: 1288, height: 711 }, @@ -504,4 +503,113 @@ describe("v3 cua handler screenshot behavior", () => { // the CUA client takes a single screenshot after all actions itself. 
expect(screenshotSpy).not.toHaveBeenCalled(); }); + + it("still returns provider screenshots when screenshot evidence callbacks fail", async () => { + const screenshotBase64 = Buffer.from("fake-image").toString("base64"); + const onEvidence = vi.fn(async (event: { type: string }) => { + if (event.type === "screenshot") { + throw new Error("recorder failed"); + } + }); + + fakeCuaClient.executeImpl = vi.fn(async () => { + await expect(fakeCuaClient.screenshotProvider?.()).resolves.toBe( + screenshotBase64, + ); + return { + success: true, + message: "ok", + actions: [], + completed: true, + }; + }); + + const handler = new V3CuaAgentHandler( + { + context: { + awaitActivePage: async () => page, + }, + isCaptchaAutoSolveEnabled: false, + isAdvancedStealth: false, + configuredViewport: { width: 1288, height: 711 }, + isAgentReplayActive: () => false, + updateMetrics: vi.fn(), + } as never, + logger, + { + modelName: "openai/gpt-5.4", + clientOptions: { waitBetweenActions: 1 }, + } as never, + ); + + await handler.execute({ + instruction: "describe the page", + highlightCursor: false, + callbacks: { onEvidence }, + }); + + expect(onEvidence).toHaveBeenCalledWith( + expect.objectContaining({ type: "screenshot" }), + ); + expect( + logs.some((line) => + line.message.includes("onEvidence callback failed for screenshot"), + ), + ).toBe(true); + }); + + it("records a failed action as step_finished {ok:false} and rethrows the original error", async () => { + const events: Array<{ type: string; [k: string]: unknown }> = []; + const onEvidence = vi.fn(async (event: { type: string }) => { + events.push(event as { type: string }); + }); + + fakeCuaClient.executeImpl = vi.fn(async () => { + await fakeCuaClient.actionHandler?.({ + type: "click", + button: "left", + x: 5, + y: 9, + }); + return { success: true, message: "ok", actions: [], completed: true }; + }); + + const handler = new V3CuaAgentHandler( + { + context: { + awaitActivePage: async () => page, + }, + 
isCaptchaAutoSolveEnabled: false, + isAdvancedStealth: false, + configuredViewport: { width: 1288, height: 711 }, + isAgentReplayActive: () => false, + updateMetrics: vi.fn(), + } as never, + logger, + { + modelName: "openai/gpt-5.4", + clientOptions: { waitBetweenActions: 1 }, + } as never, + ); + vi.spyOn( + handler as unknown as { + executeAction: (action: Record) => Promise; + }, + "executeAction", + ).mockRejectedValue(new Error("click failed")); + + await expect( + handler.execute({ + instruction: "click the thing", + highlightCursor: false, + callbacks: { onEvidence }, + }), + ).rejects.toThrow("click failed"); + + const stepFinished = events.find((e) => e.type === "step_finished"); + expect(stepFinished).toMatchObject({ + actionName: "click", + toolOutput: { ok: false, error: "click failed" }, + }); + }); }); diff --git a/packages/core/tests/unit/public-api/export-surface.test.ts b/packages/core/tests/unit/public-api/export-surface.test.ts index e73cde4178..163fd60094 100644 --- a/packages/core/tests/unit/public-api/export-surface.test.ts +++ b/packages/core/tests/unit/public-api/export-surface.test.ts @@ -32,6 +32,8 @@ const publicApiShape = { V3: Stagehand.V3, V3Evaluator: Stagehand.V3Evaluator, V3FunctionName: Stagehand.V3FunctionName, + buildAgentEvidenceFromStepFinished: + Stagehand.buildAgentEvidenceFromStepFinished, connectToMCPServer: Stagehand.connectToMCPServer, default: StagehandDefaultExport, defaultExtractSchema: Stagehand.defaultExtractSchema, @@ -44,18 +46,22 @@ const publicApiShape = { jsonSchemaToZod: Stagehand.jsonSchemaToZod, loadApiKeyFromEnv: Stagehand.loadApiKeyFromEnv, loadTrajectoryFromDisk: Stagehand.loadTrajectoryFromDisk, + mergeAgentEvidence: Stagehand.mergeAgentEvidence, localBrowserLaunchOptionsSchema: Stagehand.localBrowserLaunchOptionsSchema, modelToAgentProviderMap: Stagehand.modelToAgentProviderMap, nextResultFilename: Stagehand.nextResultFilename, normalizeRubric: Stagehand.normalizeRubric, pageTextSchema: 
Stagehand.pageTextSchema, providerEnvVarMap: Stagehand.providerEnvVarMap, + redactInlineImagePayloads: Stagehand.redactInlineImagePayloads, + shouldPersistTrajectory: Stagehand.shouldPersistTrajectory, toGeminiSchema: Stagehand.toGeminiSchema, toJsonSchema: Stagehand.toJsonSchema, tool: Stagehand.tool, transformSchema: Stagehand.transformSchema, trimTrailingTextNode: Stagehand.trimTrailingTextNode, validateZodSchema: Stagehand.validateZodSchema, + writeTrajectoryDir: Stagehand.writeTrajectoryDir, ...publicErrorTypes, } as const; diff --git a/packages/core/tests/unit/tool-output-evidence.test.ts b/packages/core/tests/unit/tool-output-evidence.test.ts new file mode 100644 index 0000000000..fd7c2aabde --- /dev/null +++ b/packages/core/tests/unit/tool-output-evidence.test.ts @@ -0,0 +1,47 @@ +import { describe, expect, it } from "vitest"; + +import { inferToolOutput } from "../../lib/v3/agent/utils/toolOutputEvidence.js"; + +describe("inferToolOutput", () => { + it.each<[string, unknown, boolean, string | undefined]>([ + [ + "preserves raw results while normalizing top-level failure status", + { success: false }, + false, + undefined, + ], + [ + "normalizes one-level AI SDK output wrappers", + { + toolCallId: "call-1", + output: { success: false, error: { message: "not found" } }, + }, + false, + '{"message":"not found"}', + ], + [ + "handles isError and non-string errors", + { isError: true, error: new Error("bad input") }, + false, + "bad input", + ], + [ + "normalizes non-json error values", + { error: Symbol("bad input") }, + false, + "Symbol(bad input)", + ], + [ + "does not recursively treat page data as tool status", + { data: { success: false, error: "page field" } }, + true, + undefined, + ], + ])("%s", (_, result, ok, error) => { + expect(inferToolOutput(result)).toEqual({ + ok, + result, + error, + }); + }); +}); diff --git a/packages/core/tests/unit/v3-evaluator.test.ts b/packages/core/tests/unit/v3-evaluator.test.ts index 1e9e3a0f19..e6f61d54a8 100644 --- 
a/packages/core/tests/unit/v3-evaluator.test.ts +++ b/packages/core/tests/unit/v3-evaluator.test.ts @@ -82,6 +82,39 @@ describe("V3Evaluator verifier facade", () => { expect(result.perCriterion).toBeUndefined(); }); + it("passes final observation screenshots to the legacy verifier adapter", async () => { + const taskSpec: TaskSpec = { + id: "final-observation", + instruction: "Complete the task", + }; + const finalScreenshot = Buffer.from("final screenshot"); + const trajectory = { + ...makeTrajectory(taskSpec), + finalObservation: { + url: "https://example.com/done", + screenshot: finalScreenshot, + }, + }; + const ask = vi.fn().mockResolvedValue({ + evaluation: "YES", + reasoning: "The final screenshot shows completion.", + }); + const evaluator = new V3Evaluator({} as V3, { + backend: "legacy", + }); + Object.defineProperty(evaluator, "legacyEvaluator", { + value: { ask }, + }); + + await evaluator.verify(trajectory); + + expect(ask).toHaveBeenCalledWith( + expect.objectContaining({ + screenshot: [finalScreenshot], + }), + ); + }); + it("keeps legacy tool output detail until the overall reasoning budget is reached", async () => { const taskSpec: TaskSpec = { id: "reasoning-budget", @@ -164,10 +197,6 @@ function makeEmptyTrajectory(taskSpec: TaskSpec): Trajectory { input_tokens: 0, output_tokens: 0, }, - timing: { - startedAt: new Date(0).toISOString(), - endedAt: new Date(0).toISOString(), - }, }; } @@ -183,7 +212,6 @@ function makeTrajectory( ...makeEmptyTrajectory(taskSpec), steps: [ { - index: 0, actionName: "act", actionArgs: {}, reasoning: "I completed the task.", @@ -195,8 +223,6 @@ function makeTrajectory( ok: true, result: options.toolResult ?? 
"done", }, - startedAt: new Date(0).toISOString(), - finishedAt: new Date(0).toISOString(), }, ], finalAnswer: options.finalAnswer, diff --git a/packages/core/tests/unit/verifier-evidence-normalization.test.ts b/packages/core/tests/unit/verifier-evidence-normalization.test.ts new file mode 100644 index 0000000000..174b0a87f5 --- /dev/null +++ b/packages/core/tests/unit/verifier-evidence-normalization.test.ts @@ -0,0 +1,63 @@ +import { describe, expect, it } from "vitest"; + +import { + buildAgentEvidenceFromStepFinished, + REDACTED_INLINE_IMAGE, +} from "../../lib/v3/verifier/evidenceNormalization.js"; + +describe("buildAgentEvidenceFromStepFinished", () => { + it("captures primitive tool results as text evidence", () => { + const evidence = buildAgentEvidenceFromStepFinished({ + type: "step_finished", + actionName: "check", + actionArgs: {}, + reasoning: "", + toolOutput: { ok: true, result: false }, + }); + + expect(evidence.modalities).toEqual([{ type: "text", content: "false" }]); + }); + + it("lifts inline screenshot payloads into image evidence and redacts JSON", () => { + const inlineScreenshot = + Buffer.from("inline screenshot").toString("base64"); + + const evidence = buildAgentEvidenceFromStepFinished({ + type: "step_finished", + actionName: "click", + actionArgs: { describe: "Open fare details" }, + reasoning: "", + toolOutput: { + ok: true, + result: { + output: { + success: true, + describe: "Open fare details", + screenshotBase64: inlineScreenshot, + }, + }, + }, + }); + + const [imageModality, jsonModality] = evidence.modalities; + + expect(JSON.stringify(evidence)).not.toContain(inlineScreenshot); + expect(jsonModality).toMatchObject({ + type: "json", + content: { + output: { + screenshotBase64: REDACTED_INLINE_IMAGE, + }, + }, + }); + expect(imageModality).toMatchObject({ + type: "image", + mediaType: "image/png", + }); + if (imageModality?.type === "image") { + expect(imageModality.bytes).toEqual( + Buffer.from(inlineScreenshot, "base64"), + ); + 
} + }); +}); diff --git a/packages/core/tests/unit/verifier-trajectory.test.ts b/packages/core/tests/unit/verifier-trajectory.test.ts index 4b09e53a12..7cd38662df 100644 --- a/packages/core/tests/unit/verifier-trajectory.test.ts +++ b/packages/core/tests/unit/verifier-trajectory.test.ts @@ -1,4 +1,4 @@ -import { mkdtemp, writeFile } from "node:fs/promises"; +import { mkdir, mkdtemp, readFile, writeFile } from "node:fs/promises"; import { tmpdir } from "node:os"; import path from "node:path"; @@ -8,6 +8,7 @@ import { loadTrajectoryFromDisk, nextResultFilename, normalizeRubric, + writeTrajectoryDir, } from "../../lib/v3/verifier/trajectory.js"; describe("verifier trajectory utilities", () => { @@ -63,21 +64,23 @@ describe("verifier trajectory utilities", () => { it("loads trajectory screenshots and image modalities from disk", async () => { const dir = await mkdtemp(path.join(tmpdir(), "stagehand-verifier-")); const screenshot = Buffer.from("probe screenshot"); + const finalScreenshot = Buffer.from("final screenshot"); const agentImage = Buffer.from("agent image"); await writeFile(path.join(dir, "screenshot_1.png"), screenshot); + await writeFile(path.join(dir, "final.png"), finalScreenshot); + await mkdir(path.join(dir, "screenshots", "agent"), { recursive: true }); + await writeFile( + path.join(dir, "screenshots", "agent", "1.png"), + agentImage, + ); await writeFile( path.join(dir, "trajectory.json"), JSON.stringify({ task: { id: "task", instruction: "Do the task" }, status: "complete", usage: { input_tokens: 0, output_tokens: 0 }, - timing: { - startedAt: new Date(0).toISOString(), - endedAt: new Date(0).toISOString(), - }, steps: [ { - index: 0, actionName: "act", actionArgs: {}, reasoning: "", @@ -86,16 +89,18 @@ describe("verifier trajectory utilities", () => { { type: "image", mediaType: "image/png", - bytesBase64: agentImage.toString("base64"), + imagePath: "screenshots/agent/1.png", }, ], }, probeEvidence: { screenshotPath: "screenshot_1.png" }, 
toolOutput: { ok: true, result: null }, - startedAt: new Date(0).toISOString(), - finishedAt: new Date(0).toISOString(), }, ], + finalObservation: { + url: "https://example.com/done", + screenshotPath: "final.png", + }, }), ); @@ -103,35 +108,132 @@ describe("verifier trajectory utilities", () => { const modality = trajectory.steps[0].agentEvidence.modalities[0]; expect(trajectory.steps[0].probeEvidence.screenshot).toEqual(screenshot); + expect(trajectory.finalObservation?.screenshot).toEqual(finalScreenshot); expect(modality.type).toBe("image"); if (modality.type === "image") { expect(modality.bytes).toEqual(agentImage); } }); - it("rejects screenshot paths outside the trajectory directory", async () => { + it("loads legacy base64 image modalities from disk", async () => { const dir = await mkdtemp(path.join(tmpdir(), "stagehand-verifier-")); + const agentImage = Buffer.from("legacy agent image"); await writeFile( path.join(dir, "trajectory.json"), JSON.stringify({ task: { id: "task", instruction: "Do the task" }, status: "complete", usage: { input_tokens: 0, output_tokens: 0 }, - timing: { - startedAt: new Date(0).toISOString(), - endedAt: new Date(0).toISOString(), + steps: [ + { + actionName: "act", + actionArgs: {}, + reasoning: "", + agentEvidence: { + modalities: [ + { + type: "image", + mediaType: "image/png", + bytesBase64: agentImage.toString("base64"), + }, + ], + }, + probeEvidence: {}, + toolOutput: { ok: true, result: null }, + }, + ], + }), + ); + + const trajectory = await loadTrajectoryFromDisk(dir); + const modality = trajectory.steps[0].agentEvidence.modalities[0]; + + expect(modality.type).toBe("image"); + if (modality.type === "image") { + expect(modality.bytes).toEqual(agentImage); + } + }); + + it("redacts inline screenshot payloads when writing trajectories", async () => { + const dir = await mkdtemp(path.join(tmpdir(), "stagehand-verifier-")); + const inlineScreenshot = + Buffer.from("inline screenshot").toString("base64"); + + await 
writeTrajectoryDir(dir, { + task: { id: "task", instruction: "Do the task" }, + status: "complete", + usage: { input_tokens: 0, output_tokens: 0 }, + finalObservation: { + url: "https://example.com/done", + screenshot: Buffer.from("final screenshot"), + }, + steps: [ + { + actionName: "click", + actionArgs: {}, + reasoning: "", + agentEvidence: { + modalities: [ + { + type: "json", + content: { + output: { + success: true, + screenshotBase64: inlineScreenshot, + }, + }, + }, + ], + }, + probeEvidence: {}, + toolOutput: { + ok: true, + result: { + output: { + success: true, + screenshotBase64: inlineScreenshot, + }, + }, + }, }, + ], + }); + + const raw = await readFile(path.join(dir, "trajectory.json"), "utf8"); + const trajectory = JSON.parse(raw); + + expect(raw).not.toContain(inlineScreenshot); + expect( + trajectory.steps[0].agentEvidence.modalities[0].content.output + .screenshotBase64, + ).toBe("[redacted inline image payload]"); + expect(trajectory.steps[0].toolOutput.result.output.screenshotBase64).toBe( + "[redacted inline image payload]", + ); + expect(trajectory.finalObservation.screenshotPath).toBe( + "screenshots/probe/final.png", + ); + await expect( + readFile(path.join(dir, "screenshots", "probe", "final.png")), + ).resolves.toEqual(Buffer.from("final screenshot")); + }); + + it("rejects screenshot paths outside the trajectory directory", async () => { + const dir = await mkdtemp(path.join(tmpdir(), "stagehand-verifier-")); + await writeFile( + path.join(dir, "trajectory.json"), + JSON.stringify({ + task: { id: "task", instruction: "Do the task" }, + status: "complete", + usage: { input_tokens: 0, output_tokens: 0 }, steps: [ { - index: 0, actionName: "act", actionArgs: {}, reasoning: "", agentEvidence: { modalities: [] }, probeEvidence: { screenshotPath: "../../../etc/passwd" }, toolOutput: { ok: true, result: null }, - startedAt: new Date(0).toISOString(), - finishedAt: new Date(0).toISOString(), }, ], }), diff --git 
a/packages/evals/framework/trajectoryRecorder.ts b/packages/evals/framework/trajectoryRecorder.ts
new file mode 100644
index 0000000000..789bb72f4b
--- /dev/null
+++ b/packages/evals/framework/trajectoryRecorder.ts
@@ -0,0 +1,255 @@
+import fs from "node:fs/promises";
+import path from "node:path";
+import {
+  buildAgentEvidenceFromStepFinished,
+  mergeAgentEvidence,
+  redactInlineImagePayloads,
+  shouldPersistTrajectory,
+  writeTrajectoryDir,
+} from "@browserbasehq/stagehand";
+import type {
+  AgentEvidence,
+  AgentEvidenceEvent,
+  AgentFinalAnswerEvent,
+  AgentScreenshotEvidenceEvent,
+  AgentStepFinishedEvent,
+  AgentStepObservedEvent,
+  ProbeEvidence,
+  TaskSpec,
+  Trajectory,
+  TrajectoryStatus,
+  TrajectoryStep,
+  TrajectoryUsage,
+  EvaluationResult,
+} from "@browserbasehq/stagehand";
+
+export interface TrajectoryRecorderOptions {
+  taskSpec: TaskSpec;
+  /**
+   * Root directory under which trajectory dirs are written. Each task run
+   * gets a subdirectory named by runId/task.id.
+   * Defaults to `<cwd>/.trajectories`.
+   */
+  outputRoot?: string;
+  /** Run identifier (e.g., ISO timestamp + env). Defaults to a fresh timestamp. */
+  runId?: string;
+  /**
+   * Override the env-gated persistence default. `true` always persists,
+   * `false` never does, `undefined` defers to VERIFIER_PERSIST_TRAJECTORIES.
+   */
+  persist?: boolean;
+}
+
+export interface TrajectoryFinishOptions {
+  status: TrajectoryStatus;
+  finalAnswer?: string;
+  usage?: Partial<TrajectoryUsage>;
+}
+
+const ZERO_USAGE: TrajectoryUsage = {
+  input_tokens: 0,
+  output_tokens: 0,
+};
+
+/**
+ * Collects the evidence events emitted during an agent run and assembles
+ * them into a Trajectory, optionally writing it to disk.
+ *
+ * Feed events through record(), then call finish() to build the Trajectory,
+ * or cancel() to discard everything collected so far.
+ */
+export class TrajectoryRecorder {
+  private readonly taskSpec: TaskSpec;
+  private readonly runId: string;
+  private readonly outputDir: string;
+  private readonly persistEnabled: boolean;
+
+  // Steps are appended in arrival order on each step_finished event.
+  private readonly steps: TrajectoryStep[] = [];
+  // The most recent agent-role screenshot. It applies to every step_finished
+  // until a newer agent-role screenshot replaces it — a CUA provider can pick
+  // multiple actions from one screenshot, so each of those steps must carry
+  // that same tier-1 frame. (It is NOT cleared on consume; it is only replaced
+  // by a newer screenshot, or wiped on cancel().)
+  private latestAgentScreenshot?: Buffer;
+  // The most recent probe-role screenshot waits for the matching step_observed.
+  private pendingProbeScreenshot?: Buffer;
+  // Steps that haven't yet had a probe attached. The next step_observed fans
+  // out to all of them (one probe per agent turn, N tool calls per turn).
+  private stepsAwaitingProbe: number[] = [];
+  private finalAnswerEvent?: AgentFinalAnswerEvent;
+  private finalObservation?: ProbeEvidence;
+
+  /** Route a screenshot to the agent or probe slot based on its role. */
+  private onScreenshot(e: AgentScreenshotEvidenceEvent): void {
+    if (e.evidenceRole === "agent") {
+      this.latestAgentScreenshot = e.screenshot;
+    } else {
+      this.pendingProbeScreenshot = e.screenshot;
+    }
+  }
+
+  /** Append a step carrying the latest agent screenshot; probe comes later. */
+  private onStepFinished(e: AgentStepFinishedEvent): void {
+    const modalities: AgentEvidence["modalities"] = [];
+    if (this.latestAgentScreenshot) {
+      modalities.push({
+        type: "image",
+        bytes: this.latestAgentScreenshot,
+        mediaType: "image/png",
+      });
+    }
+    const merged = mergeAgentEvidence(
+      { modalities },
+      buildAgentEvidenceFromStepFinished(e),
+    );
+
+    // Intentionally not cleared here: the same agent screenshot applies to
+    // every step in a batched CUA turn. It's replaced when a newer agent
+    // screenshot arrives (onScreenshot) or wiped on cancel().
+    this.stepsAwaitingProbe.push(this.steps.length);
+    this.steps.push({
+      actionName: e.actionName,
+      actionArgs: e.actionArgs,
+      reasoning: e.reasoning,
+      agentEvidence: merged,
+      probeEvidence: {},
+      toolOutput: {
+        ...e.toolOutput,
+        result: redactInlineImagePayloads(e.toolOutput.result, e.actionName),
+      },
+    });
+  }
+
+  /** Attach one probe observation to every step still awaiting one. */
+  private onStepObserved(e: AgentStepObservedEvent): void {
+    const probe: ProbeEvidence = { url: e.url };
+    if (this.pendingProbeScreenshot)
+      probe.screenshot = this.pendingProbeScreenshot;
+    if (e.ariaTree !== undefined) probe.ariaTree = e.ariaTree;
+    for (const idx of this.stepsAwaitingProbe) {
+      this.steps[idx].probeEvidence = probe;
+    }
+    this.stepsAwaitingProbe = [];
+    this.pendingProbeScreenshot = undefined;
+  }
+
+  /** Remember the final answer and, when present, its observation. */
+  private onFinalAnswer(e: AgentFinalAnswerEvent): void {
+    this.finalAnswerEvent = e;
+    if (e.observation) {
+      this.finalObservation = {
+        url: e.observation.url,
+        ...(e.observation.screenshot
+          ? { screenshot: e.observation.screenshot }
+          : {}),
+        ...(e.observation.ariaTree !== undefined
+          ? { ariaTree: e.observation.ariaTree }
+          : {}),
+      };
+    }
+  }
+
+  /** Resolve the run id, output directory, and persistence setting. */
+  constructor(opts: TrajectoryRecorderOptions) {
+    this.taskSpec = opts.taskSpec;
+    this.runId = opts.runId ?? new Date().toISOString().replace(/[:.]/g, "-");
+    const root = opts.outputRoot ?? path.join(process.cwd(), ".trajectories");
+    this.outputDir = path.join(root, this.runId, opts.taskSpec.id);
+    this.persistEnabled = shouldPersistTrajectory(opts.persist);
+  }
+
+  /** Ingest an evidence callback event from agent.execute(). */
+  record(event: AgentEvidenceEvent): void {
+    switch (event.type) {
+      case "screenshot":
+        this.onScreenshot(event);
+        break;
+      case "step_finished":
+        this.onStepFinished(event);
+        break;
+      case "step_observed":
+        this.onStepObserved(event);
+        break;
+      case "final_answer":
+        this.onFinalAnswer(event);
+        break;
+    }
+  }
+
+  /**
+   * Assemble the Trajectory from the recorded events and (if persistence is
+   * on) write the on-disk layout. Recorder state is left untouched.
+   */
+  async finish(opts: TrajectoryFinishOptions): Promise<Trajectory> {
+    const trajectory: Trajectory = {
+      task: this.taskSpec,
+      steps: this.steps,
+      finalAnswer: opts.finalAnswer ?? this.finalAnswerEvent?.message,
+      ...(this.finalObservation
+        ? { finalObservation: this.finalObservation }
+        : {}),
+      status: opts.status,
+      usage: { ...ZERO_USAGE, ...(opts.usage ?? {}) },
+    };
+
+    if (this.persistEnabled) {
+      await writeTrajectoryDir(this.outputDir, trajectory);
+    }
+
+    return trajectory;
+  }
+
+  /** Throw away in-memory state without writing to disk. Used on early abort. */
+  cancel(): void {
+    this.steps.length = 0;
+    this.latestAgentScreenshot = undefined;
+    this.pendingProbeScreenshot = undefined;
+    this.stepsAwaitingProbe = [];
+    this.finalAnswerEvent = undefined;
+    this.finalObservation = undefined;
+  }
+
+  /** Where the trajectory dir lives (whether or not it was persisted). */
+  get directory(): string {
+    return this.outputDir;
+  }
+
+  /** Whether persistence is enabled, i.e. whether finish() writes to disk. */
+  get persisted(): boolean {
+    return this.persistEnabled;
+  }
+
+  /**
+   * Persist evaluator result next to the trajectory. No-op when trajectory
+   * persistence is disabled.
+   */
+  async persistResult(
+    result: EvaluationResult,
+    filename = "result.json",
+  ): Promise<void> {
+    if (!this.persistEnabled) return;
+
+    const scoresDir = path.join(this.outputDir, "scores");
+    await fs.mkdir(scoresDir, { recursive: true });
+    await fs.writeFile(
+      path.join(scoresDir, filename),
+      JSON.stringify(result, null, 2),
+    );
+
+    const taskDataPath = path.join(this.outputDir, "task_data.json");
+    let taskData: Record<string, unknown>;
+    try {
+      taskData = JSON.parse(await fs.readFile(taskDataPath, "utf8")) as Record<
+        string,
+        unknown
+      >;
+    } catch {
+      taskData = { task: this.taskSpec };
+    }
+    await fs.writeFile(
+      taskDataPath,
+      JSON.stringify({ ...taskData, result }, null, 2),
+    );
+  }
+}
diff --git a/packages/evals/tests/framework/trajectoryRecorder.test.ts b/packages/evals/tests/framework/trajectoryRecorder.test.ts
new file mode 100644
index 0000000000..320c4a5259
--- /dev/null
+++ b/packages/evals/tests/framework/trajectoryRecorder.test.ts
@@ -0,0 +1,247 @@
+import fs from "node:fs/promises";
+import os from "node:os";
+import path from "node:path";
+
+import { afterEach, describe, expect, it } from "vitest";
+import type { TaskSpec } from "@browserbasehq/stagehand";
+
+import { TrajectoryRecorder } from "../../framework/trajectoryRecorder.js";
+
+const tempDirs: string[] = [];
+
+afterEach(async () => {
+  while (tempDirs.length > 0) {
+    const dir = tempDirs.pop();
+    if (dir) await fs.rm(dir, { recursive: true, force: true });
+  }
+});
+
+function makeTempDir(): Promise<string> {
+  return fs
+    .mkdtemp(path.join(os.tmpdir(), "trajectory-recorder-"))
+    .then((dir) => {
+      tempDirs.push(dir);
+      return dir;
+    });
+}
+
+function makeTaskSpec(): TaskSpec {
+  return {
+    id: "recorder-task",
+    instruction: "Compare economy and business fares.",
+    initUrl: "https://example.com",
+  };
+}
+
+function recordSimpleStep(recorder: TrajectoryRecorder, screenshot: Buffer) {
+  recorder.record({
+    type: "screenshot",
+    screenshot,
+    url: "https://example.com/search",
+    evidenceRole: "agent",
+  });
+  recorder.record({
+    type: "step_finished",
+    actionName: "act",
+    actionArgs: { instruction: "Search fares" },
+    reasoning: "Search for fares.",
+    toolOutput: { ok: true, result: "done" },
+  });
+  recorder.record({
+    type: "screenshot",
+    screenshot,
+    url: "https://example.com/search",
+    evidenceRole: "probe",
+  });
+  recorder.record({
+    type: "step_observed",
+    url: "https://example.com/search",
+  });
+}
+
+function recordFinalAnswer(
+  recorder: TrajectoryRecorder,
+  opts: { message: string; screenshot: Buffer; ariaTree?: string },
+): void {
+  recorder.record({
+    type: "final_answer",
+    message: opts.message,
+    observation: {
+      url: "https://example.com/complete",
+      screenshot: opts.screenshot,
+      ...(opts.ariaTree !== undefined ? { ariaTree: opts.ariaTree } : {}),
+    },
+  });
+}
+
+describe("TrajectoryRecorder", () => {
+  it("assembles ordered callback events into trajectory steps", async () => {
+    const recorder = new TrajectoryRecorder({
+      taskSpec: makeTaskSpec(),
+      persist: false,
+    });
+    const screenshot = Buffer.from("screen-1");
+    const staleScreenshot = Buffer.from("stale-screen");
+    const probeScreenshot = Buffer.from("probe-screen");
+
+    recorder.record({
+      type: "screenshot",
+      screenshot: staleScreenshot,
+      url: "https://example.com/stale",
+      evidenceRole: "agent",
+    });
+    recorder.record({
+      type: "screenshot",
+      screenshot,
+      url: "https://example.com/search",
+      evidenceRole: "agent",
+    });
+    recorder.record({
+      type: "step_finished",
+      actionName: "click",
+      actionArgs: { describe: "Open fares" },
+      reasoning: "Open fare details.",
+      toolOutput: { ok: true, result: "opened" },
+    });
+    recorder.record({
+      type: "step_finished",
+      actionName: "extract",
+      actionArgs: { instruction: "Read fares" },
+      reasoning: "Read visible fare cells.",
+      toolOutput: {
+        ok: true,
+        result: { economy: "$100", business: "$250" },
+      },
+    });
+    recorder.record({
+      type: "screenshot",
+      screenshot: probeScreenshot,
+      url: "https://example.com/search",
+      evidenceRole: "probe",
+    });
+    recorder.record({
+      type: "step_observed",
+      url: "https://example.com/search",
+      ariaTree: "RootWebArea\nStaticText: Economy $100",
+    });
+    recordFinalAnswer(recorder, {
+      message: "Business is $150 more than economy.",
+      screenshot: Buffer.from("final-screen"),
+      ariaTree: "RootWebArea\nStaticText: Complete",
+    });
+
+    const trajectory = await recorder.finish({
+      status: "complete",
+      usage: { input_tokens: 10, output_tokens: 5 },
+    });
+
+    expect(trajectory.steps).toHaveLength(2);
+    expect(trajectory.steps[0]).toMatchObject({
+      actionName: "click",
+      probeEvidence: {
+        url: "https://example.com/search",
+        ariaTree: "RootWebArea\nStaticText: Economy $100",
+      },
+    });
+    expect(trajectory.steps[1]).toMatchObject({
+      actionName: "extract",
+      toolOutput: { ok: true, result: { economy: "$100", business: "$250" } },
+    });
+    expect(trajectory.steps[0].probeEvidence.screenshot).toEqual(
+      probeScreenshot,
+    );
+    expect(trajectory.steps[1].probeEvidence.screenshot).toEqual(
+      probeScreenshot,
+    );
+    expect(trajectory.steps[0].agentEvidence.modalities).toEqual(
+      expect.arrayContaining([
+        { type: "image", bytes: screenshot, mediaType: "image/png" },
+        { type: "text", content: "Open fare details." },
+      ]),
+    );
+    // Both actions were chosen from the same agent screenshot (one screenshot,
+    // two step_finished), so the second step must carry that frame too.
+    expect(trajectory.steps[1].agentEvidence.modalities).toEqual(
+      expect.arrayContaining([
+        { type: "image", bytes: screenshot, mediaType: "image/png" },
+      ]),
+    );
+    expect(trajectory.finalAnswer).toBe("Business is $150 more than economy.");
+    expect(trajectory.finalObservation).toMatchObject({
+      url: "https://example.com/complete",
+      ariaTree: "RootWebArea\nStaticText: Complete",
+    });
+    expect(trajectory.finalObservation?.screenshot).toEqual(
+      Buffer.from("final-screen"),
+    );
+  });
+
+  it("persists trajectory files and evaluator results", async () => {
+    const outputRoot = await makeTempDir();
+    const recorder = new TrajectoryRecorder({
+      taskSpec: makeTaskSpec(),
+      outputRoot,
+      runId: "run-1",
+      persist: true,
+    });
+    const screenshot = Buffer.from("screen-1");
+
+    recordSimpleStep(recorder, screenshot);
+    recordFinalAnswer(recorder, {
+      message: "Complete.",
+      screenshot: Buffer.from("final-screen"),
+    });
+
+    await recorder.finish({ status: "complete" });
+    await recorder.persistResult({
+      outcomeSuccess: true,
+      explanation: "The task was completed.",
+    });
+
+    const taskDir = path.join(outputRoot, "run-1", "recorder-task");
+    await expect(fs.readdir(taskDir)).resolves.toEqual(
+      expect.arrayContaining([
+        "core.log",
+        "scores",
+        "screenshots",
+        "task_data.json",
+        "trajectory.json",
+      ]),
+    );
+    await expect(
+      fs.readFile(path.join(taskDir, "screenshots", "probe", "1.png")),
+    ).resolves.toEqual(screenshot);
+    await expect(
+      fs.readFile(path.join(taskDir, "screenshots", "probe", "final.png")),
+    ).resolves.toEqual(Buffer.from("final-screen"));
+    await expect(
+      fs.readFile(path.join(taskDir, "screenshots", "agent", "1.png")),
+    ).resolves.toEqual(screenshot);
+    await expect(
+      fs.readFile(path.join(taskDir, "scores", "result.json"), "utf8"),
+    ).resolves.toContain('"outcomeSuccess": true');
+
+    const trajectory = JSON.parse(
+      await fs.readFile(path.join(taskDir, "trajectory.json"), "utf8"),
+    );
+    expect(trajectory.steps[0].probeEvidence.screenshotPath).toBe(
+      "screenshots/probe/1.png",
+    );
+    expect(trajectory.finalObservation.screenshotPath).toBe(
+      "screenshots/probe/final.png",
+    );
+    expect(trajectory.steps[0].agentEvidence.modalities).toContainEqual({
+      type: "image",
+      imagePath: "screenshots/agent/1.png",
+      mediaType: "image/png",
+    });
+
+    const taskData = JSON.parse(
+      await fs.readFile(path.join(taskDir, "task_data.json"), "utf8"),
+    );
+    expect(taskData.result).toMatchObject({
+      outcomeSuccess: true,
+      explanation: "The task was completed.",
+    });
+  });
+});