From 6f27af2528dd4f229d7625aed790ef584b0171aa Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Fri, 15 May 2026 13:44:16 -0700
Subject: [PATCH 01/27] feat(verifier): record agent trajectories

---
 .../core/lib/v3/agent/AnthropicCUAClient.ts   |   4 +
 .../v3/agent/utils/captureAriaTreeProbe.ts    |  75 +++
 .../core/lib/v3/handlers/v3AgentHandler.ts    | 109 ++++
 .../core/lib/v3/handlers/v3CuaAgentHandler.ts | 160 +++++-
 .../core/lib/v3/types/public/busEvents.ts     | 108 ++++
 packages/core/lib/v3/types/public/index.ts    |   1 +
 .../evals/framework/trajectoryRecorder.ts     | 507 ++++++++++++++++++
 .../scripts/verify-trajectory-recorder.ts     | 230 ++++++++
 8 files changed, 1192 insertions(+), 2 deletions(-)
 create mode 100644 packages/core/lib/v3/agent/utils/captureAriaTreeProbe.ts
 create mode 100644 packages/core/lib/v3/types/public/busEvents.ts
 create mode 100644 packages/evals/framework/trajectoryRecorder.ts
 create mode 100644 packages/evals/scripts/verify-trajectory-recorder.ts
diff --git a/packages/core/lib/v3/agent/AnthropicCUAClient.ts b/packages/core/lib/v3/agent/AnthropicCUAClient.ts
index 752d208e22..54d64f15d0 100644
--- a/packages/core/lib/v3/agent/AnthropicCUAClient.ts
+++ b/packages/core/lib/v3/agent/AnthropicCUAClient.ts
@@ -902,6 +902,10 @@ export class AnthropicCUAClient extends AgentClient {
             ...input,
           };
         } else if (action === "triple_click" || action === "tripleClick") {
+          // Anthropic's computer_20250124 tool emits `triple_click` with
+          // `coordinate: [x, y]`. Without this branch the snake_case name +
+          // raw coordinate array fall through to the generic `else` and
+          // executeAction logs "Unknown action type: triple_click".
           return {
             type: "tripleClick",
             x:
diff --git a/packages/core/lib/v3/agent/utils/captureAriaTreeProbe.ts b/packages/core/lib/v3/agent/utils/captureAriaTreeProbe.ts
new file mode 100644
index 0000000000..8e3fcc050b
--- /dev/null
+++ b/packages/core/lib/v3/agent/utils/captureAriaTreeProbe.ts
@@ -0,0 +1,75 @@
+/**
+ * captureAriaTreeProbe — capture a truncated accessibility tree of the active
+ * page for use as tier-2 evidence in the trajectory recorder.
+ *
+ * Shared by v3AgentHandler and v3CuaAgentHandler. Listener-gated by the
+ * callers so ordinary agent runs (no TrajectoryRecorder attached) don't pay
+ * the cost.
+ *
+ * The a11y tree is the same payload the agent's `ariaTree` tool sees, but
+ * captured by the harness (not the agent) so the verifier has independent
+ * textual ground truth for grounding non-visual claims — prices, names,
+ * dates, list contents — without OCR'ing screenshots.
+ *
+ * Budget: defaults to ~8000 tokens (32k chars). Per-step a11y captures
+ * across a ~30-step trajectory at that cap sum to ~240k tokens total,
+ * which the verifier handles via per-criterion top-K selection. The cap
+ * is configurable via VERIFIER_ARIATREE_TOKEN_BUDGET so consumers can
+ * trade RAM/disk for fidelity. Truncated content is marked explicitly so
+ * the verifier knows it was clipped.
+ */
+import type { V3 } from "../../v3.js";
+
+const APPROX_CHARS_PER_TOKEN = 4;
+const DEFAULT_TOKEN_BUDGET = 8_000;
+const DEFAULT_TIMEOUT_MS = 5_000;
+
+interface CaptureAriaTreeOptions {
+  /** Soft cap on token count (chars/4 approximation). Default 8000. */
+  tokenBudget?: number;
+  /** Hard timeout on the capture. Default 5s. */
+  timeoutMs?: number;
+}
+
+/**
+ * Returns the truncated a11y tree as a plain string, or undefined when
+ * capture fails. Never throws — a11y capture is best-effort tier-2 evidence,
+ * not a hard requirement, so failures are silently absorbed (the verifier
+ * surfaces this via evidence_insufficient).
+ */
+export async function captureAriaTreeProbe(
+  v3: V3,
+  opts: CaptureAriaTreeOptions = {},
+): Promise<string | undefined> {
+  // Budget precedence: explicit option, then env override, then the default.
+  // Non-finite or non-positive candidates are skipped: a negative budget would
+  // make slice() count from the end, and NaN would clip the tree to "".
+  const usable = (n: number | undefined): n is number =>
+    typeof n === "number" && Number.isFinite(n) && n > 0;
+  const envRaw = process.env.VERIFIER_ARIATREE_TOKEN_BUDGET ?? "";
+  const envBudget = parseInt(envRaw, 10);
+  const tokenBudget =
+    [opts.tokenBudget, envBudget].find(usable) ?? DEFAULT_TOKEN_BUDGET;
+  const timeoutMs = opts.timeoutMs ?? DEFAULT_TIMEOUT_MS;
+  const maxChars = tokenBudget * APPROX_CHARS_PER_TOKEN;
+
+  try {
+    // v3.extract() without a schema returns { pageText } where pageText is the
+    // rendered accessibility tree — same path the agent's ariaTree tool uses.
+    const result = (await v3.extract({ timeout: timeoutMs })) as {
+      pageText?: string;
+    };
+    const pageText = result?.pageText;
+    if (typeof pageText !== "string" || pageText.length === 0) return undefined;
+
+    if (pageText.length > maxChars) {
+      return (
+        pageText.slice(0, maxChars) +
+        `\n\n[CONTENT TRUNCATED at ~${tokenBudget} tokens — set VERIFIER_ARIATREE_TOKEN_BUDGET to raise]`
+      );
+    }
+    return pageText;
+  } catch {
+    return undefined;
+  }
+}
diff --git a/packages/core/lib/v3/handlers/v3AgentHandler.ts b/packages/core/lib/v3/handlers/v3AgentHandler.ts
index cff08c8a28..d0308bdd8a 100644
--- a/packages/core/lib/v3/handlers/v3AgentHandler.ts
+++ b/packages/core/lib/v3/handlers/v3AgentHandler.ts
@@ -41,6 +41,7 @@ import {
   AgentAbortError,
 } from "../types/public/sdkErrors.js";
 import { handleDoneToolCall } from "../agent/utils/handleDoneToolCall.js";
+import { captureAriaTreeProbe } from "../agent/utils/captureAriaTreeProbe.js";
 import {
   CaptchaSolver,
   CAPTCHA_SOLVED_MSG,
@@ -248,6 +249,10 @@ export class V3AgentHandler {
       | GenerateTextOnStepFinishCallback<ToolSet>
       | StreamTextOnStepFinishCallback<ToolSet>,
   ) {
+    // Monotonic step counter scoped to this execute() call. Each tool call in
+    // the agent loop becomes one trajectory step. The counter feeds stepIndex
+    // on the bus events the TrajectoryRecorder subscribes to.
+    let stepCounter = 0;
     return async (event: StepResult<ToolSet>) => {
       this.logger({
         category: "agent",
@@ -255,6 +260,11 @@ export class V3AgentHandler {
         level: 2,
       });
 
+      const stepIndicesInTurn: number[] = [];
+      let lastFinalAnswer:
+        | { message: string; output?: Record<string, unknown> }
+        | undefined;
+
       if (event.toolCalls && event.toolCalls.length > 0) {
         for (let i = 0; i < event.toolCalls.length; i++) {
           const toolCall = event.toolCalls[i];
@@ -279,6 +289,13 @@ export class V3AgentHandler {
                 ? `${allReasoning} ${doneReasoning}`.trim()
                 : allReasoning || "Task completed successfully";
             }
+            lastFinalAnswer = {
+              message: state.finalMessage,
+              output:
+                typeof args?.output === "object" && args?.output !== null
+                  ? (args.output as Record<string, unknown>)
+                  : undefined,
+            };
           }
           const mappedActions = mapToolResultToActions({
             toolCallName: toolCall.toolName,
@@ -292,8 +309,100 @@ export class V3AgentHandler {
             action.timestamp = Date.now();
             state.actions.push(action);
           }
+
+          // Emit step_finished_event per tool call. The TrajectoryRecorder
+          // builds one Trajectory.Step per emission. tier-1 evidence (the
+          // bytes the LLM consumed) is captured separately via an
+          // onStepFinish wrapper in the harness (plan §10 Q1).
+          const stepIndex = stepCounter++;
+          stepIndicesInTurn.push(stepIndex);
+          const toolOk =
+            !toolResult ||
+            (typeof toolResult === "object" &&
+              !("error" in toolResult) &&
+              !("isError" in toolResult && toolResult.isError));
+          this.v3.bus.emit("agent_step_finished_event", {
+            stepIndex,
+            actionName: toolCall.toolName,
+            actionArgs:
+              typeof args === "object" && args !== null
+                ? (args as Record<string, unknown>)
+                : {},
+            reasoning: event.text ?? "",
+            toolOutput: {
+              ok: toolOk,
+              result: toolResult,
+              error:
+                toolResult &&
+                typeof toolResult === "object" &&
+                "error" in toolResult &&
+                typeof (toolResult as { error?: unknown }).error === "string"
+                  ? (toolResult as { error: string }).error
+                  : undefined,
+            },
+            finishedAt: new Date().toISOString(),
+          });
         }
         state.currentPageUrl = (await this.v3.context.awaitActivePage()).url();
+
+        // Harness probe — take a single screenshot / a11y snapshot per AI SDK
+        // step and attach it to every tool call in that turn. The observation
+        // reflects the settled page state after the batch of tool calls; this
+        // is more faithful than dropping probe evidence for all but the last
+        // tool call, while still avoiding per-tool screenshot overhead.
+        const wantsScreenshotProbe =
+          this.v3.bus.listenerCount("agent_screenshot_taken_event") > 0;
+        const wantsStepObservation =
+          this.v3.bus.listenerCount("agent_step_observed_event") > 0;
+        if (
+          stepIndicesInTurn.length > 0 &&
+          (wantsScreenshotProbe || wantsStepObservation)
+        ) {
+          try {
+            const page = await this.v3.context.awaitActivePage();
+            let screenshot: Buffer | undefined;
+            if (wantsScreenshotProbe) {
+              screenshot = await page.screenshot({ fullPage: false });
+            }
+            let ariaTree: string | undefined;
+            if (wantsStepObservation) {
+              // Capture the a11y tree alongside the URL probe so the verifier
+              // can ground textual claims (prices, names, dates) without OCR.
+              // Best-effort: returns undefined on failure/timeout.
+              ariaTree = await captureAriaTreeProbe(this.v3);
+            }
+            for (const stepIndex of stepIndicesInTurn) {
+              if (screenshot) {
+                // DOM/hybrid: this post-step screenshot is a harness probe
+                // only. The agent's tier-1 evidence is the tool's return value
+                // captured separately in agent_step_finished_event.
+                this.v3.bus.emit("agent_screenshot_taken_event", {
+                  stepIndex,
+                  screenshot,
+                  url: state.currentPageUrl,
+                  evidenceRole: "probe",
+                });
+              }
+              if (wantsStepObservation) {
+                this.v3.bus.emit("agent_step_observed_event", {
+                  stepIndex,
+                  url: state.currentPageUrl,
+                  ariaTree,
+                });
+              }
+            }
+          } catch (e) {
+            this.logger({
+              category: "agent",
+              message: `Warning: harness probe failed: ${getErrorMessage(e)}`,
+              level: 1,
+            });
+          }
+        }
+      }
+
+      if (lastFinalAnswer) {
+        this.v3.bus.emit("agent_final_answer_event", lastFinalAnswer);
       }
 
       if (userCallback) {
diff --git a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts
index af3a3dad87..f1dd2666e6 100644
--- a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts
+++ b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts
@@ -7,6 +7,7 @@ import { GoogleCUAClient } from "../agent/GoogleCUAClient.js";
 import { OpenAICUAClient } from "../agent/OpenAICUAClient.js";
 import { mapKeyToPlaywright } from "../agent/utils/cuaKeyMapping.js";
 import { ensureXPath } from "../agent/utils/xpath.js";
+import { captureAriaTreeProbe } from "../agent/utils/captureAriaTreeProbe.js";
 import {
   ActionExecutionResult,
   AgentAction,
@@ -16,6 +17,7 @@ import {
   SafetyConfirmationHandler,
 } from "../types/public/agent.js";
 import { LogLine } from "../types/public/logs.js";
+import type { AgentScreenshotTakenEvent } from "../types/public/busEvents.js";
 import { type Action, V3FunctionName } from "../types/public/methods.js";
 import { FlowLogger } from "../flowlogger/FlowLogger.js";
 import { toTitleCase } from "../../utils.js";
@@ -37,6 +39,13 @@ export class V3CuaAgentHandler {
   private captchaSolver: CaptchaSolver | null = null;
   private captchaClickGuardRemaining = 0;
   private currentInstruction = "";
+  // Monotonic step counter used by bus events. Each provider-bound screenshot
+  // claims the next index (emitCuaScreenshot); the action executed after it
+  // reuses that index in emitCuaActionStep, which allocates a fresh one only
+  // when the latest screenshot was already consumed or none exists yet.
+  private cuaStepCounter = 0;
+  private latestCuaScreenshot?: AgentScreenshotTakenEvent;
+  private latestCuaScreenshotConsumed = true;
 
   constructor(
     v3: V3,
@@ -76,6 +85,17 @@ export class V3CuaAgentHandler {
       this.ensureNotClosed();
       const page = await this.v3.context.awaitActivePage();
       const screenshotBuffer = await page.screenshot({ fullPage: false });
+
+      // Emit bus event so TrajectoryRecorder can capture the screenshot. In
+      // CUA mode this is the same buffer the provider receives, so it is
+      // tagged as tier-1 "agent" evidence; the tier-2 probe is taken after the
+      // action in emitCuaActionStep. See plan §04 "Mode-by-mode sources".
+      try {
+        this.emitCuaScreenshot(screenshotBuffer, page.url());
+      } catch {
+        // bus emit errors are non-fatal
+      }
+
       return screenshotBuffer.toString("base64"); // base64 png
     });
 
@@ -120,6 +140,7 @@ export class V3CuaAgentHandler {
         (this.options.clientOptions?.waitBetweenActions as number) ||
         defaultDelay;
       try {
+        let executionResult: ActionExecutionResult | undefined;
         // Try to inject cursor before each action if enabled
         if (this.highlightCursor) {
           try {
@@ -133,7 +154,7 @@ export class V3CuaAgentHandler {
         // takes its own screenshot via screenshotProvider between API turns.
         const shouldLog = action.type !== "screenshot";
         if (shouldLog) {
-          await FlowLogger.runWithLogging(
+          executionResult = await FlowLogger.runWithLogging(
             {
               eventType: `V3Cua${toTitleCase(action.type)}`, // e.g. "V3CuaClick"
               data: {
@@ -145,10 +166,13 @@ export class V3CuaAgentHandler {
             [action],
           );
         } else {
-          await this.executeAction(action);
+          executionResult = await this.executeAction(action);
         }
 
         action.timestamp = Date.now();
+        if (shouldLog) {
+          await this.emitCuaActionStep(action, executionResult);
+        }
 
         await new Promise((r) => setTimeout(r, waitBetween));
       } catch (error) {
@@ -658,6 +682,15 @@ export class V3CuaAgentHandler {
       const screenshotBuffer = await page.screenshot({ fullPage: false });
 
       const currentUrl = page.url();
+
+      // Mirror the screenshot to the bus — same buffer the CUA client
+      // receives, so it is tagged as tier-1 "agent" evidence (not a probe).
+      try {
+        this.emitCuaScreenshot(screenshotBuffer, currentUrl);
+      } catch {
+        // non-fatal
+      }
+
       return await this.agentClient.captureScreenshot({
         base64Image: screenshotBuffer.toString("base64"),
         currentUrl,
@@ -767,6 +800,129 @@ export class V3CuaAgentHandler {
     }
   }
 
+  /**
+   * Emit a pre-action CUA screenshot — the exact buffer the model received
+   * as input. Tier-1 evidence (agent-mirrored); the tier-2 probe is taken
+   * separately in emitCuaActionStep after the action runs, so the recorder
+   * can compare what the model saw against what the page actually showed
+   * once the keystrokes/clicks landed.
+   */
+  private emitCuaScreenshot(
+    screenshot: Buffer,
+    url: string,
+  ): AgentScreenshotTakenEvent {
+    const event: AgentScreenshotTakenEvent = {
+      stepIndex: this.cuaStepCounter++,
+      screenshot,
+      url,
+      evidenceRole: "agent",
+    };
+    this.latestCuaScreenshot = event;
+    this.latestCuaScreenshotConsumed = false;
+    this.v3.bus.emit("agent_screenshot_taken_event", event);
+    return event;
+  }
+
+  private async emitCuaActionStep(
+    action: AgentAction,
+    result: ActionExecutionResult | undefined,
+  ): Promise<void> {
+    let pageUrl =
+      typeof action.pageUrl === "string"
+        ? action.pageUrl
+        : this.latestCuaScreenshot?.url;
+    try {
+      pageUrl = (await this.v3.context.awaitActivePage()).url();
+    } catch {
+      // Keep the best pre-action URL fallback.
+    }
+    let stepIndex: number;
+
+    if (this.latestCuaScreenshot && !this.latestCuaScreenshotConsumed) {
+      stepIndex = this.latestCuaScreenshot.stepIndex;
+      this.latestCuaScreenshotConsumed = true;
+    } else if (this.latestCuaScreenshot) {
+      stepIndex = this.cuaStepCounter++;
+      this.v3.bus.emit("agent_screenshot_taken_event", {
+        ...this.latestCuaScreenshot,
+        stepIndex,
+      });
+    } else {
+      stepIndex = this.cuaStepCounter++;
+    }
+
+    const actionArgs = Object.fromEntries(
+      Object.entries(action).filter(([key]) => key !== "screenshot"),
+    );
+    const reasoning =
+      typeof action.reasoning === "string"
+        ? action.reasoning
+        : typeof action.action === "string"
+          ? action.action
+          : "";
+
+    this.v3.bus.emit("agent_step_finished_event", {
+      stepIndex,
+      actionName: String(action.type),
+      actionArgs,
+      reasoning,
+      toolOutput: {
+        ok: result?.success !== false,
+        result: result ?? { success: true },
+        error: result?.error,
+      },
+      finishedAt: new Date().toISOString(),
+    });
+
+    // Post-action tier-2 probe. The pre-action screenshot from
+    // screenshotProvider is what the model SAW; this one shows what the
+    // page actually LOOKS LIKE after the action ran. Without this the
+    // verifier has no visual evidence that keystrokes/clicks landed, and
+    // has to trust the action history alone.
+    //
+    // Listener-gated to keep ordinary agent runs free of the extra
+    // screenshot cost — mirrors v3AgentHandler's post-step probe.
+    const wantsScreenshotProbe =
+      this.v3.bus.listenerCount("agent_screenshot_taken_event") > 0;
+    const wantsStepObservation =
+      this.v3.bus.listenerCount("agent_step_observed_event") > 0;
+    let probeUrl = pageUrl;
+    if (wantsScreenshotProbe || wantsStepObservation) {
+      try {
+        const page = await this.v3.context.awaitActivePage();
+        probeUrl = page.url();
+        if (wantsScreenshotProbe) {
+          const probeScreenshot = await page.screenshot({ fullPage: false });
+          this.v3.bus.emit("agent_screenshot_taken_event", {
+            stepIndex,
+            screenshot: probeScreenshot,
+            url: probeUrl,
+            evidenceRole: "probe",
+          });
+        }
+      } catch (e) {
+        this.logger({
+          category: "agent",
+          message: `Warning: CUA post-action probe failed: ${
+            e instanceof Error ? e.message : String(e)
+          }`,
+          level: 1,
+        });
+      }
+    }
+
+    if (probeUrl && wantsStepObservation) {
+      // Capture the a11y tree alongside the URL probe so the verifier can
+      // ground textual claims without OCR. Best-effort.
+      const ariaTree = await captureAriaTreeProbe(this.v3);
+      this.v3.bus.emit("agent_step_observed_event", {
+        stepIndex,
+        url: probeUrl,
+        ariaTree,
+      });
+    }
+  }
+
   private async injectCursor(): Promise<void> {
     try {
       const page = await this.v3.context.awaitActivePage();
diff --git a/packages/core/lib/v3/types/public/busEvents.ts b/packages/core/lib/v3/types/public/busEvents.ts
new file mode 100644
index 0000000000..62e9929492
--- /dev/null
+++ b/packages/core/lib/v3/types/public/busEvents.ts
@@ -0,0 +1,108 @@
+/**
+ * Bus event payloads emitted by V3 on `v3.bus`.
+ *
+ * The bus is an EventEmitter; these types document the payload shape per
+ * event name so consumers (TrajectoryRecorder in packages/evals, custom
+ * subscribers) can type their handlers.
+ *
+ * Wave 0 of the verifier rewrite plan introduces:
+ *   - agent_screenshot_taken_event    — screenshot capture, tagged by evidenceRole
+ *   - agent_step_finished_event       — fired per tool-call in a step result
+ *   - agent_step_observed_event       — fired after the harness probe completes
+ *   - agent_final_answer_event        — fired when the `done` tool resolves
+ *
+ * `agent_step_started_event` is documented in the plan but deferred — the AI
+ * SDK's `onStepFinish` is a post-hook, and there's no symmetric pre-hook per
+ * tool execution in v3AgentHandler today. Started-state can be derived from
+ * the finished event's stepIndex if needed.
+ */
+
+/**
+ * Names of bus events the agent handlers emit. Use these constants to
+ * subscribe; the bus accepts arbitrary strings, but a centralized list helps
+ * catch typos at the call site.
+ */
+export const BUS_EVENTS = {
+  AGENT_SCREENSHOT_TAKEN: "agent_screenshot_taken_event",
+  AGENT_STEP_FINISHED: "agent_step_finished_event",
+  AGENT_STEP_OBSERVED: "agent_step_observed_event",
+  AGENT_FINAL_ANSWER: "agent_final_answer_event",
+} as const;
+
+export type BusEventName = (typeof BUS_EVENTS)[keyof typeof BUS_EVENTS];
+
+/**
+ * Payload for `agent_screenshot_taken_event`. A raw screenshot Buffer captured
+ * by the harness; `evidenceRole` says which evidence tier it belongs to.
+ *
+ * Note: CUA emits the pre-action Buffer sent to the provider ("agent") plus a
+ * separate post-action probe; DOM/hybrid emits only the post-step probe. The
+ * verifier keeps the tiers apart — see plan §04 ("Mode-by-mode sources").
+ */
+export interface AgentScreenshotTakenEvent {
+  /** Zero-based index of the step this screenshot corresponds to. */
+  stepIndex: number;
+  /** PNG bytes from page.screenshot(). */
+  screenshot: Buffer;
+  /** Page URL at the time of capture. */
+  url: string;
+  /**
+   * Evidence role; TrajectoryRecorder treats a missing role as "probe".
+   *
+   * "probe" = harness capture taken after the tool/action ran (tier 2).
+   * "agent" = the exact image bytes sent to the CUA provider (tier 1).
+   * "agent_and_probe" = a single capture that serves both tiers.
+   */
+  evidenceRole?: "probe" | "agent" | "agent_and_probe";
+}
+
+/**
+ * Payload for `agent_step_finished_event`. Emitted once per tool call within
+ * a step result. Carries the tool's reported outcome and a reference to the
+ * agent's textual reasoning for the step.
+ *
+ * Tier 1 evidence (the bytes the LLM consumed as the tool result) is captured
+ * separately by the harness via an AgentExecuteCallbacks.onStepFinish wrapper
+ * — not in this payload. See plan §10 Q1 (resolved: onStepFinish).
+ */
+export interface AgentStepFinishedEvent {
+  stepIndex: number;
+  /** Name of the tool that ran (e.g., "act", "extract", "click"). */
+  actionName: string;
+  /** Arguments passed to the tool. */
+  actionArgs: Record<string, unknown>;
+  /** Agent's textual reasoning (event.text on the AI SDK StepResult). */
+  reasoning: string;
+  /** Outcome of the tool execution as seen by the harness. */
+  toolOutput: {
+    ok: boolean;
+    /** The tool's native return value. */
+    result: unknown;
+    error?: string;
+  };
+  /** ISO 8601 timestamp at which the step finished. */
+  finishedAt: string;
+}
+
+/**
+ * Payload for `agent_step_observed_event`. Emitted after the harness probe
+ * completes for a step: page URL always, a11y tree when capture succeeds.
+ * The handlers in this patch do not populate `scroll` yet.
+ */
+export interface AgentStepObservedEvent {
+  stepIndex: number;
+  /** Page URL after the step's tool execution. */
+  url: string;
+  /** v1 — accessibility tree snapshot. */
+  ariaTree?: string;
+  /** v1 — viewport scroll context. */
+  scroll?: { top: number; pageHeight: number };
+}
+
+/** Payload for `agent_final_answer_event`. Emitted when the `done` tool resolves. */
+export interface AgentFinalAnswerEvent {
+  /** The agent's final summary message. */
+  message: string;
+  /** Optional structured output if the agent's `output` schema was set. */
+  output?: Record<string, unknown>;
+}
diff --git a/packages/core/lib/v3/types/public/index.ts b/packages/core/lib/v3/types/public/index.ts
index 9c5df08d01..9bf24eb271 100644
--- a/packages/core/lib/v3/types/public/index.ts
+++ b/packages/core/lib/v3/types/public/index.ts
@@ -1,4 +1,5 @@
 export * from "./agent.js";
+export * from "./busEvents.js";
 // Export api.ts under namespace to avoid conflicts with methods.ts types
 export * as Api from "./api.js";
 // Also export BrowserbaseRegion directly for convenience
diff --git a/packages/evals/framework/trajectoryRecorder.ts b/packages/evals/framework/trajectoryRecorder.ts
new file mode 100644
index 0000000000..2b7f24b529
--- /dev/null
+++ b/packages/evals/framework/trajectoryRecorder.ts
@@ -0,0 +1,507 @@
+/**
+ * TrajectoryRecorder — subscribes to v3.bus step events emitted by the agent
+ * handlers (v3AgentHandler / v3CuaAgentHandler) and assembles a Trajectory
+ * the verifier can consume.
+ *
+ * Lifecycle:
+ *   const recorder = new TrajectoryRecorder({ v3, taskSpec });
+ *   recorder.start();
+ *   await agent.execute(...);
+ *   const trajectory = await recorder.finish({ status: "complete", usage });
+ *
+ * Persistence is env-gated by `VERIFIER_PERSIST_TRAJECTORIES` (plan §10 Q2):
+ *   - unset: persistence follows the default (on locally, off in CI).
+ *   - "1" / "true": always persist.
+ *   - "0" / "false": never persist.
+ *
+ * On-disk layout matches microsoft/fara's example_trajectory/ so we can
+ * cross-validate against verify_trajectories.py without format conversion.
+ *
+ * @see ~/.claude/plans/verifier-rewrite.html §06 (Trajectory on-disk)
+ */
+import fs from "node:fs/promises";
+import path from "node:path";
+import type {
+  AgentEvidence,
+  AgentFinalAnswerEvent,
+  AgentScreenshotTakenEvent,
+  AgentStepFinishedEvent,
+  AgentStepObservedEvent,
+  ProbeEvidence,
+  TaskSpec,
+  Trajectory,
+  TrajectoryStatus,
+  TrajectoryStep,
+  TrajectoryUsage,
+  Verdict,
+  V3,
+} from "@browserbasehq/stagehand";
+
+interface PartialStep {
+  index: number;
+  actionName: string;
+  actionArgs: Record<string, unknown>;
+  reasoning: string;
+  agentEvidence: AgentEvidence;
+  probeEvidence: ProbeEvidence;
+  toolOutput: { ok: boolean; result: unknown; error?: string };
+  finishedAt: string;
+}
+
+export interface TrajectoryRecorderOptions {
+  v3: V3;
+  taskSpec: TaskSpec;
+  /**
+   * Root directory under which trajectory dirs are written. Each task run
+   * gets a subdirectory named by runId/task.id.
+   * Defaults to `<cwd>/.trajectories`.
+   */
+  outputRoot?: string;
+  /** Run identifier (e.g., ISO timestamp + env). Defaults to a fresh timestamp. */
+  runId?: string;
+  /**
+   * Override the env-gated persistence default. `true` always persists,
+   * `false` never does, `undefined` defers to VERIFIER_PERSIST_TRAJECTORIES.
+   */
+  persist?: boolean;
+}
+
+export interface TrajectoryFinishOptions {
+  status: TrajectoryStatus;
+  finalAnswer?: string;
+  usage?: Partial<TrajectoryUsage>;
+}
+
+const ZERO_USAGE: TrajectoryUsage = {
+  input_tokens: 0,
+  output_tokens: 0,
+};
+
+/**
+ * Decide whether to persist by default. Honors the explicit override first,
+ * then env, then falls back to "persist when not in CI".
+ */
+function shouldPersist(override: boolean | undefined): boolean {
+  if (override !== undefined) return override;
+  const env = process.env.VERIFIER_PERSIST_TRAJECTORIES?.toLowerCase();
+  if (env === "1" || env === "true") return true;
+  if (env === "0" || env === "false") return false;
+  return !process.env.CI;
+}
+
+export class TrajectoryRecorder {
+  private readonly v3: V3;
+  private readonly taskSpec: TaskSpec;
+  private readonly runId: string;
+  private readonly outputDir: string;
+  private readonly persistEnabled: boolean;
+
+  // Per-stepIndex builders. Events for one step arrive across async gaps and
+  // in handler-specific order (CUA emits its "agent" screenshot before
+  // step_finished), so each handler merges into the partial via ensurePartial.
+  private readonly partialSteps = new Map<number, Partial<PartialStep>>();
+  private readonly observationByStep = new Map<
+    number,
+    AgentStepObservedEvent
+  >();
+  private readonly screenshotsByStep = new Map<
+    number,
+    AgentScreenshotTakenEvent
+  >();
+  private finalAnswerEvent?: AgentFinalAnswerEvent;
+  private startedAt = "";
+  private endedAt = "";
+  private listenersAttached = false;
+
+  // Strongly-typed bound handlers so we can attach/detach the same references.
+  private readonly onScreenshot = (e: AgentScreenshotTakenEvent) => {
+    this.screenshotsByStep.set(e.stepIndex, e);
+    const partial = this.ensurePartial(e.stepIndex);
+
+    // Default to "probe" when the emit site doesn't tag the role — matches
+    // v3AgentHandler's post-step screenshot, which is always a tier-2 probe.
+    const role = e.evidenceRole ?? "probe";
+
+    // Probe channel (tier 2): the page's state at observation time. For CUA
+    // the pre-action screenshot is NOT a probe — that role is filled by the
+    // post-action emit from emitCuaActionStep. So only update probe.screenshot
+    // when the event explicitly carries the probe role.
+    if (role === "probe" || role === "agent_and_probe") {
+      const probe: ProbeEvidence = { ...(partial.probeEvidence ?? {}) };
+      probe.screenshot = e.screenshot;
+      probe.url = e.url;
+      partial.probeEvidence = probe;
+    } else if (!partial.probeEvidence?.url) {
+      // Even for tier-1-only events, the URL is useful probe context if we
+      // don't have one yet. Doesn't overwrite a later post-action URL.
+      partial.probeEvidence = {
+        ...(partial.probeEvidence ?? {}),
+        url: e.url,
+      };
+    }
+
+    // Agent channel (tier 1): bytes the model ingested.
+    if (role === "agent" || role === "agent_and_probe") {
+      partial.agentEvidence = mergeAgentEvidence(partial.agentEvidence, {
+        modalities: [
+          { type: "image", bytes: e.screenshot, mediaType: "image/png" },
+        ],
+      });
+    }
+  };
+  private readonly onStepFinished = (e: AgentStepFinishedEvent) => {
+    const partial = this.ensurePartial(e.stepIndex);
+    partial.actionName = e.actionName;
+    partial.actionArgs = e.actionArgs;
+    partial.reasoning = e.reasoning;
+    partial.toolOutput = e.toolOutput;
+    partial.finishedAt = e.finishedAt;
+    partial.agentEvidence = mergeAgentEvidence(
+      partial.agentEvidence,
+      buildAgentEvidence(e),
+    );
+  };
+  private readonly onStepObserved = (e: AgentStepObservedEvent) => {
+    this.observationByStep.set(e.stepIndex, e);
+    const partial = this.ensurePartial(e.stepIndex);
+    const probe: ProbeEvidence = { ...(partial.probeEvidence ?? {}) };
+    probe.url = e.url;
+    if (e.ariaTree !== undefined) probe.ariaTree = e.ariaTree;
+    if (e.scroll !== undefined) probe.scroll = e.scroll;
+    partial.probeEvidence = probe;
+  };
+  private readonly onFinalAnswer = (e: AgentFinalAnswerEvent) => {
+    this.finalAnswerEvent = e;
+  };
+
+  constructor(opts: TrajectoryRecorderOptions) {
+    this.v3 = opts.v3;
+    this.taskSpec = opts.taskSpec;
+    // Default runId: ISO timestamp with ":" and "." swapped for "-" so it is a
+    // valid directory name on every platform.
+    this.runId = opts.runId ?? new Date().toISOString().replace(/[:.]/g, "-");
+    const root = opts.outputRoot ?? path.join(process.cwd(), ".trajectories");
+    this.outputDir = path.join(root, this.runId, opts.taskSpec.id);
+    this.persistEnabled = shouldPersist(opts.persist);
+  }
+
+  /** Subscribe to bus events. Call once before agent.execute(). */
+  start(): void {
+    if (this.listenersAttached) return;
+    this.startedAt = new Date().toISOString();
+    this.v3.bus.on("agent_screenshot_taken_event", this.onScreenshot);
+    this.v3.bus.on("agent_step_finished_event", this.onStepFinished);
+    this.v3.bus.on("agent_step_observed_event", this.onStepObserved);
+    this.v3.bus.on("agent_final_answer_event", this.onFinalAnswer);
+    this.listenersAttached = true;
+  }
+
+  /**
+   * Detach listeners, assemble the Trajectory, and (if persistence is on)
+   * write the on-disk layout. Repeat calls re-stamp endedAt and re-persist.
+   */
+  async finish(opts: TrajectoryFinishOptions): Promise<Trajectory> {
+    this.detach();
+    this.endedAt = new Date().toISOString();
+
+    const steps = this.assembleSteps();
+    const trajectory: Trajectory = {
+      task: this.taskSpec,
+      steps,
+      finalAnswer: opts.finalAnswer ?? this.finalAnswerEvent?.message,
+      status: opts.status,
+      usage: { ...ZERO_USAGE, ...(opts.usage ?? {}) },
+      timing: { startedAt: this.startedAt, endedAt: this.endedAt },
+    };
+
+    if (this.persistEnabled) {
+      await this.persist(trajectory);
+    }
+
+    return trajectory;
+  }
+
+  /** Throw away in-memory state without writing to disk. Used on early abort. */
+  cancel(): void {
+    this.detach();
+    this.partialSteps.clear();
+    this.observationByStep.clear();
+    this.screenshotsByStep.clear();
+    this.finalAnswerEvent = undefined;
+  }
+
+  /** Where the trajectory dir lives (whether or not it was persisted). */
+  get directory(): string {
+    return this.outputDir;
+  }
+
+  /** Whether this recorder wrote the trajectory directory on finish(). */
+  get persisted(): boolean {
+    return this.persistEnabled;
+  }
+
+  /**
+   * Persist verifier scores next to the trajectory. No-op when trajectory
+   * persistence is disabled.
+   */
+  async persistVerdict(
+    verdict: Verdict,
+    filename = "mmrubric_v1.json",
+  ): Promise<void> {
+    if (!this.persistEnabled) return;
+
+    const scoresDir = path.join(this.outputDir, "scores");
+    await fs.mkdir(scoresDir, { recursive: true });
+    await fs.writeFile(
+      path.join(scoresDir, filename),
+      JSON.stringify(verdict, null, 2),
+    );
+
+    const taskDataPath = path.join(this.outputDir, "task_data.json");
+    let taskData: Record<string, unknown> = {};
+    try {
+      taskData = JSON.parse(await fs.readFile(taskDataPath, "utf8")) as Record<
+        string,
+        unknown
+      >;
+    } catch {
+      taskData = { task: this.taskSpec };
+    }
+    await fs.writeFile(
+      taskDataPath,
+      JSON.stringify({ ...taskData, verdict }, null, 2),
+    );
+  }
+
+  private detach(): void {
+    if (!this.listenersAttached) return;
+    this.v3.bus.off("agent_screenshot_taken_event", this.onScreenshot);
+    this.v3.bus.off("agent_step_finished_event", this.onStepFinished);
+    this.v3.bus.off("agent_step_observed_event", this.onStepObserved);
+    this.v3.bus.off("agent_final_answer_event", this.onFinalAnswer);
+    this.listenersAttached = false;
+  }
+
+  private ensurePartial(stepIndex: number): Partial<PartialStep> {
+    let p = this.partialSteps.get(stepIndex);
+    if (!p) {
+      p = { index: stepIndex };
+      this.partialSteps.set(stepIndex, p);
+    }
+    return p;
+  }
+
+  /**
+   * Materialize ordered TrajectoryStep[] from the accumulated partials.
+   * Partials without a step_finished event (actionName, toolOutput and
+   * finishedAt all set) are skipped — e.g. CUA screenshot-only indices.
+   * Every emitted step reuses the recorder-level startedAt timestamp.
+   */
+  private assembleSteps(): TrajectoryStep[] {
+    const out: TrajectoryStep[] = [];
+    const ordered = [...this.partialSteps.entries()].sort(([a], [b]) => a - b);
+    for (const [i, p] of ordered) {
+      if (
+        p.actionName === undefined ||
+        p.toolOutput === undefined ||
+        p.finishedAt === undefined
+      ) {
+        // Screenshot-only partial (typically CUA). It is dropped entirely:
+        // persist() walks trajectory.steps only, so evidence attached here
+        // is never written to disk.
+        continue;
+      }
+      out.push({
+        index: i,
+        actionName: p.actionName,
+        actionArgs: p.actionArgs ?? {},
+        reasoning: p.reasoning ?? "",
+        agentEvidence: p.agentEvidence ?? { modalities: [] },
+        probeEvidence: p.probeEvidence ?? {},
+        toolOutput: p.toolOutput,
+        startedAt: this.startedAt,
+        finishedAt: p.finishedAt,
+      });
+    }
+    return out;
+  }
+
+  /**
+   * Write the trajectory directory layout. Mirrors fara's example_trajectory/:
+   *
+   *   <outputDir>/
+   *     ├── task_data.json
+   *     ├── trajectory.json    (screenshots referenced by path)
+   *     ├── screenshot_<N>.png
+   *     └── times.json
+   */
+  private async persist(trajectory: Trajectory): Promise<void> {
+    await fs.mkdir(this.outputDir, { recursive: true });
+
+    // Walk steps and write screenshots; replace Buffer with path reference in
+    // the serialized trajectory. Both tiers externalize image bytes under
+    //   screenshots/probe/<N>.png   — tier 2, what the harness observed
+    //   screenshots/agent/<N>.png   — tier 1, what the model received
+    // The `_<j>` suffix only appears when a step carries multiple images
+    // (rare; typically zero or one per step). Paths in JSON are relative to
+    // the trajectory dir so the directory is movable/copyable as a unit.
+    await fs.mkdir(path.join(this.outputDir, "screenshots", "probe"), {
+      recursive: true,
+    });
+    await fs.mkdir(path.join(this.outputDir, "screenshots", "agent"), {
+      recursive: true,
+    });
+
+    const serializableSteps: unknown[] = [];
+    for (const step of trajectory.steps) {
+      const probe: ProbeEvidence = { ...step.probeEvidence };
+      if (probe.screenshot) {
+        const relPath = `screenshots/probe/${step.index + 1}.png`;
+        await fs.writeFile(
+          path.join(this.outputDir, relPath),
+          probe.screenshot,
+        );
+        probe.screenshotPath = relPath;
+        delete probe.screenshot;
+      }
+
+      const imageModalities = step.agentEvidence.modalities.filter(
+        (m) => m.type === "image",
+      );
+      const multipleImages = imageModalities.length > 1;
+      let imageSeq = 0;
+      const modalities: unknown[] = [];
+      for (const m of step.agentEvidence.modalities) {
+        if (m.type !== "image") {
+          modalities.push(m);
+          continue;
+        }
+        const suffix = multipleImages ? `_${imageSeq}` : "";
+        const relPath = `screenshots/agent/${step.index + 1}${suffix}.png`;
+        await fs.writeFile(path.join(this.outputDir, relPath), m.bytes);
+        modalities.push({
+          type: "image",
+          imagePath: relPath,
+          mediaType: m.mediaType,
+        });
+        imageSeq += 1;
+      }
+      const agentEvidence = { modalities };
+      serializableSteps.push({ ...step, probeEvidence: probe, agentEvidence });
+    }
+
+    // Image modalities carry imagePath instead of raw bytes on disk, so this
+    // is no longer a strict Trajectory at the type level. Cast through
+    // unknown rather than widening the type contract.
+    const serialized = {
+      ...trajectory,
+      steps: serializableSteps,
+    } as unknown;
+
+    await fs.writeFile(
+      path.join(this.outputDir, "trajectory.json"),
+      JSON.stringify(serialized, null, 2),
+    );
+
+    // task_data.json mirrors fara's shape: TaskSpec + (later) verdict.
+    await fs.writeFile(
+      path.join(this.outputDir, "task_data.json"),
+      JSON.stringify(
+        {
+          task: trajectory.task,
+          status: trajectory.status,
+          finalAnswer: trajectory.finalAnswer ?? null,
+        },
+        null,
+        2,
+      ),
+    );
+
+    await fs.writeFile(
+      path.join(this.outputDir, "times.json"),
+      JSON.stringify(
+        {
+          timing: trajectory.timing,
+          usage: trajectory.usage,
+          stepCount: trajectory.steps.length,
+        },
+        null,
+        2,
+      ),
+    );
+
+    await fs.mkdir(path.join(this.outputDir, "scores"), { recursive: true });
+    await fs.writeFile(
+      path.join(this.outputDir, "core.log"),
+      coreLog(trajectory),
+    );
+  }
+}
+
+function mergeAgentEvidence(
+  ...parts: Array<AgentEvidence | undefined>
+): AgentEvidence {
+  return {
+    modalities: parts.flatMap((p) => p?.modalities ?? []),
+  };
+}
+
+/**
+ * Build a tier-1 AgentEvidence from a step_finished event. The handler's
+ * toolOutput.result is what the LLM consumed next turn (modulo SDK
+ * serialization). Wave 1 will replace this with a higher-fidelity capture
+ * pulled from event.response.messages.
+ */
+function buildAgentEvidence(e: AgentStepFinishedEvent): AgentEvidence {
+  const modalities: AgentEvidence["modalities"] = [];
+  if (e.reasoning) {
+    modalities.push({ type: "text", content: e.reasoning });
+  }
+  const result = e.toolOutput.result;
+  if (result === undefined || result === null) {
+    return { modalities };
+  }
+  if (typeof result === "string") {
+    modalities.push({ type: "text", content: result });
+  } else if (Buffer.isBuffer(result)) {
+    modalities.push({
+      type: "image",
+      bytes: result,
+      mediaType: "image/png",
+    });
+  } else if (typeof result === "object") {
+    // Vision tools return the screenshot inline as `screenshotBase64`. Lift
+    // it into an image modality and omit it from the JSON copy so that
+    // trajectory.json does not embed the externalized PNG again as base64.
+    const { screenshotBase64, ...rest } = result as Record<string, unknown>;
+    if (typeof screenshotBase64 === "string") {
+      modalities.push({
+        type: "image",
+        bytes: Buffer.from(screenshotBase64, "base64"),
+        mediaType: "image/png",
+      });
+      modalities.push({ type: "json", content: rest });
+    } else {
+      modalities.push({ type: "json", content: result });
+    }
+  }
+  return { modalities };
+}
+
+function coreLog(trajectory: Trajectory): string {
+  return (
+    trajectory.steps
+      .map((step) =>
+        JSON.stringify({
+          step: step.index,
+          action: step.actionName,
+          url: step.probeEvidence.url ?? null,
+          ok: step.toolOutput.ok,
+          reasoning: step.reasoning || undefined,
+          startedAt: step.startedAt,
+          finishedAt: step.finishedAt,
+        }),
+      )
+      .join("\n") + "\n"
+  );
+}
diff --git a/packages/evals/scripts/verify-trajectory-recorder.ts b/packages/evals/scripts/verify-trajectory-recorder.ts
new file mode 100644
index 0000000000..20dfb85b6b
--- /dev/null
+++ b/packages/evals/scripts/verify-trajectory-recorder.ts
@@ -0,0 +1,230 @@
+/**
+ * Wave 0 smoke test — verifies the TrajectoryRecorder plumbing end-to-end
+ * without launching a browser or calling an LLM.
+ *
+ * Drives a fake V3 (just an EventEmitter-shaped `bus`) through the same bus
+ * events the real agent handlers emit, then asserts:
+ *   1. The recorder assembles a Trajectory with the expected step shape.
+ *   2. The persisted directory layout matches fara's example_trajectory/.
+ *   3. V3Evaluator.verify() returns a parseable stub Verdict.
+ *
+ * Run via:  pnpm tsx packages/evals/scripts/verify-trajectory-recorder.ts
+ */
+import assert from "node:assert/strict";
+import fs from "node:fs/promises";
+import os from "node:os";
+import path from "node:path";
+import { EventEmitter } from "node:events";
+
+import { TrajectoryRecorder } from "../framework/trajectoryRecorder.js";
+import { V3Evaluator } from "@browserbasehq/stagehand";
+import type { TaskSpec, V3 } from "@browserbasehq/stagehand";
+
+interface FakeV3 {
+  bus: EventEmitter;
+}
+
+async function main(): Promise<void> {
+  const tmpRoot = await fs.mkdtemp(
+    path.join(os.tmpdir(), "verifier-rewrite-smoke-"),
+  );
+  console.log(`▸ tmpdir: ${tmpRoot}`);
+
+  const bus = new EventEmitter();
+  const v3 = { bus } as unknown as V3;
+  const taskSpec: TaskSpec = {
+    id: "smoke-united_13",
+    instruction:
+      "What is the price difference between economy and business class on United?",
+    initUrl: "https://www.google.com",
+    precomputedRubric: {
+      items: [
+        {
+          criterion: "Identify correct route",
+          description: "Agent identifies United CHI→GRU flight.",
+          max_points: 2,
+        },
+        {
+          criterion: "Report price delta",
+          description: "Agent reports economy↔business price delta.",
+          max_points: 3,
+        },
+      ],
+    },
+    expectedAnswer: "Approximately $4,000 difference.",
+  };
+
+  const recorder = new TrajectoryRecorder({
+    v3,
+    taskSpec,
+    outputRoot: tmpRoot,
+    runId: "smoke-run",
+    persist: true,
+  });
+  recorder.start();
+
+  // Emit a three-step synthetic trajectory.
+  bus.emit("agent_step_finished_event", {
+    stepIndex: 0,
+    actionName: "goto",
+    actionArgs: { url: "https://united.com" },
+    reasoning: "Open United Airlines homepage.",
+    toolOutput: { ok: true, result: { url: "https://united.com" } },
+    finishedAt: new Date().toISOString(),
+  });
+  bus.emit("agent_screenshot_taken_event", {
+    stepIndex: 0,
+    screenshot: Buffer.from("fake-png-bytes-0"),
+    url: "https://united.com",
+    evidenceRole: "agent_and_probe",
+  });
+  bus.emit("agent_step_observed_event", {
+    stepIndex: 0,
+    url: "https://united.com",
+  });
+
+  bus.emit("agent_step_finished_event", {
+    stepIndex: 1,
+    actionName: "act",
+    actionArgs: { instruction: "Search Chicago to São Paulo, Nov 24" },
+    reasoning: "Enter route and dates.",
+    toolOutput: {
+      ok: true,
+      result: { success: true, describe: "Filled route + dates" },
+    },
+    finishedAt: new Date().toISOString(),
+  });
+  bus.emit("agent_screenshot_taken_event", {
+    stepIndex: 1,
+    screenshot: Buffer.from("fake-png-bytes-1"),
+    url: "https://united.com/search",
+  });
+  bus.emit("agent_step_observed_event", {
+    stepIndex: 1,
+    url: "https://united.com/search",
+  });
+
+  bus.emit("agent_step_finished_event", {
+    stepIndex: 2,
+    actionName: "extract",
+    actionArgs: { instruction: "extract fare cells" },
+    reasoning: "Read economy and business fares from the results page.",
+    toolOutput: {
+      ok: true,
+      result: { economy: "$1,234", business: "$5,789" },
+    },
+    finishedAt: new Date().toISOString(),
+  });
+  bus.emit("agent_screenshot_taken_event", {
+    stepIndex: 2,
+    screenshot: Buffer.from("fake-png-bytes-2"),
+    url: "https://united.com/results",
+  });
+  bus.emit("agent_step_observed_event", {
+    stepIndex: 2,
+    url: "https://united.com/results",
+    ariaTree:
+      "[0-1] RootWebArea: United Search Results\n  [0-3] heading: Flight 1234\n    [0-4] StaticText: Economy $1,234\n    [0-5] StaticText: Business $5,789",
+  });
+
+  bus.emit("agent_final_answer_event", {
+    message: "Economy $1,234 vs business $5,789 — delta $4,555.",
+  });
+
+  const trajectory = await recorder.finish({
+    status: "complete",
+    usage: { input_tokens: 1234, output_tokens: 567 },
+  });
+
+  // ── Assertions ──────────────────────────────────────────────────────────
+  assert.equal(trajectory.steps.length, 3, "expected 3 steps");
+  assert.equal(trajectory.steps[0].actionName, "goto");
+  assert.equal(trajectory.steps[1].actionName, "act");
+  assert.equal(trajectory.steps[2].actionName, "extract");
+  assert.ok(
+    trajectory.steps[0].agentEvidence.modalities.some(
+      (m) => m.type === "image",
+    ),
+    "CUA-style screenshot event should populate tier-1 image evidence",
+  );
+  assert.ok(
+    trajectory.steps[2].agentEvidence.modalities.some(
+      (m) =>
+        m.type === "json" &&
+        typeof m.content === "object" &&
+        m.content !== null &&
+        "economy" in (m.content as Record<string, unknown>),
+    ),
+    "extract step should carry a json modality with economy field",
+  );
+  assert.equal(
+    trajectory.finalAnswer,
+    "Economy $1,234 vs business $5,789 — delta $4,555.",
+  );
+  assert.equal(trajectory.status, "complete");
+  assert.equal(trajectory.usage.input_tokens, 1234);
+  // a11y dump on step 2 should round-trip through the recorder into
+  // probeEvidence.ariaTree.
+  assert.ok(
+    trajectory.steps[2].probeEvidence.ariaTree?.includes("Economy $1,234"),
+    "step_observed.ariaTree should populate probeEvidence.ariaTree",
+  );
+  console.log("  ✓ in-memory Trajectory shape (incl. ariaTree round-trip)");
+
+  // ── On-disk layout ──────────────────────────────────────────────────────
+  const taskDir = path.join(tmpRoot, "smoke-run", "smoke-united_13");
+  const files = (await fs.readdir(taskDir)).sort();
+  assert.deepEqual(
+    files,
+    [
+      "core.log",
+      "scores",
+      "screenshots",
+      "task_data.json",
+      "times.json",
+      "trajectory.json",
+    ],
+    `expected new trajectory layout, got ${files.join(", ")}`,
+  );
+  const probeFiles = (
+    await fs.readdir(path.join(taskDir, "screenshots", "probe"))
+  ).sort();
+  assert.deepEqual(
+    probeFiles,
+    ["1.png", "2.png", "3.png"],
+    `expected probe screenshots, got ${probeFiles.join(", ")}`,
+  );
+  const screenshotBytes = await fs.readFile(
+    path.join(taskDir, "screenshots", "probe", "1.png"),
+  );
+  assert.equal(screenshotBytes.toString(), "fake-png-bytes-0");
+  const coreLog = await fs.readFile(path.join(taskDir, "core.log"), "utf8");
+  assert.ok(coreLog.includes('"action":"goto"'));
+  console.log("  ✓ on-disk layout matches fara's example_trajectory");
+
+  const persistedTask = JSON.parse(
+    await fs.readFile(path.join(taskDir, "task_data.json"), "utf8"),
+  );
+  assert.equal(persistedTask.task.id, "smoke-united_13");
+  assert.equal(persistedTask.status, "complete");
+
+  // ── V3Evaluator.verify() exercised live in verify-live-trajectory.ts ──
+  // Sanity-check that the V3Evaluator class still constructs from a minimal
+  // V3 shape (recorder doesn't depend on the evaluator for plumbing).
+  const _unused: typeof V3Evaluator = V3Evaluator;
+  void _unused;
+  console.log(
+    "  ✓ V3Evaluator still constructs (verify() exercised live elsewhere)",
+  );
+
+  console.log("\n✅ Wave 0 plumbing OK");
+  await fs.rm(tmpRoot, { recursive: true, force: true });
+}
+
+main().catch((err) => {
+  console.error("\n❌ Wave 0 plumbing FAILED:", err);
+  process.exit(1);
+});
+
+// Type guard for FakeV3 lint suppression (the file uses `as unknown as V3`).
+export type { FakeV3 };

From 40e7ab30b81903677551f28d1238d20747763bc5 Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Fri, 15 May 2026 14:16:26 -0700
Subject: [PATCH 02/27] fix(verifier): align trajectory naming

---
 .changeset/verifier-trajectory-events.md             | 5 +++++
 packages/evals/framework/trajectoryRecorder.ts       | 6 ++++--
 packages/evals/scripts/verify-trajectory-recorder.ts | 4 ++--
 3 files changed, 11 insertions(+), 4 deletions(-)
 create mode 100644 .changeset/verifier-trajectory-events.md

diff --git a/.changeset/verifier-trajectory-events.md b/.changeset/verifier-trajectory-events.md
new file mode 100644
index 0000000000..9dcb5c8192
--- /dev/null
+++ b/.changeset/verifier-trajectory-events.md
@@ -0,0 +1,5 @@
+---
+"@browserbasehq/stagehand": patch
+---
+
+Capture verifier trajectory evidence from v3 agent events for offline scoring.
diff --git a/packages/evals/framework/trajectoryRecorder.ts b/packages/evals/framework/trajectoryRecorder.ts
index 2b7f24b529..501668c2be 100644
--- a/packages/evals/framework/trajectoryRecorder.ts
+++ b/packages/evals/framework/trajectoryRecorder.ts
@@ -327,12 +327,14 @@ export class TrajectoryRecorder {
   }
 
   /**
-   * Write the trajectory directory layout. Mirrors fara's example_trajectory/:
+   * Write the trajectory directory layout.
    *
    *   <outputDir>/
    *     ├── task_data.json
    *     ├── trajectory.json    (screenshots referenced by path)
-   *     ├── screenshot_<N>.png
+   *     ├── screenshots/
+   *     │   ├── probe/<N>.png
+   *     │   └── agent/<N>.png
    *     └── times.json
    */
   private async persist(trajectory: Trajectory): Promise<void> {
diff --git a/packages/evals/scripts/verify-trajectory-recorder.ts b/packages/evals/scripts/verify-trajectory-recorder.ts
index 20dfb85b6b..7076fff21b 100644
--- a/packages/evals/scripts/verify-trajectory-recorder.ts
+++ b/packages/evals/scripts/verify-trajectory-recorder.ts
@@ -42,12 +42,12 @@ async function main(): Promise<void> {
         {
           criterion: "Identify correct route",
           description: "Agent identifies United CHI→GRU flight.",
-          max_points: 2,
+          maxPoints: 2,
         },
         {
           criterion: "Report price delta",
           description: "Agent reports economy↔business price delta.",
-          max_points: 3,
+          maxPoints: 3,
         },
       ],
     },

From c25367bbcf0af855fadb6321dc0d8f94b7aadd7e Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Fri, 15 May 2026 14:35:06 -0700
Subject: [PATCH 03/27] chore(evals): remove upstream trajectory references

---
 packages/evals/framework/trajectoryRecorder.ts       | 6 +++---
 packages/evals/scripts/verify-trajectory-recorder.ts | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/packages/evals/framework/trajectoryRecorder.ts b/packages/evals/framework/trajectoryRecorder.ts
index 501668c2be..5a8a62f1dc 100644
--- a/packages/evals/framework/trajectoryRecorder.ts
+++ b/packages/evals/framework/trajectoryRecorder.ts
@@ -14,8 +14,8 @@
  *   - "1" / "true": always persist.
  *   - "0" / "false": never persist.
  *
- * On-disk layout matches microsoft/fara's example_trajectory/ so we can
- * cross-validate against verify_trajectories.py without format conversion.
+ * On-disk layout is stable JSON + screenshots so saved runs can be re-scored
+ * without format conversion.
  *
  * @see ~/.claude/plans/verifier-rewrite.html §06 (Trajectory on-disk)
  */
@@ -405,7 +405,7 @@ export class TrajectoryRecorder {
       JSON.stringify(serialized, null, 2),
     );
 
-    // task_data.json mirrors fara's shape: TaskSpec + (later) verdict.
+    // task_data.json stores TaskSpec + (later) verdict.
     await fs.writeFile(
       path.join(this.outputDir, "task_data.json"),
       JSON.stringify(
diff --git a/packages/evals/scripts/verify-trajectory-recorder.ts b/packages/evals/scripts/verify-trajectory-recorder.ts
index 7076fff21b..049b96c706 100644
--- a/packages/evals/scripts/verify-trajectory-recorder.ts
+++ b/packages/evals/scripts/verify-trajectory-recorder.ts
@@ -5,7 +5,7 @@
  * Drives a fake V3 (just an EventEmitter-shaped `bus`) through the same bus
  * events the real agent handlers emit, then asserts:
  *   1. The recorder assembles a Trajectory with the expected step shape.
- *   2. The persisted directory layout matches fara's example_trajectory/.
+ *   2. The persisted directory layout has the expected verifier files.
  *   3. V3Evaluator.verify() returns a parseable stub Verdict.
  *
  * Run via:  pnpm tsx packages/evals/scripts/verify-trajectory-recorder.ts
@@ -200,7 +200,7 @@ async function main(): Promise<void> {
   assert.equal(screenshotBytes.toString(), "fake-png-bytes-0");
   const coreLog = await fs.readFile(path.join(taskDir, "core.log"), "utf8");
   assert.ok(coreLog.includes('"action":"goto"'));
-  console.log("  ✓ on-disk layout matches fara's example_trajectory");
+  console.log("  ✓ on-disk layout has expected verifier files");
 
   const persistedTask = JSON.parse(
     await fs.readFile(path.join(taskDir, "task_data.json"), "utf8"),

From 8e9962cc78c486ec9d497569d57c208fd0c79a3f Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Fri, 15 May 2026 15:19:16 -0700
Subject: [PATCH 04/27] docs(verifier): remove rollout comments from trajectory
 capture

---
 packages/core/lib/v3/handlers/v3AgentHandler.ts      | 2 +-
 packages/core/lib/v3/handlers/v3CuaAgentHandler.ts   | 4 ++--
 packages/evals/framework/trajectoryRecorder.ts       | 7 ++-----
 packages/evals/scripts/verify-trajectory-recorder.ts | 6 +++---
 4 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/packages/core/lib/v3/handlers/v3AgentHandler.ts b/packages/core/lib/v3/handlers/v3AgentHandler.ts
index d0308bdd8a..afddddef22 100644
--- a/packages/core/lib/v3/handlers/v3AgentHandler.ts
+++ b/packages/core/lib/v3/handlers/v3AgentHandler.ts
@@ -313,7 +313,7 @@ export class V3AgentHandler {
           // Emit step_finished_event per tool call. The TrajectoryRecorder
           // builds one Trajectory.Step per emission. tier-1 evidence (the
           // bytes the LLM consumed) is captured separately via an
-          // onStepFinish wrapper in the harness (plan §10 Q1).
+          // onStepFinish wrapper in the harness.
           const stepIndex = stepCounter++;
           stepIndicesInTurn.push(stepIndex);
           const toolOk =
diff --git a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts
index f1dd2666e6..2fd08b8647 100644
--- a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts
+++ b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts
@@ -42,7 +42,7 @@ export class V3CuaAgentHandler {
   // Monotonic step counter used by bus events. The CUA loop is internal to
   // the agent client, so unlike v3AgentHandler we don't have per-tool-call
   // step events; instead we tag every screenshot emission with an
-  // incrementing index. Wave 1 may add finer-grained step events here.
+  // incrementing index.
   private cuaStepCounter = 0;
   private latestCuaScreenshot?: AgentScreenshotTakenEvent;
   private latestCuaScreenshotConsumed = true;
@@ -89,7 +89,7 @@ export class V3CuaAgentHandler {
       // Emit bus event so TrajectoryRecorder can capture the screenshot. In
       // CUA mode this is the same buffer the provider receives — i.e., it
       // serves both as tier-1 evidence (what the model saw) and as a tier-2
-      // probe. See plan §04 "Mode-by-mode sources".
+      // probe.
       try {
         this.emitCuaScreenshot(screenshotBuffer, page.url());
       } catch {
diff --git a/packages/evals/framework/trajectoryRecorder.ts b/packages/evals/framework/trajectoryRecorder.ts
index 5a8a62f1dc..d7c4d62ab4 100644
--- a/packages/evals/framework/trajectoryRecorder.ts
+++ b/packages/evals/framework/trajectoryRecorder.ts
@@ -9,15 +9,13 @@
  *   await agent.execute(...);
  *   const trajectory = await recorder.finish({ status: "complete", usage });
  *
- * Persistence is env-gated by `VERIFIER_PERSIST_TRAJECTORIES` (plan §10 Q2):
+ * Persistence is env-gated by `VERIFIER_PERSIST_TRAJECTORIES`:
  *   - unset: persistence follows the default (on locally, off in CI).
  *   - "1" / "true": always persist.
  *   - "0" / "false": never persist.
  *
  * On-disk layout is stable JSON + screenshots so saved runs can be re-scored
  * without format conversion.
- *
- * @see ~/.claude/plans/verifier-rewrite.html §06 (Trajectory on-disk)
  */
 import fs from "node:fs/promises";
 import path from "node:path";
@@ -451,8 +449,7 @@ function mergeAgentEvidence(
 /**
  * Build a tier-1 AgentEvidence from a step_finished event. The handler's
  * toolOutput.result is what the LLM consumed next turn (modulo SDK
- * serialization). Wave 1 will replace this with a higher-fidelity capture
- * pulled from event.response.messages.
+ * serialization).
  */
 function buildAgentEvidence(e: AgentStepFinishedEvent): AgentEvidence {
   const modalities: AgentEvidence["modalities"] = [];
diff --git a/packages/evals/scripts/verify-trajectory-recorder.ts b/packages/evals/scripts/verify-trajectory-recorder.ts
index 049b96c706..c2df86fd15 100644
--- a/packages/evals/scripts/verify-trajectory-recorder.ts
+++ b/packages/evals/scripts/verify-trajectory-recorder.ts
@@ -1,5 +1,5 @@
 /**
- * Wave 0 smoke test — verifies the TrajectoryRecorder plumbing end-to-end
+ * Smoke test — verifies the TrajectoryRecorder plumbing end-to-end
  * without launching a browser or calling an LLM.
  *
  * Drives a fake V3 (just an EventEmitter-shaped `bus`) through the same bus
@@ -217,12 +217,12 @@ async function main(): Promise<void> {
     "  ✓ V3Evaluator still constructs (verify() exercised live elsewhere)",
   );
 
-  console.log("\n✅ Wave 0 plumbing OK");
+  console.log("\n✅ Trajectory recorder plumbing OK");
   await fs.rm(tmpRoot, { recursive: true, force: true });
 }
 
 main().catch((err) => {
-  console.error("\n❌ Wave 0 plumbing FAILED:", err);
+  console.error("\n❌ Trajectory recorder plumbing FAILED:", err);
   process.exit(1);
 });
 

From bb514e3ab5a8f42970fc5bdf9ef5c423281969db Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Fri, 15 May 2026 22:31:14 -0700
Subject: [PATCH 05/27] test(evals): cover trajectory recorder in vitest

---
 .../evals/framework/trajectoryRecorder.ts     |  16 +-
 .../scripts/verify-trajectory-recorder.ts     | 230 ------------------
 .../framework/trajectoryRecorder.test.ts      | 197 +++++++++++++++
 3 files changed, 205 insertions(+), 238 deletions(-)
 delete mode 100644 packages/evals/scripts/verify-trajectory-recorder.ts
 create mode 100644 packages/evals/tests/framework/trajectoryRecorder.test.ts

diff --git a/packages/evals/framework/trajectoryRecorder.ts b/packages/evals/framework/trajectoryRecorder.ts
index d7c4d62ab4..8895a08443 100644
--- a/packages/evals/framework/trajectoryRecorder.ts
+++ b/packages/evals/framework/trajectoryRecorder.ts
@@ -31,7 +31,7 @@ import type {
   TrajectoryStatus,
   TrajectoryStep,
   TrajectoryUsage,
-  Verdict,
+  EvaluationResult,
   V3,
 } from "@browserbasehq/stagehand";
 
@@ -239,12 +239,12 @@ export class TrajectoryRecorder {
   }
 
   /**
-   * Persist verifier scores next to the trajectory. No-op when trajectory
+   * Persist evaluator result next to the trajectory. No-op when trajectory
    * persistence is disabled.
    */
-  async persistVerdict(
-    verdict: Verdict,
-    filename = "mmrubric_v1.json",
+  async persistResult(
+    result: EvaluationResult,
+    filename = "result.json",
   ): Promise<void> {
     if (!this.persistEnabled) return;
 
@@ -252,7 +252,7 @@ export class TrajectoryRecorder {
     await fs.mkdir(scoresDir, { recursive: true });
     await fs.writeFile(
       path.join(scoresDir, filename),
-      JSON.stringify(verdict, null, 2),
+      JSON.stringify(result, null, 2),
     );
 
     const taskDataPath = path.join(this.outputDir, "task_data.json");
@@ -267,7 +267,7 @@ export class TrajectoryRecorder {
     }
     await fs.writeFile(
       taskDataPath,
-      JSON.stringify({ ...taskData, verdict }, null, 2),
+      JSON.stringify({ ...taskData, result }, null, 2),
     );
   }
 
@@ -403,7 +403,7 @@ export class TrajectoryRecorder {
       JSON.stringify(serialized, null, 2),
     );
 
-    // task_data.json stores TaskSpec + (later) verdict.
+    // task_data.json stores TaskSpec + (later) result.
     await fs.writeFile(
       path.join(this.outputDir, "task_data.json"),
       JSON.stringify(
diff --git a/packages/evals/scripts/verify-trajectory-recorder.ts b/packages/evals/scripts/verify-trajectory-recorder.ts
deleted file mode 100644
index c2df86fd15..0000000000
--- a/packages/evals/scripts/verify-trajectory-recorder.ts
+++ /dev/null
@@ -1,230 +0,0 @@
-/**
- * Smoke test — verifies the TrajectoryRecorder plumbing end-to-end
- * without launching a browser or calling an LLM.
- *
- * Drives a fake V3 (just an EventEmitter-shaped `bus`) through the same bus
- * events the real agent handlers emit, then asserts:
- *   1. The recorder assembles a Trajectory with the expected step shape.
- *   2. The persisted directory layout has the expected verifier files.
- *   3. V3Evaluator.verify() returns a parseable stub Verdict.
- *
- * Run via:  pnpm tsx packages/evals/scripts/verify-trajectory-recorder.ts
- */
-import assert from "node:assert/strict";
-import fs from "node:fs/promises";
-import os from "node:os";
-import path from "node:path";
-import { EventEmitter } from "node:events";
-
-import { TrajectoryRecorder } from "../framework/trajectoryRecorder.js";
-import { V3Evaluator } from "@browserbasehq/stagehand";
-import type { TaskSpec, V3 } from "@browserbasehq/stagehand";
-
-interface FakeV3 {
-  bus: EventEmitter;
-}
-
-async function main(): Promise<void> {
-  const tmpRoot = await fs.mkdtemp(
-    path.join(os.tmpdir(), "verifier-rewrite-smoke-"),
-  );
-  console.log(`▸ tmpdir: ${tmpRoot}`);
-
-  const bus = new EventEmitter();
-  const v3 = { bus } as unknown as V3;
-  const taskSpec: TaskSpec = {
-    id: "smoke-united_13",
-    instruction:
-      "What is the price difference between economy and business class on United?",
-    initUrl: "https://www.google.com",
-    precomputedRubric: {
-      items: [
-        {
-          criterion: "Identify correct route",
-          description: "Agent identifies United CHI→GRU flight.",
-          maxPoints: 2,
-        },
-        {
-          criterion: "Report price delta",
-          description: "Agent reports economy↔business price delta.",
-          maxPoints: 3,
-        },
-      ],
-    },
-    expectedAnswer: "Approximately $4,000 difference.",
-  };
-
-  const recorder = new TrajectoryRecorder({
-    v3,
-    taskSpec,
-    outputRoot: tmpRoot,
-    runId: "smoke-run",
-    persist: true,
-  });
-  recorder.start();
-
-  // Emit a three-step synthetic trajectory.
-  bus.emit("agent_step_finished_event", {
-    stepIndex: 0,
-    actionName: "goto",
-    actionArgs: { url: "https://united.com" },
-    reasoning: "Open United Airlines homepage.",
-    toolOutput: { ok: true, result: { url: "https://united.com" } },
-    finishedAt: new Date().toISOString(),
-  });
-  bus.emit("agent_screenshot_taken_event", {
-    stepIndex: 0,
-    screenshot: Buffer.from("fake-png-bytes-0"),
-    url: "https://united.com",
-    evidenceRole: "agent_and_probe",
-  });
-  bus.emit("agent_step_observed_event", {
-    stepIndex: 0,
-    url: "https://united.com",
-  });
-
-  bus.emit("agent_step_finished_event", {
-    stepIndex: 1,
-    actionName: "act",
-    actionArgs: { instruction: "Search Chicago to São Paulo, Nov 24" },
-    reasoning: "Enter route and dates.",
-    toolOutput: {
-      ok: true,
-      result: { success: true, describe: "Filled route + dates" },
-    },
-    finishedAt: new Date().toISOString(),
-  });
-  bus.emit("agent_screenshot_taken_event", {
-    stepIndex: 1,
-    screenshot: Buffer.from("fake-png-bytes-1"),
-    url: "https://united.com/search",
-  });
-  bus.emit("agent_step_observed_event", {
-    stepIndex: 1,
-    url: "https://united.com/search",
-  });
-
-  bus.emit("agent_step_finished_event", {
-    stepIndex: 2,
-    actionName: "extract",
-    actionArgs: { instruction: "extract fare cells" },
-    reasoning: "Read economy and business fares from the results page.",
-    toolOutput: {
-      ok: true,
-      result: { economy: "$1,234", business: "$5,789" },
-    },
-    finishedAt: new Date().toISOString(),
-  });
-  bus.emit("agent_screenshot_taken_event", {
-    stepIndex: 2,
-    screenshot: Buffer.from("fake-png-bytes-2"),
-    url: "https://united.com/results",
-  });
-  bus.emit("agent_step_observed_event", {
-    stepIndex: 2,
-    url: "https://united.com/results",
-    ariaTree:
-      "[0-1] RootWebArea: United Search Results\n  [0-3] heading: Flight 1234\n    [0-4] StaticText: Economy $1,234\n    [0-5] StaticText: Business $5,789",
-  });
-
-  bus.emit("agent_final_answer_event", {
-    message: "Economy $1,234 vs business $5,789 — delta $4,555.",
-  });
-
-  const trajectory = await recorder.finish({
-    status: "complete",
-    usage: { input_tokens: 1234, output_tokens: 567 },
-  });
-
-  // ── Assertions ──────────────────────────────────────────────────────────
-  assert.equal(trajectory.steps.length, 3, "expected 3 steps");
-  assert.equal(trajectory.steps[0].actionName, "goto");
-  assert.equal(trajectory.steps[1].actionName, "act");
-  assert.equal(trajectory.steps[2].actionName, "extract");
-  assert.ok(
-    trajectory.steps[0].agentEvidence.modalities.some(
-      (m) => m.type === "image",
-    ),
-    "CUA-style screenshot event should populate tier-1 image evidence",
-  );
-  assert.ok(
-    trajectory.steps[2].agentEvidence.modalities.some(
-      (m) =>
-        m.type === "json" &&
-        typeof m.content === "object" &&
-        m.content !== null &&
-        "economy" in (m.content as Record<string, unknown>),
-    ),
-    "extract step should carry a json modality with economy field",
-  );
-  assert.equal(
-    trajectory.finalAnswer,
-    "Economy $1,234 vs business $5,789 — delta $4,555.",
-  );
-  assert.equal(trajectory.status, "complete");
-  assert.equal(trajectory.usage.input_tokens, 1234);
-  // a11y dump on step 2 should round-trip through the recorder into
-  // probeEvidence.ariaTree.
-  assert.ok(
-    trajectory.steps[2].probeEvidence.ariaTree?.includes("Economy $1,234"),
-    "step_observed.ariaTree should populate probeEvidence.ariaTree",
-  );
-  console.log("  ✓ in-memory Trajectory shape (incl. ariaTree round-trip)");
-
-  // ── On-disk layout ──────────────────────────────────────────────────────
-  const taskDir = path.join(tmpRoot, "smoke-run", "smoke-united_13");
-  const files = (await fs.readdir(taskDir)).sort();
-  assert.deepEqual(
-    files,
-    [
-      "core.log",
-      "scores",
-      "screenshots",
-      "task_data.json",
-      "times.json",
-      "trajectory.json",
-    ],
-    `expected new trajectory layout, got ${files.join(", ")}`,
-  );
-  const probeFiles = (
-    await fs.readdir(path.join(taskDir, "screenshots", "probe"))
-  ).sort();
-  assert.deepEqual(
-    probeFiles,
-    ["1.png", "2.png", "3.png"],
-    `expected probe screenshots, got ${probeFiles.join(", ")}`,
-  );
-  const screenshotBytes = await fs.readFile(
-    path.join(taskDir, "screenshots", "probe", "1.png"),
-  );
-  assert.equal(screenshotBytes.toString(), "fake-png-bytes-0");
-  const coreLog = await fs.readFile(path.join(taskDir, "core.log"), "utf8");
-  assert.ok(coreLog.includes('"action":"goto"'));
-  console.log("  ✓ on-disk layout has expected verifier files");
-
-  const persistedTask = JSON.parse(
-    await fs.readFile(path.join(taskDir, "task_data.json"), "utf8"),
-  );
-  assert.equal(persistedTask.task.id, "smoke-united_13");
-  assert.equal(persistedTask.status, "complete");
-
-  // ── V3Evaluator.verify() exercised live in verify-live-trajectory.ts ──
-  // Sanity-check that the V3Evaluator class still constructs from a minimal
-  // V3 shape (recorder doesn't depend on the evaluator for plumbing).
-  const _unused: typeof V3Evaluator = V3Evaluator;
-  void _unused;
-  console.log(
-    "  ✓ V3Evaluator still constructs (verify() exercised live elsewhere)",
-  );
-
-  console.log("\n✅ Trajectory recorder plumbing OK");
-  await fs.rm(tmpRoot, { recursive: true, force: true });
-}
-
-main().catch((err) => {
-  console.error("\n❌ Trajectory recorder plumbing FAILED:", err);
-  process.exit(1);
-});
-
-// Type guard for FakeV3 lint suppression (the file uses `as unknown as V3`).
-export type { FakeV3 };
diff --git a/packages/evals/tests/framework/trajectoryRecorder.test.ts b/packages/evals/tests/framework/trajectoryRecorder.test.ts
new file mode 100644
index 0000000000..5c5268e66a
--- /dev/null
+++ b/packages/evals/tests/framework/trajectoryRecorder.test.ts
@@ -0,0 +1,197 @@
+import { EventEmitter } from "node:events";
+import fs from "node:fs/promises";
+import os from "node:os";
+import path from "node:path";
+
+import { afterEach, describe, expect, it } from "vitest";
+import type { TaskSpec, V3 } from "@browserbasehq/stagehand";
+
+import { TrajectoryRecorder } from "../../framework/trajectoryRecorder.js";
+
+const tempDirs: string[] = [];
+
+afterEach(async () => {
+  while (tempDirs.length > 0) {
+    const dir = tempDirs.pop();
+    if (dir) await fs.rm(dir, { recursive: true, force: true });
+  }
+});
+
+function makeTempDir(): Promise<string> {
+  return fs
+    .mkdtemp(path.join(os.tmpdir(), "trajectory-recorder-"))
+    .then((dir) => {
+      tempDirs.push(dir);
+      return dir;
+    });
+}
+
+function makeV3(bus = new EventEmitter()): V3 {
+  return { bus } as unknown as V3;
+}
+
+function makeTaskSpec(): TaskSpec {
+  return {
+    id: "recorder-task",
+    instruction: "Compare economy and business fares.",
+    initUrl: "https://example.com",
+    precomputedRubric: {
+      items: [
+        {
+          criterion: "Report fare delta",
+          description: "Report the difference between two fares.",
+          maxPoints: 1,
+        },
+      ],
+    },
+  };
+}
+
+describe("TrajectoryRecorder", () => {
+  it("assembles trajectory evidence from bus events", async () => {
+    const bus = new EventEmitter();
+    const recorder = new TrajectoryRecorder({
+      v3: makeV3(bus),
+      taskSpec: makeTaskSpec(),
+      persist: false,
+    });
+    const screenshot = Buffer.from("screen-1");
+
+    recorder.start();
+    bus.emit("agent_screenshot_taken_event", {
+      stepIndex: 0,
+      screenshot,
+      url: "https://example.com/search",
+      evidenceRole: "agent_and_probe",
+    });
+    bus.emit("agent_step_finished_event", {
+      stepIndex: 0,
+      actionName: "extract",
+      actionArgs: { instruction: "Read fares" },
+      reasoning: "Read visible fare cells.",
+      toolOutput: {
+        ok: true,
+        result: { economy: "$100", business: "$250" },
+      },
+      finishedAt: new Date(0).toISOString(),
+    });
+    bus.emit("agent_step_observed_event", {
+      stepIndex: 0,
+      url: "https://example.com/search",
+      ariaTree: "RootWebArea\nStaticText: Economy $100",
+    });
+    bus.emit("agent_final_answer_event", {
+      message: "Business is $150 more than economy.",
+    });
+
+    const trajectory = await recorder.finish({
+      status: "complete",
+      usage: { input_tokens: 10, output_tokens: 5 },
+    });
+
+    expect(trajectory.steps).toHaveLength(1);
+    expect(trajectory.steps[0]).toMatchObject({
+      index: 0,
+      actionName: "extract",
+      actionArgs: { instruction: "Read fares" },
+      reasoning: "Read visible fare cells.",
+      toolOutput: {
+        ok: true,
+        result: { economy: "$100", business: "$250" },
+      },
+      probeEvidence: {
+        url: "https://example.com/search",
+        ariaTree: "RootWebArea\nStaticText: Economy $100",
+      },
+    });
+    expect(trajectory.steps[0].probeEvidence.screenshot).toEqual(screenshot);
+    expect(trajectory.steps[0].agentEvidence.modalities).toEqual(
+      expect.arrayContaining([
+        { type: "image", bytes: screenshot, mediaType: "image/png" },
+        { type: "text", content: "Read visible fare cells." },
+        { type: "json", content: { economy: "$100", business: "$250" } },
+      ]),
+    );
+    expect(trajectory.finalAnswer).toBe("Business is $150 more than economy.");
+  });
+
+  it("persists trajectory files and evaluator results", async () => {
+    const outputRoot = await makeTempDir();
+    const bus = new EventEmitter();
+    const recorder = new TrajectoryRecorder({
+      v3: makeV3(bus),
+      taskSpec: makeTaskSpec(),
+      outputRoot,
+      runId: "run-1",
+      persist: true,
+    });
+    const screenshot = Buffer.from("screen-1");
+
+    recorder.start();
+    bus.emit("agent_screenshot_taken_event", {
+      stepIndex: 0,
+      screenshot,
+      url: "https://example.com/search",
+      evidenceRole: "agent_and_probe",
+    });
+    bus.emit("agent_step_finished_event", {
+      stepIndex: 0,
+      actionName: "act",
+      actionArgs: { instruction: "Search fares" },
+      reasoning: "Search for fares.",
+      toolOutput: { ok: true, result: "done" },
+      finishedAt: new Date(0).toISOString(),
+    });
+    bus.emit("agent_step_observed_event", {
+      stepIndex: 0,
+      url: "https://example.com/search",
+    });
+
+    await recorder.finish({ status: "complete" });
+    await recorder.persistResult({
+      outcomeSuccess: true,
+      explanation: "The task was completed.",
+    });
+
+    const taskDir = path.join(outputRoot, "run-1", "recorder-task");
+    await expect(fs.readdir(taskDir)).resolves.toEqual(
+      expect.arrayContaining([
+        "core.log",
+        "scores",
+        "screenshots",
+        "task_data.json",
+        "times.json",
+        "trajectory.json",
+      ]),
+    );
+    await expect(
+      fs.readFile(path.join(taskDir, "screenshots", "probe", "1.png")),
+    ).resolves.toEqual(screenshot);
+    await expect(
+      fs.readFile(path.join(taskDir, "screenshots", "agent", "1.png")),
+    ).resolves.toEqual(screenshot);
+    await expect(
+      fs.readFile(path.join(taskDir, "scores", "result.json"), "utf8"),
+    ).resolves.toContain('"outcomeSuccess": true');
+
+    const trajectory = JSON.parse(
+      await fs.readFile(path.join(taskDir, "trajectory.json"), "utf8"),
+    );
+    expect(trajectory.steps[0].probeEvidence.screenshotPath).toBe(
+      "screenshots/probe/1.png",
+    );
+    expect(trajectory.steps[0].agentEvidence.modalities).toContainEqual({
+      type: "image",
+      imagePath: "screenshots/agent/1.png",
+      mediaType: "image/png",
+    });
+
+    const taskData = JSON.parse(
+      await fs.readFile(path.join(taskDir, "task_data.json"), "utf8"),
+    );
+    expect(taskData.result).toMatchObject({
+      outcomeSuccess: true,
+      explanation: "The task was completed.",
+    });
+  });
+});

From 9138ddf88a514c67d37bdfe444e8c7135f549e26 Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Fri, 15 May 2026 22:48:53 -0700
Subject: [PATCH 06/27] docs(verifier): trim trajectory event comments

---
 .../core/lib/v3/agent/AnthropicCUAClient.ts   |  4 ----
 .../core/lib/v3/types/public/busEvents.ts     | 19 +++++--------------
 2 files changed, 5 insertions(+), 18 deletions(-)

diff --git a/packages/core/lib/v3/agent/AnthropicCUAClient.ts b/packages/core/lib/v3/agent/AnthropicCUAClient.ts
index 54d64f15d0..752d208e22 100644
--- a/packages/core/lib/v3/agent/AnthropicCUAClient.ts
+++ b/packages/core/lib/v3/agent/AnthropicCUAClient.ts
@@ -902,10 +902,6 @@ export class AnthropicCUAClient extends AgentClient {
             ...input,
           };
         } else if (action === "triple_click" || action === "tripleClick") {
-          // Anthropic's computer_20250124 tool emits `triple_click` with
-          // `coordinate: [x, y]`. Without this branch the snake_case name +
-          // raw coordinate array fall through to the generic `else` and
-          // executeAction logs "Unknown action type: triple_click".
           return {
             type: "tripleClick",
             x:
diff --git a/packages/core/lib/v3/types/public/busEvents.ts b/packages/core/lib/v3/types/public/busEvents.ts
index 62e9929492..e2fa119499 100644
--- a/packages/core/lib/v3/types/public/busEvents.ts
+++ b/packages/core/lib/v3/types/public/busEvents.ts
@@ -5,16 +5,8 @@
  * event name so consumers (TrajectoryRecorder in packages/evals, custom
  * subscribers) can type their handlers.
  *
- * Wave 0 of the verifier rewrite plan introduces:
- *   - agent_screenshot_taken_event    — independent post-step screenshot probe
- *   - agent_step_finished_event       — fired per tool-call in a step result
- *   - agent_step_observed_event       — fired after the harness probe completes
- *   - agent_final_answer_event        — fired when the `done` tool resolves
- *
- * `agent_step_started_event` is documented in the plan but deferred — the AI
- * SDK's `onStepFinish` is a post-hook, and there's no symmetric pre-hook per
- * tool execution in v3AgentHandler today. Started-state can be derived from
- * the finished event's stepIndex if needed.
+ * The verifier recorder consumes these events to assemble persisted
+ * trajectories without coupling to individual agent handlers.
  */
 
 /**
@@ -37,7 +29,7 @@ export type BusEventName = (typeof BUS_EVENTS)[keyof typeof BUS_EVENTS];
  *
  * Note: in CUA mode the same Buffer is also what the provider received; in
  * DOM/hybrid mode it's an independent harness probe. The verifier treats them
- * as different evidence tiers regardless — see plan §04 ("Mode-by-mode sources").
+ * as different evidence tiers regardless.
  */
 export interface AgentScreenshotTakenEvent {
   /** Zero-based index of the step this screenshot corresponds to. */
@@ -63,7 +55,7 @@ export interface AgentScreenshotTakenEvent {
  *
  * Tier 1 evidence (the bytes the LLM consumed as the tool result) is captured
  * separately by the harness via an AgentExecuteCallbacks.onStepFinish wrapper
- * — not in this payload. See plan §10 Q1 (resolved: onStepFinish).
+ * and is not part of this payload.
  */
 export interface AgentStepFinishedEvent {
   stepIndex: number;
@@ -86,8 +78,7 @@ export interface AgentStepFinishedEvent {
 
 /**
  * Payload for `agent_step_observed_event`. Emitted after the harness probe
- * completes for a step (page URL captured at minimum; a11y tree and scroll
- * info added in Wave 2).
+ * completes for a step.
  */
 export interface AgentStepObservedEvent {
   stepIndex: number;

From 1303315cc91c69ed8880f65961638ad0a0cc4c32 Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Mon, 18 May 2026 17:00:45 -0700
Subject: [PATCH 07/27] refactor(verifier): extract writeTrajectoryDir +
 shouldPersistTrajectory

Lift the on-disk persistence helpers from TrajectoryRecorder into
verifier/trajectory.ts so #2137's harness adapter can share them. Also
drop the recorder's no-op .replace("T","T") and the WHAT-narration
comments per project policy.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 packages/core/lib/v3/index.ts                 |   6 +
 packages/core/lib/v3/verifier/index.ts        |   2 +
 packages/core/lib/v3/verifier/trajectory.ts   | 137 +++++++++++
 .../evals/framework/trajectoryRecorder.ts     | 214 ++----------------
 4 files changed, 165 insertions(+), 194 deletions(-)

diff --git a/packages/core/lib/v3/index.ts b/packages/core/lib/v3/index.ts
index 8e21fb0309..a5cbccf746 100644
--- a/packages/core/lib/v3/index.ts
+++ b/packages/core/lib/v3/index.ts
@@ -28,6 +28,8 @@ import {
   loadTrajectoryFromDisk,
   nextResultFilename,
   normalizeRubric,
+  shouldPersistTrajectory,
+  writeTrajectoryDir,
 } from "./verifier/index.js";
 
 export { V3 } from "./v3.js";
@@ -93,6 +95,8 @@ export {
   loadTrajectoryFromDisk,
   nextResultFilename,
   normalizeRubric,
+  shouldPersistTrajectory,
+  writeTrajectoryDir,
 } from "./verifier/index.js";
 export { tool } from "ai";
 export { getAISDKLanguageModel } from "./llm/LLMProvider.js";
@@ -147,6 +151,8 @@ const StagehandDefault = {
   loadTrajectoryFromDisk,
   nextResultFilename,
   normalizeRubric,
+  shouldPersistTrajectory,
+  writeTrajectoryDir,
   tool,
   getAISDKLanguageModel,
   __internalCreateInMemoryAgentCacheHandle,
diff --git a/packages/core/lib/v3/verifier/index.ts b/packages/core/lib/v3/verifier/index.ts
index 4061533ab9..2b14cfb16a 100644
--- a/packages/core/lib/v3/verifier/index.ts
+++ b/packages/core/lib/v3/verifier/index.ts
@@ -25,4 +25,6 @@ export {
   loadTrajectoryFromDisk,
   nextResultFilename,
   normalizeRubric,
+  shouldPersistTrajectory,
+  writeTrajectoryDir,
 } from "./trajectory.js";
diff --git a/packages/core/lib/v3/verifier/trajectory.ts b/packages/core/lib/v3/verifier/trajectory.ts
index a18f025c37..ec602d04d0 100644
--- a/packages/core/lib/v3/verifier/trajectory.ts
+++ b/packages/core/lib/v3/verifier/trajectory.ts
@@ -1,3 +1,5 @@
+import fs from "node:fs/promises";
+import path from "node:path";
 import type {
   AgentEvidenceModality,
   ProbeEvidence,
@@ -187,3 +189,138 @@ export async function loadTrajectoryFromDisk(dir: string): Promise<Trajectory> {
 export function nextResultFilename(label?: string): string {
   return `result_${normalizeResultLabel(label)}.json`;
 }
+
+/**
+ * Default persistence policy: explicit override, then env, then "on unless CI".
+ */
+export function shouldPersistTrajectory(
+  override: boolean | undefined,
+): boolean {
+  if (override !== undefined) return override;
+  const env = process.env.VERIFIER_PERSIST_TRAJECTORIES?.toLowerCase();
+  if (env === "1" || env === "true") return true;
+  if (env === "0" || env === "false") return false;
+  return !process.env.CI;
+}
+
+/**
+ * Write the on-disk trajectory layout under `dir`:
+ *
+ *   <dir>/
+ *     ├── task_data.json
+ *     ├── trajectory.json    (screenshots referenced by path)
+ *     ├── screenshots/
+ *     │   ├── probe/<N>.png
+ *     │   └── agent/<N>[_M].png
+ *     ├── times.json
+ *     ├── scores/            (empty; populated separately)
+ *     └── core.log
+ *
+ * Image bytes are externalized to PNG files; the in-memory Trajectory is left
+ * untouched so callers can keep using it after persistence.
+ */
+export async function writeTrajectoryDir(
+  dir: string,
+  trajectory: Trajectory,
+): Promise<void> {
+  await fs.mkdir(dir, { recursive: true });
+  await fs.mkdir(path.join(dir, "screenshots", "probe"), { recursive: true });
+  await fs.mkdir(path.join(dir, "screenshots", "agent"), { recursive: true });
+
+  const serializableSteps: unknown[] = [];
+  for (const step of trajectory.steps) {
+    const probe: ProbeEvidence = { ...step.probeEvidence };
+    if (probe.screenshot) {
+      const relPath = `screenshots/probe/${step.index + 1}.png`;
+      await fs.writeFile(path.join(dir, relPath), probe.screenshot);
+      probe.screenshotPath = relPath;
+      delete probe.screenshot;
+    }
+
+    const imageModalities = step.agentEvidence.modalities.filter(
+      (m) => m.type === "image",
+    );
+    const multipleImages = imageModalities.length > 1;
+    let imageSeq = 0;
+    const modalities: unknown[] = [];
+    for (const m of step.agentEvidence.modalities) {
+      if (m.type !== "image") {
+        modalities.push(m);
+        continue;
+      }
+      const suffix = multipleImages ? `_${imageSeq}` : "";
+      const relPath = `screenshots/agent/${step.index + 1}${suffix}.png`;
+      await fs.writeFile(path.join(dir, relPath), m.bytes);
+      modalities.push({
+        type: "image",
+        imagePath: relPath,
+        mediaType: m.mediaType,
+      });
+      imageSeq += 1;
+    }
+    serializableSteps.push({
+      ...step,
+      probeEvidence: probe,
+      agentEvidence: { modalities },
+    });
+  }
+
+  // Image modalities carry imagePath instead of raw bytes on disk; cast
+  // through unknown rather than widen Trajectory's type contract.
+  const serialized = {
+    ...trajectory,
+    steps: serializableSteps,
+  } as unknown;
+
+  await fs.writeFile(
+    path.join(dir, "trajectory.json"),
+    JSON.stringify(serialized, null, 2),
+  );
+
+  await fs.writeFile(
+    path.join(dir, "task_data.json"),
+    JSON.stringify(
+      {
+        task: trajectory.task,
+        status: trajectory.status,
+        finalAnswer: trajectory.finalAnswer ?? null,
+      },
+      null,
+      2,
+    ),
+  );
+
+  await fs.writeFile(
+    path.join(dir, "times.json"),
+    JSON.stringify(
+      {
+        timing: trajectory.timing,
+        usage: trajectory.usage,
+        stepCount: trajectory.steps.length,
+      },
+      null,
+      2,
+    ),
+  );
+
+  await fs.mkdir(path.join(dir, "scores"), { recursive: true });
+  await fs.writeFile(path.join(dir, "core.log"), coreLog(trajectory));
+}
+
+function coreLog(trajectory: Trajectory): string {
+  return (
+    trajectory.steps
+      .map((step) =>
+        JSON.stringify({
+          step: step.index,
+          action: step.actionName,
+          url: step.probeEvidence.url ?? null,
+          ok: step.toolOutput.ok,
+          reasoning: step.reasoning || undefined,
+          startedAt: step.startedAt,
+          finishedAt: step.finishedAt,
+        }),
+      )
+      .join("\n") + "\n"
+  );
+}
diff --git a/packages/evals/framework/trajectoryRecorder.ts b/packages/evals/framework/trajectoryRecorder.ts
index 8895a08443..91c7b42987 100644
--- a/packages/evals/framework/trajectoryRecorder.ts
+++ b/packages/evals/framework/trajectoryRecorder.ts
@@ -1,24 +1,9 @@
-/**
- * TrajectoryRecorder — subscribes to v3.bus step events emitted by the agent
- * handlers (v3AgentHandler / v3CuaAgentHandler) and assembles a Trajectory
- * the verifier can consume.
- *
- * Lifecycle:
- *   const recorder = new TrajectoryRecorder({ v3, taskSpec });
- *   recorder.start();
- *   await agent.execute(...);
- *   const trajectory = await recorder.finish({ status: "complete", usage });
- *
- * Persistence is env-gated by `VERIFIER_PERSIST_TRAJECTORIES`:
- *   - unset: persistence follows the default (on locally, off in CI).
- *   - "1" / "true": always persist.
- *   - "0" / "false": never persist.
- *
- * On-disk layout is stable JSON + screenshots so saved runs can be re-scored
- * without format conversion.
- */
 import fs from "node:fs/promises";
 import path from "node:path";
+import {
+  shouldPersistTrajectory,
+  writeTrajectoryDir,
+} from "@browserbasehq/stagehand";
 import type {
   AgentEvidence,
   AgentFinalAnswerEvent,
@@ -75,18 +60,6 @@ const ZERO_USAGE: TrajectoryUsage = {
   output_tokens: 0,
 };
 
-/**
- * Decide whether to persist by default. Honors the explicit override first,
- * then env, then falls back to "persist when not in CI".
- */
-function shouldPersist(override: boolean | undefined): boolean {
-  if (override !== undefined) return override;
-  const env = process.env.VERIFIER_PERSIST_TRAJECTORIES?.toLowerCase();
-  if (env === "1" || env === "true") return true;
-  if (env === "0" || env === "false") return false;
-  return !process.env.CI;
-}
-
 export class TrajectoryRecorder {
   private readonly v3: V3;
   private readonly taskSpec: TaskSpec;
@@ -94,9 +67,8 @@ export class TrajectoryRecorder {
   private readonly outputDir: string;
   private readonly persistEnabled: boolean;
 
-  // Per-stepIndex builders; events can arrive out-of-order in theory, though
-  // the handlers emit step_finished → screenshot_taken → step_observed in the
-  // same microtask.
+  // Events can arrive out-of-order across step indices; same-step events all
+  // fire in one microtask.
   private readonly partialSteps = new Map<number, Partial<PartialStep>>();
   private readonly observationByStep = new Map<
     number,
@@ -111,34 +83,30 @@ export class TrajectoryRecorder {
   private endedAt = "";
   private listenersAttached = false;
 
-  // Strongly-typed bound handlers so we can attach/detach the same references.
+  // Bound handlers so attach/detach refer to the same references.
   private readonly onScreenshot = (e: AgentScreenshotTakenEvent) => {
     this.screenshotsByStep.set(e.stepIndex, e);
     const partial = this.ensurePartial(e.stepIndex);
 
-    // Default to "probe" when the emit site doesn't tag the role — matches
-    // v3AgentHandler's post-step screenshot, which is always a tier-2 probe.
+    // Default to probe when the emit site doesn't tag a role: matches
+    // v3AgentHandler's post-step screenshot. For CUA the pre-action shot is
+    // NOT a probe — emitCuaActionStep fills that role post-action.
     const role = e.evidenceRole ?? "probe";
 
-    // Probe channel (tier 2): the page's state at observation time. For CUA
-    // the pre-action screenshot is NOT a probe — that role is filled by the
-    // post-action emit from emitCuaActionStep. So only update probe.screenshot
-    // when the event explicitly carries the probe role.
     if (role === "probe" || role === "agent_and_probe") {
       const probe: ProbeEvidence = { ...(partial.probeEvidence ?? {}) };
       probe.screenshot = e.screenshot;
       probe.url = e.url;
       partial.probeEvidence = probe;
     } else if (!partial.probeEvidence?.url) {
-      // Even for tier-1-only events, the URL is useful probe context if we
-      // don't have one yet. Doesn't overwrite a later post-action URL.
+      // Tier-1-only events seed the probe URL only when none is set yet; a
+      // later probe-role screenshot event still overwrites it.
       partial.probeEvidence = {
         ...(partial.probeEvidence ?? {}),
         url: e.url,
       };
     }
 
-    // Agent channel (tier 1): bytes the model ingested.
     if (role === "agent" || role === "agent_and_probe") {
       partial.agentEvidence = mergeAgentEvidence(partial.agentEvidence, {
         modalities: [
@@ -176,11 +144,10 @@ export class TrajectoryRecorder {
     this.v3 = opts.v3;
     this.taskSpec = opts.taskSpec;
     this.runId =
-      opts.runId ??
-      new Date().toISOString().replace(/[:.]/g, "-").replace("T", "T");
+      opts.runId ?? new Date().toISOString().replace(/[:.]/g, "-");
     const root = opts.outputRoot ?? path.join(process.cwd(), ".trajectories");
     this.outputDir = path.join(root, this.runId, opts.taskSpec.id);
-    this.persistEnabled = shouldPersist(opts.persist);
+    this.persistEnabled = shouldPersistTrajectory(opts.persist);
   }
 
   /** Subscribe to bus events. Call once before agent.execute(). */
@@ -213,7 +180,7 @@ export class TrajectoryRecorder {
     };
 
     if (this.persistEnabled) {
-      await this.persist(trajectory);
+      await writeTrajectoryDir(this.outputDir, trajectory);
     }
 
     return trajectory;
@@ -289,12 +256,6 @@ export class TrajectoryRecorder {
     return p;
   }
 
-  /**
-   * Materialize ordered TrajectoryStep[] from the accumulated partials.
-   * Steps that never received a step_finished event are skipped (they can
-   * appear for CUA where only screenshot events fire — those are recorded as
-   * orphan probe screenshots and elided here).
-   */
   private assembleSteps(): TrajectoryStep[] {
     const out: TrajectoryStep[] = [];
     const indices = [...this.partialSteps.keys()].sort((a, b) => a - b);
@@ -305,8 +266,8 @@ export class TrajectoryRecorder {
         p.toolOutput === undefined ||
         p.finishedAt === undefined
       ) {
-        // Orphan screenshot-only entry (typically CUA). Skip — we record
-        // these by writing the screenshot to disk separately during persist().
+        // CUA emits screenshot-only entries between actions; skip them. They
+        // never reach trajectory.steps, so they are not persisted to disk.
         continue;
       }
       out.push({
@@ -323,119 +284,6 @@ export class TrajectoryRecorder {
     }
     return out;
   }
-
-  /**
-   * Write the trajectory directory layout.
-   *
-   *   <outputDir>/
-   *     ├── task_data.json
-   *     ├── trajectory.json    (screenshots referenced by path)
-   *     ├── screenshots/
-   *     │   ├── probe/<N>.png
-   *     │   └── agent/<N>.png
-   *     └── times.json
-   */
-  private async persist(trajectory: Trajectory): Promise<void> {
-    await fs.mkdir(this.outputDir, { recursive: true });
-
-    // Walk steps and write screenshots; replace Buffer with path reference in
-    // the serialized trajectory. Both tiers externalize image bytes under
-    //   screenshots/probe/<N>.png   — tier 2, what the harness observed
-    //   screenshots/agent/<N>.png   — tier 1, what the model received
-    // The `_<j>` suffix only appears when a step carries multiple images
-    // (rare; typically zero or one per step). Paths in JSON are relative to
-    // the trajectory dir so the directory is movable/copyable as a unit.
-    await fs.mkdir(path.join(this.outputDir, "screenshots", "probe"), {
-      recursive: true,
-    });
-    await fs.mkdir(path.join(this.outputDir, "screenshots", "agent"), {
-      recursive: true,
-    });
-
-    const serializableSteps: unknown[] = [];
-    for (const step of trajectory.steps) {
-      const probe: ProbeEvidence = { ...step.probeEvidence };
-      if (probe.screenshot) {
-        const relPath = `screenshots/probe/${step.index + 1}.png`;
-        await fs.writeFile(
-          path.join(this.outputDir, relPath),
-          probe.screenshot,
-        );
-        probe.screenshotPath = relPath;
-        delete probe.screenshot;
-      }
-
-      const imageModalities = step.agentEvidence.modalities.filter(
-        (m) => m.type === "image",
-      );
-      const multipleImages = imageModalities.length > 1;
-      let imageSeq = 0;
-      const modalities: unknown[] = [];
-      for (const m of step.agentEvidence.modalities) {
-        if (m.type !== "image") {
-          modalities.push(m);
-          continue;
-        }
-        const suffix = multipleImages ? `_${imageSeq}` : "";
-        const relPath = `screenshots/agent/${step.index + 1}${suffix}.png`;
-        await fs.writeFile(path.join(this.outputDir, relPath), m.bytes);
-        modalities.push({
-          type: "image",
-          imagePath: relPath,
-          mediaType: m.mediaType,
-        });
-        imageSeq += 1;
-      }
-      const agentEvidence = { modalities };
-      serializableSteps.push({ ...step, probeEvidence: probe, agentEvidence });
-    }
-
-    // Image modalities carry imagePath instead of raw bytes on disk, so this
-    // is no longer a strict Trajectory at the type level. Cast through
-    // unknown rather than widening the type contract.
-    const serialized = {
-      ...trajectory,
-      steps: serializableSteps,
-    } as unknown;
-
-    await fs.writeFile(
-      path.join(this.outputDir, "trajectory.json"),
-      JSON.stringify(serialized, null, 2),
-    );
-
-    // task_data.json stores TaskSpec + (later) result.
-    await fs.writeFile(
-      path.join(this.outputDir, "task_data.json"),
-      JSON.stringify(
-        {
-          task: trajectory.task,
-          status: trajectory.status,
-          finalAnswer: trajectory.finalAnswer ?? null,
-        },
-        null,
-        2,
-      ),
-    );
-
-    await fs.writeFile(
-      path.join(this.outputDir, "times.json"),
-      JSON.stringify(
-        {
-          timing: trajectory.timing,
-          usage: trajectory.usage,
-          stepCount: trajectory.steps.length,
-        },
-        null,
-        2,
-      ),
-    );
-
-    await fs.mkdir(path.join(this.outputDir, "scores"), { recursive: true });
-    await fs.writeFile(
-      path.join(this.outputDir, "core.log"),
-      coreLog(trajectory),
-    );
-  }
 }
 
 function mergeAgentEvidence(
@@ -446,11 +294,6 @@ function mergeAgentEvidence(
   };
 }
 
-/**
- * Build a tier-1 AgentEvidence from a step_finished event. The handler's
- * toolOutput.result is what the LLM consumed next turn (modulo SDK
- * serialization).
- */
 function buildAgentEvidence(e: AgentStepFinishedEvent): AgentEvidence {
   const modalities: AgentEvidence["modalities"] = [];
   if (e.reasoning) {
@@ -469,7 +312,8 @@ function buildAgentEvidence(e: AgentStepFinishedEvent): AgentEvidence {
       mediaType: "image/png",
     });
   } else if (typeof result === "object") {
-    // Tool results commonly include a screenshotBase64 field for vision tools.
+    // Vision tools embed a screenshotBase64 alongside the JSON result; lift
+    // it to its own image modality so the verifier sees both.
     const r = result as { screenshotBase64?: string } & Record<string, unknown>;
     if (typeof r.screenshotBase64 === "string") {
       try {
@@ -479,28 +323,10 @@ function buildAgentEvidence(e: AgentStepFinishedEvent): AgentEvidence {
           mediaType: "image/png",
         });
       } catch {
-        // ignore
+        // Malformed base64; skip the image and keep the JSON modality.
       }
     }
     modalities.push({ type: "json", content: result });
   }
   return { modalities };
 }
-
-function coreLog(trajectory: Trajectory): string {
-  return (
-    trajectory.steps
-      .map((step) =>
-        JSON.stringify({
-          step: step.index,
-          action: step.actionName,
-          url: step.probeEvidence.url ?? null,
-          ok: step.toolOutput.ok,
-          reasoning: step.reasoning || undefined,
-          startedAt: step.startedAt,
-          finishedAt: step.finishedAt,
-        }),
-      )
-      .join("\n") + "\n"
-  );
-}

From 10b03ca7385f00b98c940a65bfdc68e8823d9531 Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Mon, 18 May 2026 17:45:32 -0700
Subject: [PATCH 08/27] =?UTF-8?q?style(recorder):=20prettier=20=E2=80=94?=
 =?UTF-8?q?=20collapse=20runId=20fallback=20onto=20one=20line?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 packages/evals/framework/trajectoryRecorder.ts | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/packages/evals/framework/trajectoryRecorder.ts b/packages/evals/framework/trajectoryRecorder.ts
index 91c7b42987..3cf1c17621 100644
--- a/packages/evals/framework/trajectoryRecorder.ts
+++ b/packages/evals/framework/trajectoryRecorder.ts
@@ -143,8 +143,7 @@ export class TrajectoryRecorder {
   constructor(opts: TrajectoryRecorderOptions) {
     this.v3 = opts.v3;
     this.taskSpec = opts.taskSpec;
-    this.runId =
-      opts.runId ?? new Date().toISOString().replace(/[:.]/g, "-");
+    this.runId = opts.runId ?? new Date().toISOString().replace(/[:.]/g, "-");
     const root = opts.outputRoot ?? path.join(process.cwd(), ".trajectories");
     this.outputDir = path.join(root, this.runId, opts.taskSpec.id);
     this.persistEnabled = shouldPersistTrajectory(opts.persist);

From 6caeb1bac5737ed640dac1833b813f9afb606fbf Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Mon, 18 May 2026 17:49:40 -0700
Subject: [PATCH 09/27] fix(verifier): guard bus.listenerCount and align
 export-surface snapshot
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- v3AgentHandler / v3CuaAgentHandler use optional-chained listenerCount
  so test mocks without one (captcha-hooks, temperature) don't blow up.
- Add bus stub to the agent-temperature createV3() mock so bus.emit
  doesn't throw a TypeError on the new agent_step_finished_event emit.
- Add BUS_EVENTS, shouldPersistTrajectory, writeTrajectoryDir to the
  export-surface snapshot — these are intentional new public exports.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 packages/core/lib/v3/handlers/v3AgentHandler.ts            | 4 ++--
 packages/core/lib/v3/handlers/v3CuaAgentHandler.ts         | 4 ++--
 packages/core/tests/unit/agent-temperature.test.ts         | 6 ++++++
 packages/core/tests/unit/public-api/export-surface.test.ts | 3 +++
 4 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/packages/core/lib/v3/handlers/v3AgentHandler.ts b/packages/core/lib/v3/handlers/v3AgentHandler.ts
index afddddef22..fc2761d902 100644
--- a/packages/core/lib/v3/handlers/v3AgentHandler.ts
+++ b/packages/core/lib/v3/handlers/v3AgentHandler.ts
@@ -351,9 +351,9 @@ export class V3AgentHandler {
         // is more faithful than dropping probe evidence for all but the last
         // tool call, while still avoiding per-tool screenshot overhead.
         const wantsScreenshotProbe =
-          this.v3.bus.listenerCount("agent_screenshot_taken_event") > 0;
+          this.v3.bus.listenerCount?.("agent_screenshot_taken_event") > 0;
         const wantsStepObservation =
-          this.v3.bus.listenerCount("agent_step_observed_event") > 0;
+          this.v3.bus.listenerCount?.("agent_step_observed_event") > 0;
         if (
           stepIndicesInTurn.length > 0 &&
           (wantsScreenshotProbe || wantsStepObservation)
diff --git a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts
index 2fd08b8647..bc1d6d5fd4 100644
--- a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts
+++ b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts
@@ -883,9 +883,9 @@ export class V3CuaAgentHandler {
     // Listener-gated to keep ordinary agent runs free of the extra
     // screenshot cost — mirrors v3AgentHandler's post-step probe.
     const wantsScreenshotProbe =
-      this.v3.bus.listenerCount("agent_screenshot_taken_event") > 0;
+      this.v3.bus.listenerCount?.("agent_screenshot_taken_event") > 0;
     const wantsStepObservation =
-      this.v3.bus.listenerCount("agent_step_observed_event") > 0;
+      this.v3.bus.listenerCount?.("agent_step_observed_event") > 0;
     let probeUrl = pageUrl;
     if (wantsScreenshotProbe || wantsStepObservation) {
       try {
diff --git a/packages/core/tests/unit/agent-temperature.test.ts b/packages/core/tests/unit/agent-temperature.test.ts
index 45184a9888..8f12b4a7e6 100644
--- a/packages/core/tests/unit/agent-temperature.test.ts
+++ b/packages/core/tests/unit/agent-temperature.test.ts
@@ -125,6 +125,12 @@ function createV3() {
     context: {
       awaitActivePage: vi.fn(async () => page),
     },
+    bus: {
+      emit: vi.fn(),
+      on: vi.fn(),
+      off: vi.fn(),
+      listenerCount: vi.fn(() => 0),
+    },
     isCaptchaAutoSolveEnabled: false,
     browserbaseApiKey: undefined,
     logger: vi.fn(),
diff --git a/packages/core/tests/unit/public-api/export-surface.test.ts b/packages/core/tests/unit/public-api/export-surface.test.ts
index e73cde4178..fe4003f138 100644
--- a/packages/core/tests/unit/public-api/export-surface.test.ts
+++ b/packages/core/tests/unit/public-api/export-surface.test.ts
@@ -21,6 +21,7 @@ const publicApiShape = {
   AISdkClient: Stagehand.AISdkClient,
   Api: Stagehand.Api,
   AVAILABLE_CUA_MODELS: Stagehand.AVAILABLE_CUA_MODELS,
+  BUS_EVENTS: Stagehand.BUS_EVENTS,
   AgentProvider: Stagehand.AgentProvider,
   AnnotatedScreenshotText: Stagehand.AnnotatedScreenshotText,
   ConsoleMessage: Stagehand.ConsoleMessage,
@@ -50,12 +51,14 @@ const publicApiShape = {
   normalizeRubric: Stagehand.normalizeRubric,
   pageTextSchema: Stagehand.pageTextSchema,
   providerEnvVarMap: Stagehand.providerEnvVarMap,
+  shouldPersistTrajectory: Stagehand.shouldPersistTrajectory,
   toGeminiSchema: Stagehand.toGeminiSchema,
   toJsonSchema: Stagehand.toJsonSchema,
   tool: Stagehand.tool,
   transformSchema: Stagehand.transformSchema,
   trimTrailingTextNode: Stagehand.trimTrailingTextNode,
   validateZodSchema: Stagehand.validateZodSchema,
+  writeTrajectoryDir: Stagehand.writeTrajectoryDir,
   ...publicErrorTypes,
 } as const;
 

From 16669e80d7805e0b91649782ee0b820693686f10 Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Thu, 21 May 2026 10:30:45 -0700
Subject: [PATCH 10/27] refactor(verifier): collect evidence via agent
 callbacks

---
 .changeset/verifier-trajectory-events.md      |  2 +-
 .../core/lib/v3/handlers/v3AgentHandler.ts    | 96 +++++++++---------
 .../core/lib/v3/handlers/v3CuaAgentHandler.ts | 88 +++++++++--------
 packages/core/lib/v3/types/public/agent.ts    |  6 ++
 .../v3/types/public/agentEvidenceEvents.ts    | 84 ++++++++++++++++
 .../core/lib/v3/types/public/busEvents.ts     | 99 -------------------
 packages/core/lib/v3/types/public/index.ts    |  2 +-
 .../unit/public-api/export-surface.test.ts    |  1 -
 .../evals/framework/trajectoryRecorder.ts     | 70 ++++++-------
 .../framework/trajectoryRecorder.test.ts      | 34 +++----
 10 files changed, 238 insertions(+), 244 deletions(-)
 create mode 100644 packages/core/lib/v3/types/public/agentEvidenceEvents.ts
 delete mode 100644 packages/core/lib/v3/types/public/busEvents.ts

diff --git a/.changeset/verifier-trajectory-events.md b/.changeset/verifier-trajectory-events.md
index 9dcb5c8192..4a4ee2e32e 100644
--- a/.changeset/verifier-trajectory-events.md
+++ b/.changeset/verifier-trajectory-events.md
@@ -2,4 +2,4 @@
 "@browserbasehq/stagehand": patch
 ---
 
-Capture verifier trajectory evidence from v3 agent events for offline scoring.
+Capture verifier trajectory evidence from agent evidence callbacks for offline scoring.
diff --git a/packages/core/lib/v3/handlers/v3AgentHandler.ts b/packages/core/lib/v3/handlers/v3AgentHandler.ts
index fc2761d902..965c30eded 100644
--- a/packages/core/lib/v3/handlers/v3AgentHandler.ts
+++ b/packages/core/lib/v3/handlers/v3AgentHandler.ts
@@ -31,6 +31,7 @@ import {
   AgentModelConfig,
   Variables,
 } from "../types/public/agent.js";
+import type { AgentEvidenceCallback } from "../types/public/agentEvidenceEvents.js";
 import { HYBRID_CAPABLE_MODEL_PATTERNS } from "../types/private/agent.js";
 import { V3FunctionName } from "../types/public/methods.js";
 import { mapToolResultToActions } from "../agent/utils/actionMapping.js";
@@ -248,10 +249,11 @@ export class V3AgentHandler {
     userCallback?:
       | GenerateTextOnStepFinishCallback<ToolSet>
       | StreamTextOnStepFinishCallback<ToolSet>,
+    evidenceCallback?: AgentEvidenceCallback,
   ) {
     // Monotonic step counter scoped to this execute() call. Each tool call in
     // the agent loop becomes one trajectory step. The counter feeds stepIndex
-    // on the bus events the TrajectoryRecorder subscribes to.
+    // on evidence callback events.
     let stepCounter = 0;
     return async (event: StepResult<ToolSet>) => {
       this.logger({
@@ -310,10 +312,6 @@ export class V3AgentHandler {
             state.actions.push(action);
           }
 
-          // Emit step_finished_event per tool call. The TrajectoryRecorder
-          // builds one Trajectory.Step per emission. tier-1 evidence (the
-          // bytes the LLM consumed) is captured separately via an
-          // onStepFinish wrapper in the harness.
           const stepIndex = stepCounter++;
           stepIndicesInTurn.push(stepIndex);
           const toolOk =
@@ -321,7 +319,8 @@ export class V3AgentHandler {
             (typeof toolResult === "object" &&
               !("error" in toolResult) &&
               !("isError" in toolResult && toolResult.isError));
-          this.v3.bus.emit("agent_step_finished_event", {
+          await evidenceCallback?.({
+            type: "step_finished",
             stepIndex,
             actionName: toolCall.toolName,
             actionArgs:
@@ -350,47 +349,17 @@ export class V3AgentHandler {
         // reflects the settled page state after the batch of tool calls; this
         // is more faithful than dropping probe evidence for all but the last
         // tool call, while still avoiding per-tool screenshot overhead.
-        const wantsScreenshotProbe =
-          this.v3.bus.listenerCount?.("agent_screenshot_taken_event") > 0;
-        const wantsStepObservation =
-          this.v3.bus.listenerCount?.("agent_step_observed_event") > 0;
-        if (
-          stepIndicesInTurn.length > 0 &&
-          (wantsScreenshotProbe || wantsStepObservation)
-        ) {
+        const wantsEvidence = evidenceCallback !== undefined;
+        if (stepIndicesInTurn.length > 0 && wantsEvidence) {
+          let screenshot: Buffer | undefined;
+          let ariaTree: string | undefined;
           try {
             const page = await this.v3.context.awaitActivePage();
-            let screenshot: Buffer | undefined;
-            if (wantsScreenshotProbe) {
-              screenshot = await page.screenshot({ fullPage: false });
-            }
-            let ariaTree: string | undefined;
-            if (wantsStepObservation) {
-              // Capture the a11y tree alongside the URL probe so the verifier
-              // can ground textual claims (prices, names, dates) without OCR.
-              // Best-effort: returns undefined on failure/timeout.
-              ariaTree = await captureAriaTreeProbe(this.v3);
-            }
-            for (const stepIndex of stepIndicesInTurn) {
-              if (screenshot) {
-                // DOM/hybrid: this post-step screenshot is a harness probe
-                // only. The agent's tier-1 evidence is the tool's return value
-                // captured separately in agent_step_finished_event.
-                this.v3.bus.emit("agent_screenshot_taken_event", {
-                  stepIndex,
-                  screenshot,
-                  url: state.currentPageUrl,
-                  evidenceRole: "probe",
-                });
-              }
-              if (wantsStepObservation) {
-                this.v3.bus.emit("agent_step_observed_event", {
-                  stepIndex,
-                  url: state.currentPageUrl,
-                  ariaTree,
-                });
-              }
-            }
+            screenshot = await page.screenshot({ fullPage: false });
+            // Capture the a11y tree alongside the URL probe so the verifier
+            // can ground textual claims (prices, names, dates) without OCR.
+            // Best-effort: returns undefined on failure/timeout.
+            ariaTree = await captureAriaTreeProbe(this.v3);
           } catch (e) {
             this.logger({
               category: "agent",
@@ -398,11 +367,34 @@ export class V3AgentHandler {
               level: 1,
             });
           }
+          for (const stepIndex of stepIndicesInTurn) {
+            // DOM/hybrid: this post-step screenshot is a harness probe
+            // only. The agent's tier-1 evidence is the tool's return value
+            // captured separately in step_finished.
+            if (screenshot) {
+              await evidenceCallback?.({
+                type: "screenshot",
+                stepIndex,
+                screenshot,
+                url: state.currentPageUrl,
+                evidenceRole: "probe",
+              });
+            }
+            await evidenceCallback?.({
+              type: "step_observed",
+              stepIndex,
+              url: state.currentPageUrl,
+              ariaTree,
+            });
+          }
         }
       }
 
       if (lastFinalAnswer) {
-        this.v3.bus.emit("agent_final_answer_event", lastFinalAnswer);
+        await evidenceCallback?.({
+          type: "final_answer",
+          ...lastFinalAnswer,
+        });
       }
 
       if (userCallback) {
@@ -488,7 +480,11 @@ export class V3AgentHandler {
           callbacks?.prepareStep,
           captchaSolver,
         ),
-        onStepFinish: this.createStepHandler(state, callbacks?.onStepFinish),
+        onStepFinish: this.createStepHandler(
+          state,
+          callbacks?.onStepFinish,
+          callbacks?.onEvidence,
+        ),
         abortSignal: preparedOptions.signal,
         providerOptions: {
           google: { mediaResolution: "MEDIA_RESOLUTION_HIGH" },
@@ -624,7 +620,11 @@ export class V3AgentHandler {
           callbacks?.prepareStep,
           captchaSolver,
         ),
-        onStepFinish: this.createStepHandler(state, callbacks?.onStepFinish),
+        onStepFinish: this.createStepHandler(
+          state,
+          callbacks?.onStepFinish,
+          callbacks?.onEvidence,
+        ),
         onError: (event) => {
           captchaSolver?.dispose();
           if (callbacks?.onError) {
diff --git a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts
index bc1d6d5fd4..31f0a649c8 100644
--- a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts
+++ b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts
@@ -17,7 +17,10 @@ import {
   SafetyConfirmationHandler,
 } from "../types/public/agent.js";
 import { LogLine } from "../types/public/logs.js";
-import type { AgentScreenshotTakenEvent } from "../types/public/busEvents.js";
+import type {
+  AgentEvidenceCallback,
+  AgentScreenshotEvidenceEvent,
+} from "../types/public/agentEvidenceEvents.js";
 import { type Action, V3FunctionName } from "../types/public/methods.js";
 import { FlowLogger } from "../flowlogger/FlowLogger.js";
 import { toTitleCase } from "../../utils.js";
@@ -39,13 +42,14 @@ export class V3CuaAgentHandler {
   private captchaSolver: CaptchaSolver | null = null;
   private captchaClickGuardRemaining = 0;
   private currentInstruction = "";
-  // Monotonic step counter used by bus events. The CUA loop is internal to
+  // Monotonic step counter used by evidence callbacks. The CUA loop is internal to
   // the agent client, so unlike v3AgentHandler we don't have per-tool-call
   // step events; instead we tag every screenshot emission with an
   // incrementing index.
   private cuaStepCounter = 0;
-  private latestCuaScreenshot?: AgentScreenshotTakenEvent;
+  private latestCuaScreenshot?: AgentScreenshotEvidenceEvent;
   private latestCuaScreenshotConsumed = true;
+  private evidenceCallback?: AgentEvidenceCallback;
 
   constructor(
     v3: V3,
@@ -86,15 +90,7 @@ export class V3CuaAgentHandler {
       const page = await this.v3.context.awaitActivePage();
       const screenshotBuffer = await page.screenshot({ fullPage: false });
 
-      // Emit bus event so TrajectoryRecorder can capture the screenshot. In
-      // CUA mode this is the same buffer the provider receives — i.e., it
-      // serves both as tier-1 evidence (what the model saw) and as a tier-2
-      // probe.
-      try {
-        this.emitCuaScreenshot(screenshotBuffer, page.url());
-      } catch {
-        // bus emit errors are non-fatal
-      }
+      await this.emitCuaScreenshot(screenshotBuffer, page.url());
 
       return screenshotBuffer.toString("base64"); // base64 png
     });
@@ -208,6 +204,10 @@ export class V3CuaAgentHandler {
         : optionsOrInstruction;
 
     this.setSafetyConfirmationHandler(options.callbacks?.onSafetyConfirmation);
+    this.evidenceCallback = options.callbacks?.onEvidence;
+    this.cuaStepCounter = 0;
+    this.latestCuaScreenshot = undefined;
+    this.latestCuaScreenshotConsumed = true;
 
     this.highlightCursor = options.highlightCursor !== false;
     this.currentInstruction = options.instruction;
@@ -263,7 +263,13 @@ export class V3CuaAgentHandler {
     let result: AgentResult;
     try {
       result = await this.agent.execute({ options, logger: this.logger });
+      await this.evidenceCallback?.({
+        type: "final_answer",
+        message: result.message,
+        output: result.output,
+      });
     } finally {
+      this.evidenceCallback = undefined;
       this.captchaSolver?.dispose();
       this.captchaSolver = null;
     }
@@ -683,13 +689,8 @@ export class V3CuaAgentHandler {
 
       const currentUrl = page.url();
 
-      // Mirror the screenshot to the bus — same buffer the CUA client
-      // received, so it serves as both tier-1 evidence and tier-2 probe.
-      try {
-        this.emitCuaScreenshot(screenshotBuffer, currentUrl);
-      } catch {
-        // non-fatal
-      }
+      // Mirror the same buffer the CUA client receives as agent evidence.
+      await this.emitCuaScreenshot(screenshotBuffer, currentUrl);
 
       return await this.agentClient.captureScreenshot({
         base64Image: screenshotBuffer.toString("base64"),
@@ -807,11 +808,12 @@ export class V3CuaAgentHandler {
    * can compare what the model saw against what the page actually showed
    * once the keystrokes/clicks landed.
    */
-  private emitCuaScreenshot(
+  private async emitCuaScreenshot(
     screenshot: Buffer,
     url: string,
-  ): AgentScreenshotTakenEvent {
-    const event: AgentScreenshotTakenEvent = {
+  ): Promise<AgentScreenshotEvidenceEvent> {
+    const event: AgentScreenshotEvidenceEvent = {
+      type: "screenshot",
       stepIndex: this.cuaStepCounter++,
       screenshot,
       url,
@@ -819,7 +821,7 @@ export class V3CuaAgentHandler {
     };
     this.latestCuaScreenshot = event;
     this.latestCuaScreenshotConsumed = false;
-    this.v3.bus.emit("agent_screenshot_taken_event", event);
+    await this.evidenceCallback?.(event);
     return event;
   }
 
@@ -843,7 +845,7 @@ export class V3CuaAgentHandler {
       this.latestCuaScreenshotConsumed = true;
     } else if (this.latestCuaScreenshot) {
       stepIndex = this.cuaStepCounter++;
-      this.v3.bus.emit("agent_screenshot_taken_event", {
+      await this.evidenceCallback?.({
         ...this.latestCuaScreenshot,
         stepIndex,
       });
@@ -861,7 +863,8 @@ export class V3CuaAgentHandler {
           ? action.action
           : "";
 
-    this.v3.bus.emit("agent_step_finished_event", {
+    await this.evidenceCallback?.({
+      type: "step_finished",
       stepIndex,
       actionName: String(action.type),
       actionArgs,
@@ -880,26 +883,16 @@ export class V3CuaAgentHandler {
     // verifier has no visual evidence that keystrokes/clicks landed, and
     // has to trust the action history alone.
     //
-    // Listener-gated to keep ordinary agent runs free of the extra
+    // Callback-gated to keep ordinary agent runs free of the extra
     // screenshot cost — mirrors v3AgentHandler's post-step probe.
-    const wantsScreenshotProbe =
-      this.v3.bus.listenerCount?.("agent_screenshot_taken_event") > 0;
-    const wantsStepObservation =
-      this.v3.bus.listenerCount?.("agent_step_observed_event") > 0;
+    const wantsEvidence = this.evidenceCallback !== undefined;
     let probeUrl = pageUrl;
-    if (wantsScreenshotProbe || wantsStepObservation) {
+    let probeScreenshot: Buffer | undefined;
+    if (wantsEvidence) {
       try {
         const page = await this.v3.context.awaitActivePage();
         probeUrl = page.url();
-        if (wantsScreenshotProbe) {
-          const probeScreenshot = await page.screenshot({ fullPage: false });
-          this.v3.bus.emit("agent_screenshot_taken_event", {
-            stepIndex,
-            screenshot: probeScreenshot,
-            url: probeUrl,
-            evidenceRole: "probe",
-          });
-        }
+        probeScreenshot = await page.screenshot({ fullPage: false });
       } catch (e) {
         this.logger({
           category: "agent",
@@ -911,11 +904,22 @@ export class V3CuaAgentHandler {
       }
     }
 
-    if (probeUrl && wantsStepObservation) {
+    if (probeScreenshot) {
+      await this.evidenceCallback?.({
+        type: "screenshot",
+        stepIndex,
+        screenshot: probeScreenshot,
+        url: probeUrl,
+        evidenceRole: "probe",
+      });
+    }
+
+    if (probeUrl && wantsEvidence) {
       // Capture the a11y tree alongside the URL probe so the verifier can
       // ground textual claims without OCR. Best-effort.
       const ariaTree = await captureAriaTreeProbe(this.v3);
-      this.v3.bus.emit("agent_step_observed_event", {
+      await this.evidenceCallback?.({
+        type: "step_observed",
         stepIndex,
         url: probeUrl,
         ariaTree,
diff --git a/packages/core/lib/v3/types/public/agent.ts b/packages/core/lib/v3/types/public/agent.ts
index 830fb1c966..3e958fc332 100644
--- a/packages/core/lib/v3/types/public/agent.ts
+++ b/packages/core/lib/v3/types/public/agent.ts
@@ -15,6 +15,7 @@ import {
 import { LogLine } from "./logs.js";
 import { ClientOptions } from "./model.js";
 import { StagehandZodObject } from "../../zodCompat.js";
+import type { AgentEvidenceCallback } from "./agentEvidenceEvents.js";
 
 // Re-export ModelMessage for consumers who want to use it for conversation continuation
 export type { ModelMessage } from "ai";
@@ -136,6 +137,11 @@ export interface AgentCallbacks {
   onStepFinish?:
     | GenerateTextOnStepFinishCallback<ToolSet>
     | StreamTextOnStepFinishCallback<ToolSet>;
+  /**
+   * Awaited each time Stagehand captures agent-run evidence: screenshots,
+   * completed tool/action steps, post-action observations, or final answers.
+   */
+  onEvidence?: AgentEvidenceCallback;
 }
 
 /**
diff --git a/packages/core/lib/v3/types/public/agentEvidenceEvents.ts b/packages/core/lib/v3/types/public/agentEvidenceEvents.ts
new file mode 100644
index 0000000000..b31f493145
--- /dev/null
+++ b/packages/core/lib/v3/types/public/agentEvidenceEvents.ts
@@ -0,0 +1,84 @@
+/**
+ * Evidence events emitted through AgentExecuteOptions.callbacks.onEvidence.
+ *
+ * These events describe observations made by Stagehand during an agent run.
+ * They are intentionally transport-level callback payloads; verifier-specific
+ * storage and normalization live in the evals/verifier layers.
+ */
+
+export type AgentEvidenceRole = "probe" | "agent" | "agent_and_probe";
+
+export type AgentEvidenceEvent =
+  | AgentScreenshotEvidenceEvent
+  | AgentStepFinishedEvent
+  | AgentStepObservedEvent
+  | AgentFinalAnswerEvent;
+
+/**
+ * Screenshot captured during an agent run.
+ *
+ * In DOM/hybrid mode, post-tool screenshots are probe evidence. In CUA mode,
+ * screenshots captured by the screenshot provider are agent evidence because
+ * they are the exact bytes the model receives; post-action shots are probes.
+ */
+export interface AgentScreenshotEvidenceEvent {
+  type: "screenshot";
+  /** Zero-based index of the step this screenshot corresponds to. */
+  stepIndex: number;
+  /** PNG bytes from page.screenshot(). */
+  screenshot: Buffer;
+  /** Page URL at the time of capture. */
+  url: string;
+  /** Role this screenshot plays in downstream evidence collection. */
+  evidenceRole?: AgentEvidenceRole;
+}
+
+/**
+ * One completed agent tool/action step.
+ */
+export interface AgentStepFinishedEvent {
+  type: "step_finished";
+  stepIndex: number;
+  /** Name of the tool/action that ran, e.g. "act", "extract", "click". */
+  actionName: string;
+  /** Arguments passed to the tool/action. */
+  actionArgs: Record<string, unknown>;
+  /** Agent textual reasoning for the step, when available. */
+  reasoning: string;
+  /** Outcome of the tool/action as seen by Stagehand. */
+  toolOutput: {
+    ok: boolean;
+    /** Native return value from the tool/action. */
+    result: unknown;
+    error?: string;
+  };
+  /** ISO 8601 timestamp at which the step finished. */
+  finishedAt: string;
+}
+
+/**
+ * Independent post-step browser observation.
+ */
+export interface AgentStepObservedEvent {
+  type: "step_observed";
+  stepIndex: number;
+  /** Page URL after the step's tool/action execution. */
+  url: string;
+  /** Accessibility tree snapshot, when captured. */
+  ariaTree?: string;
+  /** Viewport scroll context, when captured. */
+  scroll?: { top: number; pageHeight: number };
+}
+
+/** Final answer emitted by the agent, when available. */
+export interface AgentFinalAnswerEvent {
+  type: "final_answer";
+  /** The agent's final summary message. */
+  message: string;
+  /** Optional structured output if the agent's output schema was set. */
+  output?: Record<string, unknown>;
+}
+
+export type AgentEvidenceCallback = (
+  event: AgentEvidenceEvent,
+) => PromiseLike<void> | void;
diff --git a/packages/core/lib/v3/types/public/busEvents.ts b/packages/core/lib/v3/types/public/busEvents.ts
deleted file mode 100644
index e2fa119499..0000000000
--- a/packages/core/lib/v3/types/public/busEvents.ts
+++ /dev/null
@@ -1,99 +0,0 @@
-/**
- * Bus event payloads emitted by V3 on `v3.bus`.
- *
- * The bus is an EventEmitter; these types document the payload shape per
- * event name so consumers (TrajectoryRecorder in packages/evals, custom
- * subscribers) can type their handlers.
- *
- * The verifier recorder consumes these events to assemble persisted
- * trajectories without coupling to individual agent handlers.
- */
-
-/**
- * Names of bus events the agent handlers emit. Use these constants to
- * subscribe; the bus accepts arbitrary strings, but a centralized list helps
- * catch typos at the call site.
- */
-export const BUS_EVENTS = {
-  AGENT_SCREENSHOT_TAKEN: "agent_screenshot_taken_event",
-  AGENT_STEP_FINISHED: "agent_step_finished_event",
-  AGENT_STEP_OBSERVED: "agent_step_observed_event",
-  AGENT_FINAL_ANSWER: "agent_final_answer_event",
-} as const;
-
-export type BusEventName = (typeof BUS_EVENTS)[keyof typeof BUS_EVENTS];
-
-/**
- * Payload for `agent_screenshot_taken_event`. The raw screenshot Buffer the
- * harness took after a step's tool execution.
- *
- * Note: in CUA mode the same Buffer is also what the provider received; in
- * DOM/hybrid mode it's an independent harness probe. The verifier treats them
- * as different evidence tiers regardless.
- */
-export interface AgentScreenshotTakenEvent {
-  /** Zero-based index of the step this screenshot corresponds to. */
-  stepIndex: number;
-  /** PNG bytes from page.screenshot(). */
-  screenshot: Buffer;
-  /** Page URL at the time of capture. */
-  url: string;
-  /**
-   * Evidence role for this screenshot.
-   *
-   * DOM/hybrid post-tool screenshots are probe-only. CUA screenshots are also
-   * the exact image bytes sent to the provider, so they serve both as tier-1
-   * agent evidence and tier-2 probe evidence.
-   */
-  evidenceRole?: "probe" | "agent" | "agent_and_probe";
-}
-
-/**
- * Payload for `agent_step_finished_event`. Emitted once per tool call within
- * a step result. Carries the tool's reported outcome and a reference to the
- * agent's textual reasoning for the step.
- *
- * Tier 1 evidence (the bytes the LLM consumed as the tool result) is captured
- * separately by the harness via an AgentExecuteCallbacks.onStepFinish wrapper
- * and is not part of this payload.
- */
-export interface AgentStepFinishedEvent {
-  stepIndex: number;
-  /** Name of the tool that ran (e.g., "act", "extract", "click"). */
-  actionName: string;
-  /** Arguments passed to the tool. */
-  actionArgs: Record<string, unknown>;
-  /** Agent's textual reasoning (event.text on the AI SDK StepResult). */
-  reasoning: string;
-  /** Outcome of the tool execution as seen by the harness. */
-  toolOutput: {
-    ok: boolean;
-    /** The tool's native return value. */
-    result: unknown;
-    error?: string;
-  };
-  /** ISO 8601 timestamp at which the step finished. */
-  finishedAt: string;
-}
-
-/**
- * Payload for `agent_step_observed_event`. Emitted after the harness probe
- * completes for a step.
- */
-export interface AgentStepObservedEvent {
-  stepIndex: number;
-  /** Page URL after the step's tool execution. */
-  url: string;
-  /** v1 — accessibility tree snapshot. */
-  ariaTree?: string;
-  /** v1 — viewport scroll context. */
-  scroll?: { top: number; pageHeight: number };
-}
-
-/** Payload for `agent_final_answer_event`. Emitted when the `done` tool resolves. */
-export interface AgentFinalAnswerEvent {
-  /** The agent's final summary message. */
-  message: string;
-  /** Optional structured output if the agent's `output` schema was set. */
-  output?: Record<string, unknown>;
-}
diff --git a/packages/core/lib/v3/types/public/index.ts b/packages/core/lib/v3/types/public/index.ts
index 9bf24eb271..4fe0fb8a48 100644
--- a/packages/core/lib/v3/types/public/index.ts
+++ b/packages/core/lib/v3/types/public/index.ts
@@ -1,5 +1,5 @@
 export * from "./agent.js";
-export * from "./busEvents.js";
+export * from "./agentEvidenceEvents.js";
 // Export api.ts under namespace to avoid conflicts with methods.ts types
 export * as Api from "./api.js";
 // Also export BrowserbaseRegion directly for convenience
diff --git a/packages/core/tests/unit/public-api/export-surface.test.ts b/packages/core/tests/unit/public-api/export-surface.test.ts
index fe4003f138..7a1f1f65dc 100644
--- a/packages/core/tests/unit/public-api/export-surface.test.ts
+++ b/packages/core/tests/unit/public-api/export-surface.test.ts
@@ -21,7 +21,6 @@ const publicApiShape = {
   AISdkClient: Stagehand.AISdkClient,
   Api: Stagehand.Api,
   AVAILABLE_CUA_MODELS: Stagehand.AVAILABLE_CUA_MODELS,
-  BUS_EVENTS: Stagehand.BUS_EVENTS,
   AgentProvider: Stagehand.AgentProvider,
   AnnotatedScreenshotText: Stagehand.AnnotatedScreenshotText,
   ConsoleMessage: Stagehand.ConsoleMessage,
diff --git a/packages/evals/framework/trajectoryRecorder.ts b/packages/evals/framework/trajectoryRecorder.ts
index 3cf1c17621..2b136fe0b5 100644
--- a/packages/evals/framework/trajectoryRecorder.ts
+++ b/packages/evals/framework/trajectoryRecorder.ts
@@ -6,8 +6,9 @@ import {
 } from "@browserbasehq/stagehand";
 import type {
   AgentEvidence,
+  AgentEvidenceEvent,
   AgentFinalAnswerEvent,
-  AgentScreenshotTakenEvent,
+  AgentScreenshotEvidenceEvent,
   AgentStepFinishedEvent,
   AgentStepObservedEvent,
   ProbeEvidence,
@@ -17,7 +18,6 @@ import type {
   TrajectoryStep,
   TrajectoryUsage,
   EvaluationResult,
-  V3,
 } from "@browserbasehq/stagehand";
 
 interface PartialStep {
@@ -32,7 +32,6 @@ interface PartialStep {
 }
 
 export interface TrajectoryRecorderOptions {
-  v3: V3;
   taskSpec: TaskSpec;
   /**
    * Root directory under which trajectory dirs are written. Each task run
@@ -61,7 +60,6 @@ const ZERO_USAGE: TrajectoryUsage = {
 };
 
 export class TrajectoryRecorder {
-  private readonly v3: V3;
   private readonly taskSpec: TaskSpec;
   private readonly runId: string;
   private readonly outputDir: string;
@@ -76,15 +74,13 @@ export class TrajectoryRecorder {
   >();
   private readonly screenshotsByStep = new Map<
     number,
-    AgentScreenshotTakenEvent
+    AgentScreenshotEvidenceEvent
   >();
   private finalAnswerEvent?: AgentFinalAnswerEvent;
   private startedAt = "";
   private endedAt = "";
-  private listenersAttached = false;
 
-  // Bound handlers so attach/detach refer to the same references.
-  private readonly onScreenshot = (e: AgentScreenshotTakenEvent) => {
+  private onScreenshot(e: AgentScreenshotEvidenceEvent): void {
     this.screenshotsByStep.set(e.stepIndex, e);
     const partial = this.ensurePartial(e.stepIndex);
 
@@ -114,8 +110,9 @@ export class TrajectoryRecorder {
         ],
       });
     }
-  };
-  private readonly onStepFinished = (e: AgentStepFinishedEvent) => {
+  }
+
+  private onStepFinished(e: AgentStepFinishedEvent): void {
     const partial = this.ensurePartial(e.stepIndex);
     partial.actionName = e.actionName;
     partial.actionArgs = e.actionArgs;
@@ -126,8 +123,9 @@ export class TrajectoryRecorder {
       partial.agentEvidence,
       buildAgentEvidence(e),
     );
-  };
-  private readonly onStepObserved = (e: AgentStepObservedEvent) => {
+  }
+
+  private onStepObserved(e: AgentStepObservedEvent): void {
     this.observationByStep.set(e.stepIndex, e);
     const partial = this.ensurePartial(e.stepIndex);
     const probe: ProbeEvidence = { ...(partial.probeEvidence ?? {}) };
@@ -135,13 +133,13 @@ export class TrajectoryRecorder {
     if (e.ariaTree !== undefined) probe.ariaTree = e.ariaTree;
     if (e.scroll !== undefined) probe.scroll = e.scroll;
     partial.probeEvidence = probe;
-  };
-  private readonly onFinalAnswer = (e: AgentFinalAnswerEvent) => {
+  }
+
+  private onFinalAnswer(e: AgentFinalAnswerEvent): void {
     this.finalAnswerEvent = e;
-  };
+  }
 
   constructor(opts: TrajectoryRecorderOptions) {
-    this.v3 = opts.v3;
     this.taskSpec = opts.taskSpec;
     this.runId = opts.runId ?? new Date().toISOString().replace(/[:.]/g, "-");
     const root = opts.outputRoot ?? path.join(process.cwd(), ".trajectories");
@@ -149,15 +147,29 @@ export class TrajectoryRecorder {
     this.persistEnabled = shouldPersistTrajectory(opts.persist);
   }
 
-  /** Subscribe to bus events. Call once before agent.execute(). */
+  /** Mark the beginning of collection. Call once before agent.execute(). */
   start(): void {
-    if (this.listenersAttached) return;
+    if (this.startedAt) return;
     this.startedAt = new Date().toISOString();
-    this.v3.bus.on("agent_screenshot_taken_event", this.onScreenshot);
-    this.v3.bus.on("agent_step_finished_event", this.onStepFinished);
-    this.v3.bus.on("agent_step_observed_event", this.onStepObserved);
-    this.v3.bus.on("agent_final_answer_event", this.onFinalAnswer);
-    this.listenersAttached = true;
+  }
+
+  /** Ingest an evidence callback event from agent.execute(). */
+  record(event: AgentEvidenceEvent): void {
+    if (!this.startedAt) this.start();
+    switch (event.type) {
+      case "screenshot":
+        this.onScreenshot(event);
+        break;
+      case "step_finished":
+        this.onStepFinished(event);
+        break;
+      case "step_observed":
+        this.onStepObserved(event);
+        break;
+      case "final_answer":
+        this.onFinalAnswer(event);
+        break;
+    }
   }
 
   /**
@@ -165,7 +177,7 @@ export class TrajectoryRecorder {
    * write the on-disk layout. Idempotent.
    */
   async finish(opts: TrajectoryFinishOptions): Promise<Trajectory> {
-    this.detach();
+    if (!this.startedAt) this.start();
     this.endedAt = new Date().toISOString();
 
     const steps = this.assembleSteps();
@@ -187,7 +199,6 @@ export class TrajectoryRecorder {
 
   /** Throw away in-memory state without writing to disk. Used on early abort. */
   cancel(): void {
-    this.detach();
     this.partialSteps.clear();
     this.observationByStep.clear();
     this.screenshotsByStep.clear();
@@ -237,15 +248,6 @@ export class TrajectoryRecorder {
     );
   }
 
-  private detach(): void {
-    if (!this.listenersAttached) return;
-    this.v3.bus.off("agent_screenshot_taken_event", this.onScreenshot);
-    this.v3.bus.off("agent_step_finished_event", this.onStepFinished);
-    this.v3.bus.off("agent_step_observed_event", this.onStepObserved);
-    this.v3.bus.off("agent_final_answer_event", this.onFinalAnswer);
-    this.listenersAttached = false;
-  }
-
   private ensurePartial(stepIndex: number): Partial<PartialStep> {
     let p = this.partialSteps.get(stepIndex);
     if (!p) {
diff --git a/packages/evals/tests/framework/trajectoryRecorder.test.ts b/packages/evals/tests/framework/trajectoryRecorder.test.ts
index 5c5268e66a..81f9ef8b53 100644
--- a/packages/evals/tests/framework/trajectoryRecorder.test.ts
+++ b/packages/evals/tests/framework/trajectoryRecorder.test.ts
@@ -1,10 +1,9 @@
-import { EventEmitter } from "node:events";
 import fs from "node:fs/promises";
 import os from "node:os";
 import path from "node:path";
 
 import { afterEach, describe, expect, it } from "vitest";
-import type { TaskSpec, V3 } from "@browserbasehq/stagehand";
+import type { TaskSpec } from "@browserbasehq/stagehand";
 
 import { TrajectoryRecorder } from "../../framework/trajectoryRecorder.js";
 
@@ -26,10 +25,6 @@ function makeTempDir(): Promise<string> {
     });
 }
 
-function makeV3(bus = new EventEmitter()): V3 {
-  return { bus } as unknown as V3;
-}
-
 function makeTaskSpec(): TaskSpec {
   return {
     id: "recorder-task",
@@ -48,23 +43,23 @@ function makeTaskSpec(): TaskSpec {
 }
 
 describe("TrajectoryRecorder", () => {
-  it("assembles trajectory evidence from bus events", async () => {
-    const bus = new EventEmitter();
+  it("assembles trajectory evidence from callback events", async () => {
     const recorder = new TrajectoryRecorder({
-      v3: makeV3(bus),
       taskSpec: makeTaskSpec(),
       persist: false,
     });
     const screenshot = Buffer.from("screen-1");
 
     recorder.start();
-    bus.emit("agent_screenshot_taken_event", {
+    recorder.record({
+      type: "screenshot",
       stepIndex: 0,
       screenshot,
       url: "https://example.com/search",
       evidenceRole: "agent_and_probe",
     });
-    bus.emit("agent_step_finished_event", {
+    recorder.record({
+      type: "step_finished",
       stepIndex: 0,
       actionName: "extract",
       actionArgs: { instruction: "Read fares" },
@@ -75,12 +70,14 @@ describe("TrajectoryRecorder", () => {
       },
       finishedAt: new Date(0).toISOString(),
     });
-    bus.emit("agent_step_observed_event", {
+    recorder.record({
+      type: "step_observed",
       stepIndex: 0,
       url: "https://example.com/search",
       ariaTree: "RootWebArea\nStaticText: Economy $100",
     });
-    bus.emit("agent_final_answer_event", {
+    recorder.record({
+      type: "final_answer",
       message: "Business is $150 more than economy.",
     });
 
@@ -117,9 +114,7 @@ describe("TrajectoryRecorder", () => {
 
   it("persists trajectory files and evaluator results", async () => {
     const outputRoot = await makeTempDir();
-    const bus = new EventEmitter();
     const recorder = new TrajectoryRecorder({
-      v3: makeV3(bus),
       taskSpec: makeTaskSpec(),
       outputRoot,
       runId: "run-1",
@@ -128,13 +123,15 @@ describe("TrajectoryRecorder", () => {
     const screenshot = Buffer.from("screen-1");
 
     recorder.start();
-    bus.emit("agent_screenshot_taken_event", {
+    recorder.record({
+      type: "screenshot",
       stepIndex: 0,
       screenshot,
       url: "https://example.com/search",
       evidenceRole: "agent_and_probe",
     });
-    bus.emit("agent_step_finished_event", {
+    recorder.record({
+      type: "step_finished",
       stepIndex: 0,
       actionName: "act",
       actionArgs: { instruction: "Search fares" },
@@ -142,7 +139,8 @@ describe("TrajectoryRecorder", () => {
       toolOutput: { ok: true, result: "done" },
       finishedAt: new Date(0).toISOString(),
     });
-    bus.emit("agent_step_observed_event", {
+    recorder.record({
+      type: "step_observed",
       stepIndex: 0,
       url: "https://example.com/search",
     });

From b493fa7fdfe232f6559e0f670c8993d6d4faf96a Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Thu, 21 May 2026 10:45:01 -0700
Subject: [PATCH 11/27] fix(cua): keep screenshot provider evidence non-fatal

---
 .../core/lib/v3/handlers/v3CuaAgentHandler.ts | 21 ++++++-
 .../tests/unit/agent-captcha-hooks.test.ts    | 60 ++++++++++++++++++-
 2 files changed, 78 insertions(+), 3 deletions(-)

diff --git a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts
index 31f0a649c8..8a611275f0 100644
--- a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts
+++ b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts
@@ -90,7 +90,7 @@ export class V3CuaAgentHandler {
       const page = await this.v3.context.awaitActivePage();
       const screenshotBuffer = await page.screenshot({ fullPage: false });
 
-      await this.emitCuaScreenshot(screenshotBuffer, page.url());
+      await this.emitCuaScreenshotNonFatal(screenshotBuffer, page.url());
 
       return screenshotBuffer.toString("base64"); // base64 png
     });
@@ -690,7 +690,7 @@ export class V3CuaAgentHandler {
       const currentUrl = page.url();
 
       // Mirror the same buffer the CUA client receives as agent evidence.
-      await this.emitCuaScreenshot(screenshotBuffer, currentUrl);
+      await this.emitCuaScreenshotNonFatal(screenshotBuffer, currentUrl);
 
       return await this.agentClient.captureScreenshot({
         base64Image: screenshotBuffer.toString("base64"),
@@ -825,6 +825,23 @@ export class V3CuaAgentHandler {
     return event;
   }
 
+  private async emitCuaScreenshotNonFatal(
+    screenshot: Buffer,
+    url: string,
+  ): Promise<void> {
+    try {
+      await this.emitCuaScreenshot(screenshot, url);
+    } catch (e) {
+      this.logger({
+        category: "agent",
+        message: `Warning: CUA screenshot evidence callback failed: ${
+          e instanceof Error ? e.message : String(e)
+        }`,
+        level: 1,
+      });
+    }
+  }
+
   private async emitCuaActionStep(
     action: AgentAction,
     result: ActionExecutionResult | undefined,
diff --git a/packages/core/tests/unit/agent-captcha-hooks.test.ts b/packages/core/tests/unit/agent-captcha-hooks.test.ts
index b3d584c258..e2524da417 100644
--- a/packages/core/tests/unit/agent-captcha-hooks.test.ts
+++ b/packages/core/tests/unit/agent-captcha-hooks.test.ts
@@ -60,6 +60,7 @@ class FakeCuaClient {
   public contextNotes: string[] = [];
   public preStepHook?: () => Promise<void>;
   public actionHandler?: (action: Record<string, unknown>) => Promise<void>;
+  public screenshotProvider?: () => Promise<string>;
   public executeImpl = vi.fn(async (options: unknown) => {
     void options;
     return {
@@ -72,7 +73,9 @@ class FakeCuaClient {
   public captureScreenshot = vi.fn(async () => null);
   public setViewport = vi.fn();
   public setCurrentUrl = vi.fn();
-  public setScreenshotProvider = vi.fn();
+  public setScreenshotProvider = vi.fn((provider: () => Promise<string>) => {
+    this.screenshotProvider = provider;
+  });
   public setSafetyConfirmationHandler = vi.fn();
 
   setActionHandler(
@@ -504,4 +507,59 @@ describe("v3 cua handler screenshot behavior", () => {
     // the CUA client takes a single screenshot after all actions itself.
     expect(screenshotSpy).not.toHaveBeenCalled();
   });
+
+  it("still returns provider screenshots when screenshot evidence callbacks fail", async () => {
+    const screenshotBase64 = Buffer.from("fake-image").toString("base64");
+    const onEvidence = vi.fn(async (event: { type: string }) => {
+      if (event.type === "screenshot") {
+        throw new Error("recorder failed");
+      }
+    });
+
+    fakeCuaClient.executeImpl = vi.fn(async () => {
+      await expect(fakeCuaClient.screenshotProvider?.()).resolves.toBe(
+        screenshotBase64,
+      );
+      return {
+        success: true,
+        message: "ok",
+        actions: [],
+        completed: true,
+      };
+    });
+
+    const handler = new V3CuaAgentHandler(
+      {
+        context: {
+          awaitActivePage: async () => page,
+        },
+        bus: { emit: vi.fn() },
+        isCaptchaAutoSolveEnabled: false,
+        isAdvancedStealth: false,
+        configuredViewport: { width: 1288, height: 711 },
+        isAgentReplayActive: () => false,
+        updateMetrics: vi.fn(),
+      } as never,
+      logger,
+      {
+        modelName: "openai/gpt-5.4",
+        clientOptions: { waitBetweenActions: 1 },
+      } as never,
+    );
+
+    await handler.execute({
+      instruction: "describe the page",
+      highlightCursor: false,
+      callbacks: { onEvidence },
+    });
+
+    expect(onEvidence).toHaveBeenCalledWith(
+      expect.objectContaining({ type: "screenshot" }),
+    );
+    expect(
+      logs.some((line) =>
+        line.message.includes("CUA screenshot evidence callback failed"),
+      ),
+    ).toBe(true);
+  });
 });

From 043b3e1b661b13f369b150dc5f9f4b6e5bcb5326 Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Thu, 21 May 2026 10:52:54 -0700
Subject: [PATCH 12/27] fix(verifier): hydrate persisted agent image paths

---
 packages/core/lib/v3/verifier/trajectory.ts   | 58 ++++++++++++++-----
 .../tests/unit/verifier-trajectory.test.ts    | 55 +++++++++++++++++-
 2 files changed, 96 insertions(+), 17 deletions(-)

diff --git a/packages/core/lib/v3/verifier/trajectory.ts b/packages/core/lib/v3/verifier/trajectory.ts
index ec602d04d0..ae72dfac95 100644
--- a/packages/core/lib/v3/verifier/trajectory.ts
+++ b/packages/core/lib/v3/verifier/trajectory.ts
@@ -94,8 +94,9 @@ function normalizeResultLabel(label?: string): string {
  *
  * Reverses the recorder's serialization tweaks:
  *   - `probeEvidence.screenshotPath` → read file into `probeEvidence.screenshot`.
- *   - Image modalities in `agentEvidence.modalities` carry `bytesBase64` on
- *     disk (human-readable JSON) instead of raw Buffer; we decode back.
+ *   - Image modalities in `agentEvidence.modalities` carry `imagePath` on
+ *     disk instead of raw Buffer; legacy `bytesBase64` fixtures are also
+ *     accepted.
  *
  * @param dir absolute or cwd-relative path to a `<run-id>/<task-id>/` directory.
  */
@@ -115,10 +116,11 @@ export async function loadTrajectoryFromDisk(dir: string): Promise<Trajectory> {
             | {
                 type: "image";
                 mediaType: string;
-                // On-disk form (recorder writes base64); accept either to
-                // tolerate hand-edited fixtures.
+                // On-disk forms. Current writer externalizes bytes to
+                // imagePath; bytesBase64 is accepted for older fixtures.
                 bytes?: unknown;
                 bytesBase64?: string;
+                imagePath?: string;
               }
             | { type: "json"; content: unknown }
           >;
@@ -128,7 +130,10 @@ export async function loadTrajectoryFromDisk(dir: string): Promise<Trajectory> {
     >;
   };
 
-  const resolveWithinTrajectoryDir = (candidate: string): string => {
+  const resolveWithinTrajectoryDir = (
+    candidate: string,
+    fieldName = "screenshotPath",
+  ): string => {
     const resolved = path.resolve(trajectoryDir, candidate);
     const relative = path.relative(trajectoryDir, resolved);
     const outside =
@@ -138,7 +143,7 @@ export async function loadTrajectoryFromDisk(dir: string): Promise<Trajectory> {
 
     if (outside) {
       throw new Error(
-        `Trajectory screenshotPath escapes trajectory directory: ${candidate}`,
+        `Trajectory ${fieldName} escapes trajectory directory: ${candidate}`,
       );
     }
 
@@ -158,21 +163,44 @@ export async function loadTrajectoryFromDisk(dir: string): Promise<Trajectory> {
       }
     }
 
-    // Decode image modalities from base64 back to Buffer.
+    // Decode image modalities from disk references back to Buffer.
     if (step.agentEvidence?.modalities) {
-      step.agentEvidence.modalities = step.agentEvidence.modalities.map((m) => {
-        // The on-disk shape carries bytesBase64 instead of bytes, so we look
-        // through `unknown` here rather than rely on the typed union.
-        const raw = m as unknown as { bytesBase64?: string };
+      const modalities: AgentEvidenceModality[] = [];
+      for (const m of step.agentEvidence.modalities) {
+        // The on-disk shape carries imagePath/bytesBase64 instead of bytes,
+        // so we look through `unknown` rather than rely on the typed union.
+        const raw = m as unknown as {
+          bytesBase64?: string;
+          imagePath?: string;
+        };
         if (m.type === "image" && typeof raw.bytesBase64 === "string") {
-          return {
+          modalities.push({
             type: "image" as const,
             bytes: Buffer.from(raw.bytesBase64, "base64"),
             mediaType: m.mediaType,
-          };
+          });
+          continue;
         }
-        return m as AgentEvidenceModality;
-      });
+        if (m.type === "image" && typeof raw.imagePath === "string") {
+          const resolved = resolveWithinTrajectoryDir(
+            raw.imagePath,
+            "imagePath",
+          );
+          try {
+            modalities.push({
+              type: "image" as const,
+              bytes: await fs.readFile(resolved),
+              mediaType: m.mediaType,
+            });
+          } catch {
+            // Missing or unreadable agent image file: omit that modality. The
+            // verifier's evidence_insufficient path will handle missing bytes.
+          }
+          continue;
+        }
+        modalities.push(m as AgentEvidenceModality);
+      }
+      step.agentEvidence.modalities = modalities;
     }
   }
 
diff --git a/packages/core/tests/unit/verifier-trajectory.test.ts b/packages/core/tests/unit/verifier-trajectory.test.ts
index 4b09e53a12..cc6e674a1a 100644
--- a/packages/core/tests/unit/verifier-trajectory.test.ts
+++ b/packages/core/tests/unit/verifier-trajectory.test.ts
@@ -1,4 +1,4 @@
-import { mkdtemp, writeFile } from "node:fs/promises";
+import { mkdir, mkdtemp, writeFile } from "node:fs/promises";
 import { tmpdir } from "node:os";
 import path from "node:path";
 
@@ -65,6 +65,11 @@ describe("verifier trajectory utilities", () => {
     const screenshot = Buffer.from("probe screenshot");
     const agentImage = Buffer.from("agent image");
     await writeFile(path.join(dir, "screenshot_1.png"), screenshot);
+    await mkdir(path.join(dir, "screenshots", "agent"), { recursive: true });
+    await writeFile(
+      path.join(dir, "screenshots", "agent", "1.png"),
+      agentImage,
+    );
     await writeFile(
       path.join(dir, "trajectory.json"),
       JSON.stringify({
@@ -86,7 +91,7 @@ describe("verifier trajectory utilities", () => {
                 {
                   type: "image",
                   mediaType: "image/png",
-                  bytesBase64: agentImage.toString("base64"),
+                  imagePath: "screenshots/agent/1.png",
                 },
               ],
             },
@@ -109,6 +114,52 @@ describe("verifier trajectory utilities", () => {
     }
   });
 
+  it("loads legacy base64 image modalities from disk", async () => {
+    const dir = await mkdtemp(path.join(tmpdir(), "stagehand-verifier-"));
+    const agentImage = Buffer.from("legacy agent image");
+    await writeFile(
+      path.join(dir, "trajectory.json"),
+      JSON.stringify({
+        task: { id: "task", instruction: "Do the task" },
+        status: "complete",
+        usage: { input_tokens: 0, output_tokens: 0 },
+        timing: {
+          startedAt: new Date(0).toISOString(),
+          endedAt: new Date(0).toISOString(),
+        },
+        steps: [
+          {
+            index: 0,
+            actionName: "act",
+            actionArgs: {},
+            reasoning: "",
+            agentEvidence: {
+              modalities: [
+                {
+                  type: "image",
+                  mediaType: "image/png",
+                  bytesBase64: agentImage.toString("base64"),
+                },
+              ],
+            },
+            probeEvidence: {},
+            toolOutput: { ok: true, result: null },
+            startedAt: new Date(0).toISOString(),
+            finishedAt: new Date(0).toISOString(),
+          },
+        ],
+      }),
+    );
+
+    const trajectory = await loadTrajectoryFromDisk(dir);
+    const modality = trajectory.steps[0].agentEvidence.modalities[0];
+
+    expect(modality.type).toBe("image");
+    if (modality.type === "image") {
+      expect(modality.bytes).toEqual(agentImage);
+    }
+  });
+
   it("rejects screenshot paths outside the trajectory directory", async () => {
     const dir = await mkdtemp(path.join(tmpdir(), "stagehand-verifier-"));
     await writeFile(

From 8596d2aedd86e95fd5a6748c75bc61e1bd7d2ecb Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Thu, 21 May 2026 10:52:54 -0700
Subject: [PATCH 13/27] fix(evals): avoid useless task data assignment

---
 packages/evals/framework/trajectoryRecorder.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packages/evals/framework/trajectoryRecorder.ts b/packages/evals/framework/trajectoryRecorder.ts
index 2b136fe0b5..af52fa67a8 100644
--- a/packages/evals/framework/trajectoryRecorder.ts
+++ b/packages/evals/framework/trajectoryRecorder.ts
@@ -233,7 +233,7 @@ export class TrajectoryRecorder {
     );
 
     const taskDataPath = path.join(this.outputDir, "task_data.json");
-    let taskData: Record<string, unknown> = {};
+    let taskData: Record<string, unknown>;
     try {
       taskData = JSON.parse(await fs.readFile(taskDataPath, "utf8")) as Record<
         string,

From 2ba6c1f2bbe8f1e93d0d6b0b6c8cb1435719418b Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Thu, 21 May 2026 11:04:06 -0700
Subject: [PATCH 14/27] test(agent): drop stale bus mocks

---
 packages/core/tests/unit/agent-captcha-hooks.test.ts | 5 -----
 packages/core/tests/unit/agent-temperature.test.ts   | 6 ------
 2 files changed, 11 deletions(-)

diff --git a/packages/core/tests/unit/agent-captcha-hooks.test.ts b/packages/core/tests/unit/agent-captcha-hooks.test.ts
index e2524da417..9cb626cf39 100644
--- a/packages/core/tests/unit/agent-captcha-hooks.test.ts
+++ b/packages/core/tests/unit/agent-captcha-hooks.test.ts
@@ -250,7 +250,6 @@ describe("agent captcha hooks", () => {
         context: {
           awaitActivePage: async () => page,
         },
-        bus: { emit: vi.fn() },
         isCaptchaAutoSolveEnabled: true,
         isAdvancedStealth: false,
         configuredViewport: { width: 1288, height: 711 },
@@ -319,7 +318,6 @@ describe("agent captcha hooks", () => {
         context: {
           awaitActivePage: async () => page,
         },
-        bus: { emit: vi.fn() },
         isCaptchaAutoSolveEnabled: true,
         isAdvancedStealth: false,
         configuredViewport: { width: 1288, height: 711 },
@@ -395,7 +393,6 @@ describe("agent captcha hooks", () => {
         context: {
           awaitActivePage: async () => page,
         },
-        bus: { emit: vi.fn() },
         isCaptchaAutoSolveEnabled: true,
         isAdvancedStealth: false,
         configuredViewport: { width: 1288, height: 711 },
@@ -477,7 +474,6 @@ describe("v3 cua handler screenshot behavior", () => {
         context: {
           awaitActivePage: async () => page,
         },
-        bus: { emit: vi.fn() },
         isCaptchaAutoSolveEnabled: false,
         isAdvancedStealth: false,
         configuredViewport: { width: 1288, height: 711 },
@@ -533,7 +529,6 @@ describe("v3 cua handler screenshot behavior", () => {
         context: {
           awaitActivePage: async () => page,
         },
-        bus: { emit: vi.fn() },
         isCaptchaAutoSolveEnabled: false,
         isAdvancedStealth: false,
         configuredViewport: { width: 1288, height: 711 },
diff --git a/packages/core/tests/unit/agent-temperature.test.ts b/packages/core/tests/unit/agent-temperature.test.ts
index 8f12b4a7e6..45184a9888 100644
--- a/packages/core/tests/unit/agent-temperature.test.ts
+++ b/packages/core/tests/unit/agent-temperature.test.ts
@@ -125,12 +125,6 @@ function createV3() {
     context: {
       awaitActivePage: vi.fn(async () => page),
     },
-    bus: {
-      emit: vi.fn(),
-      on: vi.fn(),
-      off: vi.fn(),
-      listenerCount: vi.fn(() => 0),
-    },
     isCaptchaAutoSolveEnabled: false,
     browserbaseApiKey: undefined,
     logger: vi.fn(),

From 2780db25f2bb8d1b177c2b129a7b4fca02a075c5 Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Fri, 22 May 2026 13:37:55 -0700
Subject: [PATCH 15/27] fix(verifier): redact inline screenshot payloads

---
 packages/core/lib/v3/verifier/trajectory.ts   | 47 ++++++++++-
 .../tests/unit/verifier-trajectory.test.ts    | 64 ++++++++++++++-
 .../evals/framework/trajectoryRecorder.ts     | 78 +++++++++++++++++--
 .../framework/trajectoryRecorder.test.ts      | 65 ++++++++++++++++
 4 files changed, 245 insertions(+), 9 deletions(-)

diff --git a/packages/core/lib/v3/verifier/trajectory.ts b/packages/core/lib/v3/verifier/trajectory.ts
index ae72dfac95..3bb623f16d 100644
--- a/packages/core/lib/v3/verifier/trajectory.ts
+++ b/packages/core/lib/v3/verifier/trajectory.ts
@@ -231,6 +231,37 @@ export function shouldPersistTrajectory(
   return !process.env.CI;
 }
 
+const REDACTED_INLINE_IMAGE = "[redacted inline image payload]";
+const INLINE_IMAGE_KEYS = new Set(["screenshotBase64"]);
+
+function shouldRedactBase64Key(key: string, actionName?: string): boolean {
+  return (
+    INLINE_IMAGE_KEYS.has(key) ||
+    (actionName === "screenshot" && key === "base64")
+  );
+}
+
+function redactInlineImagePayloads(
+  value: unknown,
+  actionName?: string,
+): unknown {
+  if (!value || typeof value !== "object") return value;
+  if (Buffer.isBuffer(value)) return value;
+
+  if (Array.isArray(value)) {
+    return value.map((item) => redactInlineImagePayloads(item, actionName));
+  }
+
+  const out: Record<string, unknown> = {};
+  for (const [key, nested] of Object.entries(value)) {
+    out[key] =
+      shouldRedactBase64Key(key, actionName) && typeof nested === "string"
+        ? REDACTED_INLINE_IMAGE
+        : redactInlineImagePayloads(nested, actionName);
+  }
+  return out;
+}
+
 /**
  * Write the on-disk trajectory layout under `dir`:
  *
@@ -273,7 +304,14 @@ export async function writeTrajectoryDir(
     const modalities: unknown[] = [];
     for (const m of step.agentEvidence.modalities) {
       if (m.type !== "image") {
-        modalities.push(m);
+        modalities.push(
+          m.type === "json"
+            ? {
+                ...m,
+                content: redactInlineImagePayloads(m.content, step.actionName),
+              }
+            : m,
+        );
         continue;
       }
       const suffix = multipleImages ? `_${imageSeq}` : "";
@@ -290,6 +328,13 @@ export async function writeTrajectoryDir(
       ...step,
       probeEvidence: probe,
       agentEvidence: { modalities },
+      toolOutput: {
+        ...step.toolOutput,
+        result: redactInlineImagePayloads(
+          step.toolOutput.result,
+          step.actionName,
+        ),
+      },
     });
   }
 
diff --git a/packages/core/tests/unit/verifier-trajectory.test.ts b/packages/core/tests/unit/verifier-trajectory.test.ts
index cc6e674a1a..7c9351d135 100644
--- a/packages/core/tests/unit/verifier-trajectory.test.ts
+++ b/packages/core/tests/unit/verifier-trajectory.test.ts
@@ -1,4 +1,4 @@
-import { mkdir, mkdtemp, writeFile } from "node:fs/promises";
+import { mkdir, mkdtemp, readFile, writeFile } from "node:fs/promises";
 import { tmpdir } from "node:os";
 import path from "node:path";
 
@@ -8,6 +8,7 @@ import {
   loadTrajectoryFromDisk,
   nextResultFilename,
   normalizeRubric,
+  writeTrajectoryDir,
 } from "../../lib/v3/verifier/trajectory.js";
 
 describe("verifier trajectory utilities", () => {
@@ -160,6 +161,67 @@ describe("verifier trajectory utilities", () => {
     }
   });
 
+  it("redacts inline screenshot payloads when writing trajectories", async () => {
+    const dir = await mkdtemp(path.join(tmpdir(), "stagehand-verifier-"));
+    const inlineScreenshot =
+      Buffer.from("inline screenshot").toString("base64");
+
+    await writeTrajectoryDir(dir, {
+      task: { id: "task", instruction: "Do the task" },
+      status: "complete",
+      usage: { input_tokens: 0, output_tokens: 0 },
+      timing: {
+        startedAt: new Date(0).toISOString(),
+        endedAt: new Date(0).toISOString(),
+      },
+      steps: [
+        {
+          index: 0,
+          actionName: "click",
+          actionArgs: {},
+          reasoning: "",
+          agentEvidence: {
+            modalities: [
+              {
+                type: "json",
+                content: {
+                  output: {
+                    success: true,
+                    screenshotBase64: inlineScreenshot,
+                  },
+                },
+              },
+            ],
+          },
+          probeEvidence: {},
+          toolOutput: {
+            ok: true,
+            result: {
+              output: {
+                success: true,
+                screenshotBase64: inlineScreenshot,
+              },
+            },
+          },
+          startedAt: new Date(0).toISOString(),
+          finishedAt: new Date(0).toISOString(),
+        },
+      ],
+    });
+
+    const raw = await readFile(path.join(dir, "trajectory.json"), "utf8");
+    const trajectory = JSON.parse(raw);
+
+    expect(raw).not.toContain(inlineScreenshot);
+    expect(
+      trajectory.steps[0].agentEvidence.modalities[0].content.output
+        .screenshotBase64,
+    ).toBe("[redacted inline image payload]");
+    expect(trajectory.steps[0].toolOutput.result.output.screenshotBase64).toBe(
+      "[redacted inline image payload]",
+    );
+  });
+
   it("rejects screenshot paths outside the trajectory directory", async () => {
     const dir = await mkdtemp(path.join(tmpdir(), "stagehand-verifier-"));
     await writeFile(
diff --git a/packages/evals/framework/trajectoryRecorder.ts b/packages/evals/framework/trajectoryRecorder.ts
index af52fa67a8..e28626860f 100644
--- a/packages/evals/framework/trajectoryRecorder.ts
+++ b/packages/evals/framework/trajectoryRecorder.ts
@@ -117,7 +117,10 @@ export class TrajectoryRecorder {
     partial.actionName = e.actionName;
     partial.actionArgs = e.actionArgs;
     partial.reasoning = e.reasoning;
-    partial.toolOutput = e.toolOutput;
+    partial.toolOutput = {
+      ...e.toolOutput,
+      result: redactInlineImagePayloads(e.toolOutput.result, e.actionName),
+    };
     partial.finishedAt = e.finishedAt;
     partial.agentEvidence = mergeAgentEvidence(
       partial.agentEvidence,
@@ -287,6 +290,62 @@ export class TrajectoryRecorder {
   }
 }
 
+const REDACTED_INLINE_IMAGE = "[redacted inline image payload]";
+const INLINE_IMAGE_KEYS = new Set(["screenshotBase64"]);
+
+function shouldRedactBase64Key(key: string, actionName?: string): boolean {
+  return (
+    INLINE_IMAGE_KEYS.has(key) ||
+    (actionName === "screenshot" && key === "base64")
+  );
+}
+
+function collectInlineImagePayloads(
+  value: unknown,
+  actionName?: string,
+  out: string[] = [],
+): string[] {
+  if (!value || typeof value !== "object") return out;
+  if (Buffer.isBuffer(value)) return out;
+
+  if (Array.isArray(value)) {
+    for (const item of value) {
+      collectInlineImagePayloads(item, actionName, out);
+    }
+    return out;
+  }
+
+  for (const [key, nested] of Object.entries(value)) {
+    if (shouldRedactBase64Key(key, actionName) && typeof nested === "string") {
+      out.push(nested);
+      continue;
+    }
+    collectInlineImagePayloads(nested, actionName, out);
+  }
+  return out;
+}
+
+function redactInlineImagePayloads(
+  value: unknown,
+  actionName?: string,
+): unknown {
+  if (!value || typeof value !== "object") return value;
+  if (Buffer.isBuffer(value)) return value;
+
+  if (Array.isArray(value)) {
+    return value.map((item) => redactInlineImagePayloads(item, actionName));
+  }
+
+  const out: Record<string, unknown> = {};
+  for (const [key, nested] of Object.entries(value)) {
+    out[key] =
+      shouldRedactBase64Key(key, actionName) && typeof nested === "string"
+        ? REDACTED_INLINE_IMAGE
+        : redactInlineImagePayloads(nested, actionName);
+  }
+  return out;
+}
+
 function mergeAgentEvidence(
   ...parts: Array<AgentEvidence | undefined>
 ): AgentEvidence {
@@ -313,21 +372,26 @@ function buildAgentEvidence(e: AgentStepFinishedEvent): AgentEvidence {
       mediaType: "image/png",
     });
   } else if (typeof result === "object") {
-    // Vision tools embed a screenshotBase64 alongside the JSON result; lift
-    // it to its own image modality so the verifier sees both.
-    const r = result as { screenshotBase64?: string } & Record<string, unknown>;
-    if (typeof r.screenshotBase64 === "string") {
+    // Vision tools embed screenshot bytes alongside JSON; lift those bytes to
+    // image modalities and redact the inline payloads from persisted text/json.
+    for (const imageBase64 of collectInlineImagePayloads(
+      result,
+      e.actionName,
+    )) {
       try {
         modalities.push({
           type: "image",
-          bytes: Buffer.from(r.screenshotBase64, "base64"),
+          bytes: Buffer.from(imageBase64, "base64"),
           mediaType: "image/png",
         });
       } catch {
         // Malformed base64; skip the image and keep the JSON modality.
       }
     }
-    modalities.push({ type: "json", content: result });
+    modalities.push({
+      type: "json",
+      content: redactInlineImagePayloads(result, e.actionName),
+    });
   }
   return { modalities };
 }
diff --git a/packages/evals/tests/framework/trajectoryRecorder.test.ts b/packages/evals/tests/framework/trajectoryRecorder.test.ts
index 81f9ef8b53..5f72dadab4 100644
--- a/packages/evals/tests/framework/trajectoryRecorder.test.ts
+++ b/packages/evals/tests/framework/trajectoryRecorder.test.ts
@@ -192,4 +192,69 @@ describe("TrajectoryRecorder", () => {
       explanation: "The task was completed.",
     });
   });
+
+  it("lifts inline screenshot payloads into image evidence and redacts JSON", async () => {
+    const inlineScreenshot =
+      Buffer.from("inline screenshot").toString("base64");
+    const recorder = new TrajectoryRecorder({
+      taskSpec: makeTaskSpec(),
+      persist: false,
+    });
+
+    recorder.record({
+      type: "step_finished",
+      stepIndex: 0,
+      actionName: "click",
+      actionArgs: { describe: "Open fare details" },
+      reasoning: "Click the fare details button.",
+      toolOutput: {
+        ok: true,
+        result: {
+          output: {
+            success: true,
+            describe: "Open fare details",
+            screenshotBase64: inlineScreenshot,
+          },
+        },
+      },
+      finishedAt: new Date(0).toISOString(),
+    });
+
+    const trajectory = await recorder.finish({ status: "complete" });
+    const step = trajectory.steps[0];
+    const rawTrajectory = JSON.stringify(trajectory);
+    const imageModalities = step.agentEvidence.modalities.filter(
+      (m) => m.type === "image",
+    );
+    const jsonModality = step.agentEvidence.modalities.find(
+      (m) => m.type === "json",
+    );
+
+    expect(rawTrajectory).not.toContain(inlineScreenshot);
+    expect(step.toolOutput.result).toMatchObject({
+      output: {
+        success: true,
+        describe: "Open fare details",
+        screenshotBase64: "[redacted inline image payload]",
+      },
+    });
+    expect(jsonModality).toMatchObject({
+      type: "json",
+      content: {
+        output: {
+          screenshotBase64: "[redacted inline image payload]",
+        },
+      },
+    });
+    expect(imageModalities).toHaveLength(1);
+    expect(imageModalities[0]).toMatchObject({
+      type: "image",
+      mediaType: "image/png",
+    });
+    if (imageModalities[0].type === "image") {
+      expect(imageModalities[0].bytes).toEqual(
+        Buffer.from(inlineScreenshot, "base64"),
+      );
+    }
+  });
 });

From 25fadb142613163835fb6d88cf35f922eb6b239d Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Fri, 22 May 2026 13:54:24 -0700
Subject: [PATCH 16/27] refactor(verifier): centralize trajectory evidence
 handling

---
 .../v3/agent/utils/captureAriaTreeProbe.ts    |   4 +-
 .../v3/agent/utils/cuaEvidenceStepTracker.ts  |  55 ++++++++
 .../v3/agent/utils/postStepProbeEvidence.ts   |  64 +++++++++
 .../lib/v3/agent/utils/toolOutputEvidence.ts  |  25 ++++
 .../core/lib/v3/handlers/v3AgentHandler.ts    |  67 ++-------
 .../core/lib/v3/handlers/v3CuaAgentHandler.ts | 105 +++-----------
 packages/core/lib/v3/index.ts                 |   9 ++
 .../v3/types/public/agentEvidenceEvents.ts    |   2 +-
 .../lib/v3/verifier/evidenceNormalization.ts  | 112 +++++++++++++++
 packages/core/lib/v3/verifier/index.ts        |   7 +
 packages/core/lib/v3/verifier/trajectory.ts   |  32 +----
 .../unit/cua-evidence-step-tracker.test.ts    |  55 ++++++++
 .../unit/public-api/export-surface.test.ts    |   4 +
 .../evals/framework/trajectoryRecorder.ts     | 132 ++----------------
 .../framework/trajectoryRecorder.test.ts      |  18 ++-
 15 files changed, 391 insertions(+), 300 deletions(-)
 create mode 100644 packages/core/lib/v3/agent/utils/cuaEvidenceStepTracker.ts
 create mode 100644 packages/core/lib/v3/agent/utils/postStepProbeEvidence.ts
 create mode 100644 packages/core/lib/v3/agent/utils/toolOutputEvidence.ts
 create mode 100644 packages/core/lib/v3/verifier/evidenceNormalization.ts
 create mode 100644 packages/core/tests/unit/cua-evidence-step-tracker.test.ts

diff --git a/packages/core/lib/v3/agent/utils/captureAriaTreeProbe.ts b/packages/core/lib/v3/agent/utils/captureAriaTreeProbe.ts
index 8e3fcc050b..b68663eb04 100644
--- a/packages/core/lib/v3/agent/utils/captureAriaTreeProbe.ts
+++ b/packages/core/lib/v3/agent/utils/captureAriaTreeProbe.ts
@@ -56,9 +56,7 @@ export async function captureAriaTreeProbe(
   try {
     // v3.extract() without a schema returns { pageText } where pageText is the
     // rendered accessibility tree — same path the agent's ariaTree tool uses.
-    const result = (await v3.extract({ timeout: timeoutMs })) as {
-      pageText?: string;
-    };
+    const result = await v3.extract({ timeout: timeoutMs });
     const pageText = result?.pageText;
     if (typeof pageText !== "string" || pageText.length === 0) return undefined;
 
diff --git a/packages/core/lib/v3/agent/utils/cuaEvidenceStepTracker.ts b/packages/core/lib/v3/agent/utils/cuaEvidenceStepTracker.ts
new file mode 100644
index 0000000000..356cc6a98c
--- /dev/null
+++ b/packages/core/lib/v3/agent/utils/cuaEvidenceStepTracker.ts
@@ -0,0 +1,55 @@
+import type { AgentScreenshotEvidenceEvent } from "../../types/public/agentEvidenceEvents.js";
+
+export interface PairedCuaActionStep {
+  stepIndex: number;
+  replayScreenshot?: AgentScreenshotEvidenceEvent;
+}
+
+export class CuaEvidenceStepTracker {
+  private nextStepIndex = 0;
+  private latestScreenshot?: AgentScreenshotEvidenceEvent;
+  private latestScreenshotConsumed = true;
+
+  reset(): void {
+    this.nextStepIndex = 0;
+    this.latestScreenshot = undefined;
+    this.latestScreenshotConsumed = true;
+  }
+
+  recordScreenshot(
+    screenshot: Buffer,
+    url: string,
+  ): AgentScreenshotEvidenceEvent {
+    const event: AgentScreenshotEvidenceEvent = {
+      type: "screenshot",
+      stepIndex: this.nextStepIndex++,
+      screenshot,
+      url,
+      evidenceRole: "agent",
+    };
+    this.latestScreenshot = event;
+    this.latestScreenshotConsumed = false;
+    return event;
+  }
+
+  pairAction(): PairedCuaActionStep {
+    if (this.latestScreenshot && !this.latestScreenshotConsumed) {
+      this.latestScreenshotConsumed = true;
+      return { stepIndex: this.latestScreenshot.stepIndex };
+    }
+
+    const stepIndex = this.nextStepIndex++;
+    if (this.latestScreenshot) {
+      return {
+        stepIndex,
+        replayScreenshot: { ...this.latestScreenshot, stepIndex },
+      };
+    }
+
+    return { stepIndex };
+  }
+
+  get latestScreenshotUrl(): string | undefined {
+    return this.latestScreenshot?.url;
+  }
+}
diff --git a/packages/core/lib/v3/agent/utils/postStepProbeEvidence.ts b/packages/core/lib/v3/agent/utils/postStepProbeEvidence.ts
new file mode 100644
index 0000000000..de9cd9d044
--- /dev/null
+++ b/packages/core/lib/v3/agent/utils/postStepProbeEvidence.ts
@@ -0,0 +1,64 @@
+import type { AgentEvidenceCallback } from "../../types/public/agentEvidenceEvents.js";
+import type { LogLine } from "../../types/public/logs.js";
+import type { V3 } from "../../v3.js";
+import { captureAriaTreeProbe } from "./captureAriaTreeProbe.js";
+
+interface EmitPostStepProbeEvidenceOptions {
+  v3: V3;
+  stepIndices: number | number[];
+  url: string;
+  evidenceCallback?: AgentEvidenceCallback;
+  logger: (message: LogLine) => void;
+  warningMessage: string;
+}
+
+function errorMessage(error: unknown): string {
+  return error instanceof Error ? error.message : String(error);
+}
+
+export async function emitPostStepProbeEvidence({
+  v3,
+  stepIndices,
+  url,
+  evidenceCallback,
+  logger,
+  warningMessage,
+}: EmitPostStepProbeEvidenceOptions): Promise<void> {
+  if (!evidenceCallback) return;
+
+  const indices = Array.isArray(stepIndices) ? stepIndices : [stepIndices];
+  if (indices.length === 0) return;
+
+  let probeUrl = url;
+  let screenshot: Buffer | undefined;
+  try {
+    const page = await v3.context.awaitActivePage();
+    probeUrl = page.url();
+    screenshot = await page.screenshot({ fullPage: false });
+  } catch (e) {
+    logger({
+      category: "agent",
+      message: `${warningMessage}: ${errorMessage(e)}`,
+      level: 1,
+    });
+  }
+
+  const ariaTree = await captureAriaTreeProbe(v3);
+  for (const stepIndex of indices) {
+    if (screenshot) {
+      await evidenceCallback({
+        type: "screenshot",
+        stepIndex,
+        screenshot,
+        url: probeUrl,
+        evidenceRole: "probe",
+      });
+    }
+    await evidenceCallback({
+      type: "step_observed",
+      stepIndex,
+      url: probeUrl,
+      ariaTree,
+    });
+  }
+}
diff --git a/packages/core/lib/v3/agent/utils/toolOutputEvidence.ts b/packages/core/lib/v3/agent/utils/toolOutputEvidence.ts
new file mode 100644
index 0000000000..c8806334c4
--- /dev/null
+++ b/packages/core/lib/v3/agent/utils/toolOutputEvidence.ts
@@ -0,0 +1,25 @@
+import type { AgentStepFinishedEvent } from "../../types/public/agentEvidenceEvents.js";
+
+export function inferToolOutput(
+  toolResult: unknown,
+): AgentStepFinishedEvent["toolOutput"] {
+  const error =
+    toolResult &&
+    typeof toolResult === "object" &&
+    "error" in toolResult &&
+    typeof (toolResult as { error?: unknown }).error === "string"
+      ? (toolResult as { error: string }).error
+      : undefined;
+
+  const isError =
+    toolResult &&
+    typeof toolResult === "object" &&
+    "isError" in toolResult &&
+    Boolean((toolResult as { isError?: unknown }).isError);
+
+  return {
+    ok: error === undefined && !isError,
+    result: toolResult,
+    error,
+  };
+}
diff --git a/packages/core/lib/v3/handlers/v3AgentHandler.ts b/packages/core/lib/v3/handlers/v3AgentHandler.ts
index 965c30eded..20a9c16a74 100644
--- a/packages/core/lib/v3/handlers/v3AgentHandler.ts
+++ b/packages/core/lib/v3/handlers/v3AgentHandler.ts
@@ -42,7 +42,8 @@ import {
   AgentAbortError,
 } from "../types/public/sdkErrors.js";
 import { handleDoneToolCall } from "../agent/utils/handleDoneToolCall.js";
-import { captureAriaTreeProbe } from "../agent/utils/captureAriaTreeProbe.js";
+import { emitPostStepProbeEvidence } from "../agent/utils/postStepProbeEvidence.js";
+import { inferToolOutput } from "../agent/utils/toolOutputEvidence.js";
 import {
   CaptchaSolver,
   CAPTCHA_SOLVED_MSG,
@@ -314,11 +315,6 @@ export class V3AgentHandler {
 
           const stepIndex = stepCounter++;
           stepIndicesInTurn.push(stepIndex);
-          const toolOk =
-            !toolResult ||
-            (typeof toolResult === "object" &&
-              !("error" in toolResult) &&
-              !("isError" in toolResult && toolResult.isError));
           await evidenceCallback?.({
             type: "step_finished",
             stepIndex,
@@ -328,17 +324,7 @@ export class V3AgentHandler {
                 ? (args as Record<string, unknown>)
                 : {},
             reasoning: event.text ?? "",
-            toolOutput: {
-              ok: toolOk,
-              result: toolResult,
-              error:
-                toolResult &&
-                typeof toolResult === "object" &&
-                "error" in toolResult &&
-                typeof (toolResult as { error?: unknown }).error === "string"
-                  ? (toolResult as { error: string }).error
-                  : undefined,
-            },
+            toolOutput: inferToolOutput(toolResult),
             finishedAt: new Date().toISOString(),
           });
         }
@@ -349,45 +335,14 @@ export class V3AgentHandler {
         // reflects the settled page state after the batch of tool calls; this
         // is more faithful than dropping probe evidence for all but the last
         // tool call, while still avoiding per-tool screenshot overhead.
-        const wantsEvidence = evidenceCallback !== undefined;
-        if (stepIndicesInTurn.length > 0 && wantsEvidence) {
-          let screenshot: Buffer | undefined;
-          let ariaTree: string | undefined;
-          try {
-            const page = await this.v3.context.awaitActivePage();
-            screenshot = await page.screenshot({ fullPage: false });
-            // Capture the a11y tree alongside the URL probe so the verifier
-            // can ground textual claims (prices, names, dates) without OCR.
-            // Best-effort: returns undefined on failure/timeout.
-            ariaTree = await captureAriaTreeProbe(this.v3);
-          } catch (e) {
-            this.logger({
-              category: "agent",
-              message: `Warning: harness probe failed: ${getErrorMessage(e)}`,
-              level: 1,
-            });
-          }
-          for (const stepIndex of stepIndicesInTurn) {
-            // DOM/hybrid: this post-step screenshot is a harness probe
-            // only. The agent's tier-1 evidence is the tool's return value
-            // captured separately in step_finished.
-            if (screenshot) {
-              await evidenceCallback?.({
-                type: "screenshot",
-                stepIndex,
-                screenshot,
-                url: state.currentPageUrl,
-                evidenceRole: "probe",
-              });
-            }
-            await evidenceCallback?.({
-              type: "step_observed",
-              stepIndex,
-              url: state.currentPageUrl,
-              ariaTree,
-            });
-          }
-        }
+        await emitPostStepProbeEvidence({
+          v3: this.v3,
+          stepIndices: stepIndicesInTurn,
+          url: state.currentPageUrl,
+          evidenceCallback,
+          logger: this.logger,
+          warningMessage: "Warning: harness probe failed",
+        });
       }
 
       if (lastFinalAnswer) {
diff --git a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts
index 8a611275f0..ac552b5eb1 100644
--- a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts
+++ b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts
@@ -7,7 +7,8 @@ import { GoogleCUAClient } from "../agent/GoogleCUAClient.js";
 import { OpenAICUAClient } from "../agent/OpenAICUAClient.js";
 import { mapKeyToPlaywright } from "../agent/utils/cuaKeyMapping.js";
 import { ensureXPath } from "../agent/utils/xpath.js";
-import { captureAriaTreeProbe } from "../agent/utils/captureAriaTreeProbe.js";
+import { emitPostStepProbeEvidence } from "../agent/utils/postStepProbeEvidence.js";
+import { CuaEvidenceStepTracker } from "../agent/utils/cuaEvidenceStepTracker.js";
 import {
   ActionExecutionResult,
   AgentAction,
@@ -17,10 +18,7 @@ import {
   SafetyConfirmationHandler,
 } from "../types/public/agent.js";
 import { LogLine } from "../types/public/logs.js";
-import type {
-  AgentEvidenceCallback,
-  AgentScreenshotEvidenceEvent,
-} from "../types/public/agentEvidenceEvents.js";
+import type { AgentEvidenceCallback } from "../types/public/agentEvidenceEvents.js";
 import { type Action, V3FunctionName } from "../types/public/methods.js";
 import { FlowLogger } from "../flowlogger/FlowLogger.js";
 import { toTitleCase } from "../../utils.js";
@@ -42,13 +40,7 @@ export class V3CuaAgentHandler {
   private captchaSolver: CaptchaSolver | null = null;
   private captchaClickGuardRemaining = 0;
   private currentInstruction = "";
-  // Monotonic step counter used by evidence callbacks. The CUA loop is internal to
-  // the agent client, so unlike v3AgentHandler we don't have per-tool-call
-  // step events; instead we tag every screenshot emission with an
-  // incrementing index.
-  private cuaStepCounter = 0;
-  private latestCuaScreenshot?: AgentScreenshotEvidenceEvent;
-  private latestCuaScreenshotConsumed = true;
+  private readonly cuaEvidenceSteps = new CuaEvidenceStepTracker();
   private evidenceCallback?: AgentEvidenceCallback;
 
   constructor(
@@ -205,9 +197,7 @@ export class V3CuaAgentHandler {
 
     this.setSafetyConfirmationHandler(options.callbacks?.onSafetyConfirmation);
     this.evidenceCallback = options.callbacks?.onEvidence;
-    this.cuaStepCounter = 0;
-    this.latestCuaScreenshot = undefined;
-    this.latestCuaScreenshotConsumed = true;
+    this.cuaEvidenceSteps.reset();
 
     this.highlightCursor = options.highlightCursor !== false;
     this.currentInstruction = options.instruction;
@@ -811,18 +801,10 @@ export class V3CuaAgentHandler {
   private async emitCuaScreenshot(
     screenshot: Buffer,
     url: string,
-  ): Promise<AgentScreenshotEvidenceEvent> {
-    const event: AgentScreenshotEvidenceEvent = {
-      type: "screenshot",
-      stepIndex: this.cuaStepCounter++,
-      screenshot,
-      url,
-      evidenceRole: "agent",
-    };
-    this.latestCuaScreenshot = event;
-    this.latestCuaScreenshotConsumed = false;
-    await this.evidenceCallback?.(event);
-    return event;
+  ): Promise<void> {
+    await this.evidenceCallback?.(
+      this.cuaEvidenceSteps.recordScreenshot(screenshot, url),
+    );
   }
 
   private async emitCuaScreenshotNonFatal(
@@ -849,25 +831,15 @@ export class V3CuaAgentHandler {
     let pageUrl =
       typeof action.pageUrl === "string"
         ? action.pageUrl
-        : this.latestCuaScreenshot?.url;
+        : (this.cuaEvidenceSteps.latestScreenshotUrl ?? "");
     try {
       pageUrl = (await this.v3.context.awaitActivePage()).url();
     } catch {
       // Keep the best pre-action URL fallback.
     }
-    let stepIndex: number;
-
-    if (this.latestCuaScreenshot && !this.latestCuaScreenshotConsumed) {
-      stepIndex = this.latestCuaScreenshot.stepIndex;
-      this.latestCuaScreenshotConsumed = true;
-    } else if (this.latestCuaScreenshot) {
-      stepIndex = this.cuaStepCounter++;
-      await this.evidenceCallback?.({
-        ...this.latestCuaScreenshot,
-        stepIndex,
-      });
-    } else {
-      stepIndex = this.cuaStepCounter++;
+    const { stepIndex, replayScreenshot } = this.cuaEvidenceSteps.pairAction();
+    if (replayScreenshot) {
+      await this.evidenceCallback?.(replayScreenshot);
     }
 
     const actionArgs = Object.fromEntries(
@@ -899,49 +871,14 @@ export class V3CuaAgentHandler {
     // page actually LOOKS LIKE after the action ran. Without this the
     // verifier has no visual evidence that keystrokes/clicks landed, and
     // has to trust the action history alone.
-    //
-    // Callback-gated to keep ordinary agent runs free of the extra
-    // screenshot cost — mirrors v3AgentHandler's post-step probe.
-    const wantsEvidence = this.evidenceCallback !== undefined;
-    let probeUrl = pageUrl;
-    let probeScreenshot: Buffer | undefined;
-    if (wantsEvidence) {
-      try {
-        const page = await this.v3.context.awaitActivePage();
-        probeUrl = page.url();
-        probeScreenshot = await page.screenshot({ fullPage: false });
-      } catch (e) {
-        this.logger({
-          category: "agent",
-          message: `Warning: CUA post-action probe failed: ${
-            e instanceof Error ? e.message : String(e)
-          }`,
-          level: 1,
-        });
-      }
-    }
-
-    if (probeScreenshot) {
-      await this.evidenceCallback?.({
-        type: "screenshot",
-        stepIndex,
-        screenshot: probeScreenshot,
-        url: probeUrl,
-        evidenceRole: "probe",
-      });
-    }
-
-    if (probeUrl && wantsEvidence) {
-      // Capture the a11y tree alongside the URL probe so the verifier can
-      // ground textual claims without OCR. Best-effort.
-      const ariaTree = await captureAriaTreeProbe(this.v3);
-      await this.evidenceCallback?.({
-        type: "step_observed",
-        stepIndex,
-        url: probeUrl,
-        ariaTree,
-      });
-    }
+    await emitPostStepProbeEvidence({
+      v3: this.v3,
+      stepIndices: stepIndex,
+      url: pageUrl,
+      evidenceCallback: this.evidenceCallback,
+      logger: this.logger,
+      warningMessage: "Warning: CUA post-action probe failed",
+    });
   }
 
   private async injectCursor(): Promise<void> {
diff --git a/packages/core/lib/v3/index.ts b/packages/core/lib/v3/index.ts
index a5cbccf746..e2f403e9a4 100644
--- a/packages/core/lib/v3/index.ts
+++ b/packages/core/lib/v3/index.ts
@@ -25,9 +25,12 @@ import { getAISDKLanguageModel } from "./llm/LLMProvider.js";
 import { __internalCreateInMemoryAgentCacheHandle } from "./cache/serverAgentCache.js";
 import { maybeRunShutdownSupervisorFromArgv } from "./shutdown/supervisor.js";
 import {
+  buildAgentEvidenceFromStepFinished,
   loadTrajectoryFromDisk,
+  mergeAgentEvidence,
   nextResultFilename,
   normalizeRubric,
+  redactInlineImagePayloads,
   shouldPersistTrajectory,
   writeTrajectoryDir,
 } from "./verifier/index.js";
@@ -92,9 +95,12 @@ export type {
   VerifierRawSteps,
 } from "./verifier/index.js";
 export {
+  buildAgentEvidenceFromStepFinished,
   loadTrajectoryFromDisk,
+  mergeAgentEvidence,
   nextResultFilename,
   normalizeRubric,
+  redactInlineImagePayloads,
   shouldPersistTrajectory,
   writeTrajectoryDir,
 } from "./verifier/index.js";
@@ -148,9 +154,12 @@ const StagehandDefault = {
   toJsonSchema,
   connectToMCPServer,
   V3Evaluator,
+  buildAgentEvidenceFromStepFinished,
   loadTrajectoryFromDisk,
+  mergeAgentEvidence,
   nextResultFilename,
   normalizeRubric,
+  redactInlineImagePayloads,
   shouldPersistTrajectory,
   writeTrajectoryDir,
   tool,
diff --git a/packages/core/lib/v3/types/public/agentEvidenceEvents.ts b/packages/core/lib/v3/types/public/agentEvidenceEvents.ts
index b31f493145..cf8e560779 100644
--- a/packages/core/lib/v3/types/public/agentEvidenceEvents.ts
+++ b/packages/core/lib/v3/types/public/agentEvidenceEvents.ts
@@ -6,7 +6,7 @@
  * storage and normalization live in the evals/verifier layers.
  */
 
-export type AgentEvidenceRole = "probe" | "agent" | "agent_and_probe";
+export type AgentEvidenceRole = "probe" | "agent";
 
 export type AgentEvidenceEvent =
   | AgentScreenshotEvidenceEvent
diff --git a/packages/core/lib/v3/verifier/evidenceNormalization.ts b/packages/core/lib/v3/verifier/evidenceNormalization.ts
new file mode 100644
index 0000000000..0012e84d6e
--- /dev/null
+++ b/packages/core/lib/v3/verifier/evidenceNormalization.ts
@@ -0,0 +1,112 @@
+import type { AgentStepFinishedEvent } from "../types/public/agentEvidenceEvents.js";
+import type { AgentEvidence } from "./types.js";
+
+export const REDACTED_INLINE_IMAGE = "[redacted inline image payload]";
+
+const INLINE_IMAGE_KEYS = new Set(["screenshotBase64"]);
+
+function shouldRedactBase64Key(key: string, actionName?: string): boolean {
+  return (
+    INLINE_IMAGE_KEYS.has(key) ||
+    (actionName === "screenshot" && key === "base64")
+  );
+}
+
+export function collectInlineImagePayloads(
+  value: unknown,
+  actionName?: string,
+  out: string[] = [],
+): string[] {
+  if (!value || typeof value !== "object") return out;
+  if (Buffer.isBuffer(value)) return out;
+
+  if (Array.isArray(value)) {
+    for (const item of value) {
+      collectInlineImagePayloads(item, actionName, out);
+    }
+    return out;
+  }
+
+  for (const [key, nested] of Object.entries(value)) {
+    if (shouldRedactBase64Key(key, actionName) && typeof nested === "string") {
+      out.push(nested);
+      continue;
+    }
+    collectInlineImagePayloads(nested, actionName, out);
+  }
+  return out;
+}
+
+export function redactInlineImagePayloads(
+  value: unknown,
+  actionName?: string,
+): unknown {
+  if (!value || typeof value !== "object") return value;
+  if (Buffer.isBuffer(value)) return value;
+
+  if (Array.isArray(value)) {
+    return value.map((item) => redactInlineImagePayloads(item, actionName));
+  }
+
+  const out: Record<string, unknown> = {};
+  for (const [key, nested] of Object.entries(value)) {
+    out[key] =
+      shouldRedactBase64Key(key, actionName) && typeof nested === "string"
+        ? REDACTED_INLINE_IMAGE
+        : redactInlineImagePayloads(nested, actionName);
+  }
+  return out;
+}
+
+export function mergeAgentEvidence(
+  ...parts: Array<AgentEvidence | undefined>
+): AgentEvidence {
+  return {
+    modalities: parts.flatMap((p) => p?.modalities ?? []),
+  };
+}
+
+export function buildAgentEvidenceFromStepFinished(
+  event: AgentStepFinishedEvent,
+): AgentEvidence {
+  const modalities: AgentEvidence["modalities"] = [];
+  if (event.reasoning) {
+    modalities.push({ type: "text", content: event.reasoning });
+  }
+
+  const result = event.toolOutput.result;
+  if (result === undefined || result === null) {
+    return { modalities };
+  }
+
+  if (typeof result === "string") {
+    modalities.push({ type: "text", content: result });
+  } else if (Buffer.isBuffer(result)) {
+    modalities.push({
+      type: "image",
+      bytes: result,
+      mediaType: "image/png",
+    });
+  } else if (typeof result === "object") {
+    for (const imageBase64 of collectInlineImagePayloads(
+      result,
+      event.actionName,
+    )) {
+      try {
+        modalities.push({
+          type: "image",
+          bytes: Buffer.from(imageBase64, "base64"),
+          mediaType: "image/png",
+        });
+      } catch {
+        // Defensive only (base64 decoding is lenient); the JSON modality stays.
+      }
+    }
+    modalities.push({
+      type: "json",
+      content: redactInlineImagePayloads(result, event.actionName),
+    });
+  }
+
+  return { modalities };
+}
diff --git a/packages/core/lib/v3/verifier/index.ts b/packages/core/lib/v3/verifier/index.ts
index 2b14cfb16a..f1638facc7 100644
--- a/packages/core/lib/v3/verifier/index.ts
+++ b/packages/core/lib/v3/verifier/index.ts
@@ -21,6 +21,13 @@ export type {
   VerifierFinding,
   VerifierRawSteps,
 } from "./types.js";
+export {
+  buildAgentEvidenceFromStepFinished,
+  collectInlineImagePayloads,
+  mergeAgentEvidence,
+  redactInlineImagePayloads,
+  REDACTED_INLINE_IMAGE,
+} from "./evidenceNormalization.js";
 export {
   loadTrajectoryFromDisk,
   nextResultFilename,
diff --git a/packages/core/lib/v3/verifier/trajectory.ts b/packages/core/lib/v3/verifier/trajectory.ts
index 3bb623f16d..75e1372bbe 100644
--- a/packages/core/lib/v3/verifier/trajectory.ts
+++ b/packages/core/lib/v3/verifier/trajectory.ts
@@ -7,6 +7,7 @@ import type {
   Trajectory,
   TrajectoryStep,
 } from "./types.js";
+import { redactInlineImagePayloads } from "./evidenceNormalization.js";
 
 type RawRubricCriterion = {
   criterion: unknown;
@@ -231,37 +232,6 @@ export function shouldPersistTrajectory(
   return !process.env.CI;
 }
 
-const REDACTED_INLINE_IMAGE = "[redacted inline image payload]";
-const INLINE_IMAGE_KEYS = new Set(["screenshotBase64"]);
-
-function shouldRedactBase64Key(key: string, actionName?: string): boolean {
-  return (
-    INLINE_IMAGE_KEYS.has(key) ||
-    (actionName === "screenshot" && key === "base64")
-  );
-}
-
-function redactInlineImagePayloads(
-  value: unknown,
-  actionName?: string,
-): unknown {
-  if (!value || typeof value !== "object") return value;
-  if (Buffer.isBuffer(value)) return value;
-
-  if (Array.isArray(value)) {
-    return value.map((item) => redactInlineImagePayloads(item, actionName));
-  }
-
-  const out: Record<string, unknown> = {};
-  for (const [key, nested] of Object.entries(value)) {
-    out[key] =
-      shouldRedactBase64Key(key, actionName) && typeof nested === "string"
-        ? REDACTED_INLINE_IMAGE
-        : redactInlineImagePayloads(nested, actionName);
-  }
-  return out;
-}
-
 /**
  * Write the on-disk trajectory layout under `dir`:
  *
diff --git a/packages/core/tests/unit/cua-evidence-step-tracker.test.ts b/packages/core/tests/unit/cua-evidence-step-tracker.test.ts
new file mode 100644
index 0000000000..112c820a97
--- /dev/null
+++ b/packages/core/tests/unit/cua-evidence-step-tracker.test.ts
@@ -0,0 +1,55 @@
+import { describe, expect, it } from "vitest";
+
+import { CuaEvidenceStepTracker } from "../../lib/v3/agent/utils/cuaEvidenceStepTracker.js";
+
+describe("CuaEvidenceStepTracker", () => {
+  it("pairs a fresh provider screenshot with the next action", () => {
+    const tracker = new CuaEvidenceStepTracker();
+    const screenshot = Buffer.from("screen");
+
+    const event = tracker.recordScreenshot(screenshot, "https://example.com");
+    const paired = tracker.pairAction();
+
+    expect(event).toMatchObject({
+      type: "screenshot",
+      stepIndex: 0,
+      evidenceRole: "agent",
+      url: "https://example.com",
+    });
+    expect(paired).toEqual({ stepIndex: 0 });
+  });
+
+  it("allocates an action step without screenshot evidence", () => {
+    const tracker = new CuaEvidenceStepTracker();
+
+    expect(tracker.pairAction()).toEqual({ stepIndex: 0 });
+  });
+
+  it("replays the latest consumed screenshot for later actions", () => {
+    const tracker = new CuaEvidenceStepTracker();
+    const screenshot = Buffer.from("screen");
+
+    tracker.recordScreenshot(screenshot, "https://example.com/start");
+    tracker.pairAction();
+    const paired = tracker.pairAction();
+
+    expect(paired.stepIndex).toBe(1);
+    expect(paired.replayScreenshot).toMatchObject({
+      type: "screenshot",
+      stepIndex: 1,
+      evidenceRole: "agent",
+      url: "https://example.com/start",
+    });
+    expect(paired.replayScreenshot?.screenshot).toEqual(screenshot);
+  });
+
+  it("resets step allocation and pending screenshot state", () => {
+    const tracker = new CuaEvidenceStepTracker();
+
+    tracker.recordScreenshot(Buffer.from("screen"), "https://example.com");
+    tracker.reset();
+
+    expect(tracker.pairAction()).toEqual({ stepIndex: 0 });
+    expect(tracker.latestScreenshotUrl).toBeUndefined();
+  });
+});
diff --git a/packages/core/tests/unit/public-api/export-surface.test.ts b/packages/core/tests/unit/public-api/export-surface.test.ts
index 7a1f1f65dc..163fd60094 100644
--- a/packages/core/tests/unit/public-api/export-surface.test.ts
+++ b/packages/core/tests/unit/public-api/export-surface.test.ts
@@ -32,6 +32,8 @@ const publicApiShape = {
   V3: Stagehand.V3,
   V3Evaluator: Stagehand.V3Evaluator,
   V3FunctionName: Stagehand.V3FunctionName,
+  buildAgentEvidenceFromStepFinished:
+    Stagehand.buildAgentEvidenceFromStepFinished,
   connectToMCPServer: Stagehand.connectToMCPServer,
   default: StagehandDefaultExport,
   defaultExtractSchema: Stagehand.defaultExtractSchema,
@@ -44,12 +46,14 @@ const publicApiShape = {
   jsonSchemaToZod: Stagehand.jsonSchemaToZod,
   loadApiKeyFromEnv: Stagehand.loadApiKeyFromEnv,
   loadTrajectoryFromDisk: Stagehand.loadTrajectoryFromDisk,
+  mergeAgentEvidence: Stagehand.mergeAgentEvidence,
   localBrowserLaunchOptionsSchema: Stagehand.localBrowserLaunchOptionsSchema,
   modelToAgentProviderMap: Stagehand.modelToAgentProviderMap,
   nextResultFilename: Stagehand.nextResultFilename,
   normalizeRubric: Stagehand.normalizeRubric,
   pageTextSchema: Stagehand.pageTextSchema,
   providerEnvVarMap: Stagehand.providerEnvVarMap,
+  redactInlineImagePayloads: Stagehand.redactInlineImagePayloads,
   shouldPersistTrajectory: Stagehand.shouldPersistTrajectory,
   toGeminiSchema: Stagehand.toGeminiSchema,
   toJsonSchema: Stagehand.toJsonSchema,
diff --git a/packages/evals/framework/trajectoryRecorder.ts b/packages/evals/framework/trajectoryRecorder.ts
index e28626860f..da2b9b5da8 100644
--- a/packages/evals/framework/trajectoryRecorder.ts
+++ b/packages/evals/framework/trajectoryRecorder.ts
@@ -1,6 +1,9 @@
 import fs from "node:fs/promises";
 import path from "node:path";
 import {
+  buildAgentEvidenceFromStepFinished,
+  mergeAgentEvidence,
+  redactInlineImagePayloads,
   shouldPersistTrajectory,
   writeTrajectoryDir,
 } from "@browserbasehq/stagehand";
@@ -68,20 +71,11 @@ export class TrajectoryRecorder {
   // Events can arrive out-of-order across step indices; same-step events all
   // fire in one microtask.
   private readonly partialSteps = new Map<number, Partial<PartialStep>>();
-  private readonly observationByStep = new Map<
-    number,
-    AgentStepObservedEvent
-  >();
-  private readonly screenshotsByStep = new Map<
-    number,
-    AgentScreenshotEvidenceEvent
-  >();
   private finalAnswerEvent?: AgentFinalAnswerEvent;
   private startedAt = "";
   private endedAt = "";
 
   private onScreenshot(e: AgentScreenshotEvidenceEvent): void {
-    this.screenshotsByStep.set(e.stepIndex, e);
     const partial = this.ensurePartial(e.stepIndex);
 
     // Default to probe when the emit site doesn't tag a role: matches
@@ -89,7 +83,7 @@ export class TrajectoryRecorder {
     // NOT a probe — emitCuaActionStep fills that role post-action.
     const role = e.evidenceRole ?? "probe";
 
-    if (role === "probe" || role === "agent_and_probe") {
+    if (role === "probe") {
       const probe: ProbeEvidence = { ...(partial.probeEvidence ?? {}) };
       probe.screenshot = e.screenshot;
       probe.url = e.url;
@@ -103,7 +97,7 @@ export class TrajectoryRecorder {
       };
     }
 
-    if (role === "agent" || role === "agent_and_probe") {
+    if (role === "agent") {
       partial.agentEvidence = mergeAgentEvidence(partial.agentEvidence, {
         modalities: [
           { type: "image", bytes: e.screenshot, mediaType: "image/png" },
@@ -124,12 +118,11 @@ export class TrajectoryRecorder {
     partial.finishedAt = e.finishedAt;
     partial.agentEvidence = mergeAgentEvidence(
       partial.agentEvidence,
-      buildAgentEvidence(e),
+      buildAgentEvidenceFromStepFinished(e),
     );
   }
 
   private onStepObserved(e: AgentStepObservedEvent): void {
-    this.observationByStep.set(e.stepIndex, e);
     const partial = this.ensurePartial(e.stepIndex);
     const probe: ProbeEvidence = { ...(partial.probeEvidence ?? {}) };
     probe.url = e.url;
@@ -203,8 +196,6 @@ export class TrajectoryRecorder {
   /** Throw away in-memory state without writing to disk. Used on early abort. */
   cancel(): void {
     this.partialSteps.clear();
-    this.observationByStep.clear();
-    this.screenshotsByStep.clear();
     this.finalAnswerEvent = undefined;
   }
 
@@ -270,8 +261,9 @@ export class TrajectoryRecorder {
         p.toolOutput === undefined ||
         p.finishedAt === undefined
       ) {
-        // CUA emits screenshot-only entries between actions; skip them here
-        // and let writeTrajectoryDir record them via the probe channel.
+        // Provider-only screenshot refreshes are transport evidence for the
+        // next CUA action. If no action arrives for this index, there is no
+        // completed trajectory step to persist.
         continue;
       }
       out.push({
@@ -289,109 +281,3 @@ export class TrajectoryRecorder {
     return out;
   }
 }
-
-const REDACTED_INLINE_IMAGE = "[redacted inline image payload]";
-const INLINE_IMAGE_KEYS = new Set(["screenshotBase64"]);
-
-function shouldRedactBase64Key(key: string, actionName?: string): boolean {
-  return (
-    INLINE_IMAGE_KEYS.has(key) ||
-    (actionName === "screenshot" && key === "base64")
-  );
-}
-
-function collectInlineImagePayloads(
-  value: unknown,
-  actionName?: string,
-  out: string[] = [],
-): string[] {
-  if (!value || typeof value !== "object") return out;
-  if (Buffer.isBuffer(value)) return out;
-
-  if (Array.isArray(value)) {
-    for (const item of value) {
-      collectInlineImagePayloads(item, actionName, out);
-    }
-    return out;
-  }
-
-  for (const [key, nested] of Object.entries(value)) {
-    if (shouldRedactBase64Key(key, actionName) && typeof nested === "string") {
-      out.push(nested);
-      continue;
-    }
-    collectInlineImagePayloads(nested, actionName, out);
-  }
-  return out;
-}
-
-function redactInlineImagePayloads(
-  value: unknown,
-  actionName?: string,
-): unknown {
-  if (!value || typeof value !== "object") return value;
-  if (Buffer.isBuffer(value)) return value;
-
-  if (Array.isArray(value)) {
-    return value.map((item) => redactInlineImagePayloads(item, actionName));
-  }
-
-  const out: Record<string, unknown> = {};
-  for (const [key, nested] of Object.entries(value)) {
-    out[key] =
-      shouldRedactBase64Key(key, actionName) && typeof nested === "string"
-        ? REDACTED_INLINE_IMAGE
-        : redactInlineImagePayloads(nested, actionName);
-  }
-  return out;
-}
-
-function mergeAgentEvidence(
-  ...parts: Array<AgentEvidence | undefined>
-): AgentEvidence {
-  return {
-    modalities: parts.flatMap((p) => p?.modalities ?? []),
-  };
-}
-
-function buildAgentEvidence(e: AgentStepFinishedEvent): AgentEvidence {
-  const modalities: AgentEvidence["modalities"] = [];
-  if (e.reasoning) {
-    modalities.push({ type: "text", content: e.reasoning });
-  }
-  const result = e.toolOutput.result;
-  if (result === undefined || result === null) {
-    return { modalities };
-  }
-  if (typeof result === "string") {
-    modalities.push({ type: "text", content: result });
-  } else if (Buffer.isBuffer(result)) {
-    modalities.push({
-      type: "image",
-      bytes: result,
-      mediaType: "image/png",
-    });
-  } else if (typeof result === "object") {
-    // Vision tools embed screenshot bytes alongside JSON; lift those bytes to
-    // image modalities and redact the inline payloads from persisted text/json.
-    for (const imageBase64 of collectInlineImagePayloads(
-      result,
-      e.actionName,
-    )) {
-      try {
-        modalities.push({
-          type: "image",
-          bytes: Buffer.from(imageBase64, "base64"),
-          mediaType: "image/png",
-        });
-      } catch {
-        // Malformed base64; skip the image and keep the JSON modality.
-      }
-    }
-    modalities.push({
-      type: "json",
-      content: redactInlineImagePayloads(result, e.actionName),
-    });
-  }
-  return { modalities };
-}
diff --git a/packages/evals/tests/framework/trajectoryRecorder.test.ts b/packages/evals/tests/framework/trajectoryRecorder.test.ts
index 5f72dadab4..38443c5dc0 100644
--- a/packages/evals/tests/framework/trajectoryRecorder.test.ts
+++ b/packages/evals/tests/framework/trajectoryRecorder.test.ts
@@ -56,7 +56,14 @@ describe("TrajectoryRecorder", () => {
       stepIndex: 0,
       screenshot,
       url: "https://example.com/search",
-      evidenceRole: "agent_and_probe",
+      evidenceRole: "agent",
+    });
+    recorder.record({
+      type: "screenshot",
+      stepIndex: 0,
+      screenshot,
+      url: "https://example.com/search",
+      evidenceRole: "probe",
     });
     recorder.record({
       type: "step_finished",
@@ -128,7 +135,14 @@ describe("TrajectoryRecorder", () => {
       stepIndex: 0,
       screenshot,
       url: "https://example.com/search",
-      evidenceRole: "agent_and_probe",
+      evidenceRole: "agent",
+    });
+    recorder.record({
+      type: "screenshot",
+      stepIndex: 0,
+      screenshot,
+      url: "https://example.com/search",
+      evidenceRole: "probe",
     });
     recorder.record({
       type: "step_finished",

From 3dfa861419c405f8c0b7bb8d03425c8a6ef5be8c Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Fri, 22 May 2026 16:42:25 -0700
Subject: [PATCH 17/27] fix(agent): make onEvidence non-fatal by wrapping at
 boundary

User-supplied onEvidence callbacks must never abort the agent loop. Wrap
the callback once where each handler receives it; internal emit sites
keep calling it as a plain await. Also unify CUA step_finished.toolOutput
construction behind a shared inferCuaToolOutput helper alongside the
existing inferToolOutput.
---
 .../lib/v3/agent/utils/toolOutputEvidence.ts  | 11 ++++++
 .../v3/agent/utils/wrapEvidenceCallback.ts    | 27 +++++++++++++++
 .../core/lib/v3/handlers/v3AgentHandler.ts    |  5 +--
 .../core/lib/v3/handlers/v3CuaAgentHandler.ts | 34 +++++--------------
 4 files changed, 50 insertions(+), 27 deletions(-)
 create mode 100644 packages/core/lib/v3/agent/utils/wrapEvidenceCallback.ts

diff --git a/packages/core/lib/v3/agent/utils/toolOutputEvidence.ts b/packages/core/lib/v3/agent/utils/toolOutputEvidence.ts
index c8806334c4..b4be376757 100644
--- a/packages/core/lib/v3/agent/utils/toolOutputEvidence.ts
+++ b/packages/core/lib/v3/agent/utils/toolOutputEvidence.ts
@@ -1,4 +1,5 @@
 import type { AgentStepFinishedEvent } from "../../types/public/agentEvidenceEvents.js";
+import type { ActionExecutionResult } from "../../types/public/agent.js";
 
 export function inferToolOutput(
   toolResult: unknown,
@@ -23,3 +24,13 @@ export function inferToolOutput(
     error,
   };
 }
+
+export function inferCuaToolOutput(
+  result: ActionExecutionResult | undefined,
+): AgentStepFinishedEvent["toolOutput"] {
+  return {
+    ok: result?.success !== false,
+    result: result ?? { success: true },
+    error: result?.error,
+  };
+}
diff --git a/packages/core/lib/v3/agent/utils/wrapEvidenceCallback.ts b/packages/core/lib/v3/agent/utils/wrapEvidenceCallback.ts
new file mode 100644
index 0000000000..1b35bc04c9
--- /dev/null
+++ b/packages/core/lib/v3/agent/utils/wrapEvidenceCallback.ts
@@ -0,0 +1,27 @@
+import type { AgentEvidenceCallback } from "../../types/public/agentEvidenceEvents.js";
+import type { LogLine } from "../../types/public/logs.js";
+
+// onEvidence is a user-supplied observability hook (trajectory recording,
+// verifier capture, etc.). Wrap it once at the boundary where the handler
+// receives it so a throwing user callback can never abort the agent loop —
+// internal emit sites can then call the wrapped callback directly without
+// per-site try/catch.
+export function wrapEvidenceCallback(
+  callback: AgentEvidenceCallback | undefined,
+  logger: (message: LogLine) => void,
+): AgentEvidenceCallback | undefined {
+  if (!callback) return undefined;
+  return async (event) => {
+    try {
+      await callback(event);
+    } catch (e) {
+      logger({
+        category: "agent",
+        message: `Warning: onEvidence callback failed for ${event.type}: ${
+          e instanceof Error ? e.message : String(e)
+        }`,
+        level: 1,
+      });
+    }
+  };
+}
diff --git a/packages/core/lib/v3/handlers/v3AgentHandler.ts b/packages/core/lib/v3/handlers/v3AgentHandler.ts
index 20a9c16a74..5281c1a70f 100644
--- a/packages/core/lib/v3/handlers/v3AgentHandler.ts
+++ b/packages/core/lib/v3/handlers/v3AgentHandler.ts
@@ -43,6 +43,7 @@ import {
 } from "../types/public/sdkErrors.js";
 import { handleDoneToolCall } from "../agent/utils/handleDoneToolCall.js";
 import { emitPostStepProbeEvidence } from "../agent/utils/postStepProbeEvidence.js";
+import { wrapEvidenceCallback } from "../agent/utils/wrapEvidenceCallback.js";
 import { inferToolOutput } from "../agent/utils/toolOutputEvidence.js";
 import {
   CaptchaSolver,
@@ -438,7 +439,7 @@ export class V3AgentHandler {
         onStepFinish: this.createStepHandler(
           state,
           callbacks?.onStepFinish,
-          callbacks?.onEvidence,
+          wrapEvidenceCallback(callbacks?.onEvidence, this.logger),
         ),
         abortSignal: preparedOptions.signal,
         providerOptions: {
@@ -578,7 +579,7 @@ export class V3AgentHandler {
         onStepFinish: this.createStepHandler(
           state,
           callbacks?.onStepFinish,
-          callbacks?.onEvidence,
+          wrapEvidenceCallback(callbacks?.onEvidence, this.logger),
         ),
         onError: (event) => {
           captchaSolver?.dispose();
diff --git a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts
index ac552b5eb1..bcb92f9a00 100644
--- a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts
+++ b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts
@@ -8,6 +8,8 @@ import { OpenAICUAClient } from "../agent/OpenAICUAClient.js";
 import { mapKeyToPlaywright } from "../agent/utils/cuaKeyMapping.js";
 import { ensureXPath } from "../agent/utils/xpath.js";
 import { emitPostStepProbeEvidence } from "../agent/utils/postStepProbeEvidence.js";
+import { wrapEvidenceCallback } from "../agent/utils/wrapEvidenceCallback.js";
+import { inferCuaToolOutput } from "../agent/utils/toolOutputEvidence.js";
 import { CuaEvidenceStepTracker } from "../agent/utils/cuaEvidenceStepTracker.js";
 import {
   ActionExecutionResult,
@@ -82,7 +84,7 @@ export class V3CuaAgentHandler {
       const page = await this.v3.context.awaitActivePage();
       const screenshotBuffer = await page.screenshot({ fullPage: false });
 
-      await this.emitCuaScreenshotNonFatal(screenshotBuffer, page.url());
+      await this.emitCuaScreenshot(screenshotBuffer, page.url());
 
       return screenshotBuffer.toString("base64"); // base64 png
     });
@@ -196,7 +198,10 @@ export class V3CuaAgentHandler {
         : optionsOrInstruction;
 
     this.setSafetyConfirmationHandler(options.callbacks?.onSafetyConfirmation);
-    this.evidenceCallback = options.callbacks?.onEvidence;
+    this.evidenceCallback = wrapEvidenceCallback(
+      options.callbacks?.onEvidence,
+      this.logger,
+    );
     this.cuaEvidenceSteps.reset();
 
     this.highlightCursor = options.highlightCursor !== false;
@@ -680,7 +685,7 @@ export class V3CuaAgentHandler {
       const currentUrl = page.url();
 
       // Mirror the same buffer the CUA client receives as agent evidence.
-      await this.emitCuaScreenshotNonFatal(screenshotBuffer, currentUrl);
+      await this.emitCuaScreenshot(screenshotBuffer, currentUrl);
 
       return await this.agentClient.captureScreenshot({
         base64Image: screenshotBuffer.toString("base64"),
@@ -807,23 +812,6 @@ export class V3CuaAgentHandler {
     );
   }
 
-  private async emitCuaScreenshotNonFatal(
-    screenshot: Buffer,
-    url: string,
-  ): Promise<void> {
-    try {
-      await this.emitCuaScreenshot(screenshot, url);
-    } catch (e) {
-      this.logger({
-        category: "agent",
-        message: `Warning: CUA screenshot evidence callback failed: ${
-          e instanceof Error ? e.message : String(e)
-        }`,
-        level: 1,
-      });
-    }
-  }
-
   private async emitCuaActionStep(
     action: AgentAction,
     result: ActionExecutionResult | undefined,
@@ -858,11 +846,7 @@ export class V3CuaAgentHandler {
       actionName: String(action.type),
       actionArgs,
       reasoning,
-      toolOutput: {
-        ok: result?.success !== false,
-        result: result ?? { success: true },
-        error: result?.error,
-      },
+      toolOutput: inferCuaToolOutput(result),
       finishedAt: new Date().toISOString(),
     });
 

From 4d203ca28f09618af8e06290e353261ee48f36a9 Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Fri, 22 May 2026 18:49:02 -0700
Subject: [PATCH 18/27] test(agent): update warning-message assertion to
 generic onEvidence label

The non-fatal wrapper now logs `onEvidence callback failed for <event.type>`
from a single boundary helper rather than the per-site
`CUA screenshot evidence callback failed`. Update the assertion to match.
---
 packages/core/tests/unit/agent-captcha-hooks.test.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packages/core/tests/unit/agent-captcha-hooks.test.ts b/packages/core/tests/unit/agent-captcha-hooks.test.ts
index 9cb626cf39..4789fb5c63 100644
--- a/packages/core/tests/unit/agent-captcha-hooks.test.ts
+++ b/packages/core/tests/unit/agent-captcha-hooks.test.ts
@@ -553,7 +553,7 @@ describe("v3 cua handler screenshot behavior", () => {
     );
     expect(
       logs.some((line) =>
-        line.message.includes("CUA screenshot evidence callback failed"),
+        line.message.includes("onEvidence callback failed for screenshot"),
       ),
     ).toBe(true);
   });

From 2418db391edd981c3adbdd57e97ad941001c3cc2 Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Fri, 22 May 2026 16:04:33 -0700
Subject: [PATCH 19/27] fix(verifier): preserve final evidence observations

---
 .../v3/agent/utils/postStepProbeEvidence.ts   |  60 +++++---
 .../lib/v3/agent/utils/toolOutputEvidence.ts  |  79 ++++++++--
 .../core/lib/v3/handlers/v3AgentHandler.ts    | 136 +++++++++++++-----
 .../core/lib/v3/handlers/v3CuaAgentHandler.ts |  39 +++--
 .../v3/types/public/agentEvidenceEvents.ts    |  21 +++
 .../lib/v3/verifier/evidenceNormalization.ts  |   6 +
 packages/core/lib/v3/verifier/trajectory.ts   |  32 ++++-
 packages/core/lib/v3/verifier/types.ts        |   4 +-
 packages/core/lib/v3Evaluator.ts              |   6 +
 .../tests/unit/tool-output-evidence.test.ts   |  58 ++++++++
 packages/core/tests/unit/v3-evaluator.test.ts |  33 +++++
 .../verifier-evidence-normalization.test.ts   |  20 +++
 .../tests/unit/verifier-trajectory.test.ts    |  17 +++
 .../evals/framework/trajectoryRecorder.ts     |  24 +++-
 .../framework/trajectoryRecorder.test.ts      |  53 +++++++
 15 files changed, 507 insertions(+), 81 deletions(-)
 create mode 100644 packages/core/tests/unit/tool-output-evidence.test.ts
 create mode 100644 packages/core/tests/unit/verifier-evidence-normalization.test.ts

diff --git a/packages/core/lib/v3/agent/utils/postStepProbeEvidence.ts b/packages/core/lib/v3/agent/utils/postStepProbeEvidence.ts
index de9cd9d044..10889d6cf8 100644
--- a/packages/core/lib/v3/agent/utils/postStepProbeEvidence.ts
+++ b/packages/core/lib/v3/agent/utils/postStepProbeEvidence.ts
@@ -3,32 +3,32 @@ import type { LogLine } from "../../types/public/logs.js";
 import type { V3 } from "../../v3.js";
 import { captureAriaTreeProbe } from "./captureAriaTreeProbe.js";
 
-interface EmitPostStepProbeEvidenceOptions {
+interface CaptureProbeEvidenceOptions {
   v3: V3;
-  stepIndices: number | number[];
   url: string;
-  evidenceCallback?: AgentEvidenceCallback;
   logger: (message: LogLine) => void;
   warningMessage: string;
 }
 
+interface EmitPostStepProbeEvidenceOptions extends CaptureProbeEvidenceOptions {
+  stepIndices: number | number[];
+  evidenceCallback?: AgentEvidenceCallback;
+}
+
 function errorMessage(error: unknown): string {
   return error instanceof Error ? error.message : String(error);
 }
 
-export async function emitPostStepProbeEvidence({
+export async function captureProbeEvidence({
   v3,
-  stepIndices,
   url,
-  evidenceCallback,
   logger,
   warningMessage,
-}: EmitPostStepProbeEvidenceOptions): Promise<void> {
-  if (!evidenceCallback) return;
-
-  const indices = Array.isArray(stepIndices) ? stepIndices : [stepIndices];
-  if (indices.length === 0) return;
-
+}: CaptureProbeEvidenceOptions): Promise<{
+  url: string;
+  screenshot?: Buffer;
+  ariaTree?: string;
+}> {
   let probeUrl = url;
   let screenshot: Buffer | undefined;
   try {
@@ -44,21 +44,47 @@ export async function emitPostStepProbeEvidence({
   }
 
   const ariaTree = await captureAriaTreeProbe(v3);
+  return {
+    url: probeUrl,
+    ...(screenshot ? { screenshot } : {}),
+    ...(ariaTree !== undefined ? { ariaTree } : {}),
+  };
+}
+
+export async function emitPostStepProbeEvidence({
+  v3,
+  stepIndices,
+  url,
+  evidenceCallback,
+  logger,
+  warningMessage,
+}: EmitPostStepProbeEvidenceOptions): Promise<void> {
+  if (!evidenceCallback) return;
+
+  const indices = Array.isArray(stepIndices) ? stepIndices : [stepIndices];
+  if (indices.length === 0) return;
+
+  const probe = await captureProbeEvidence({
+    v3,
+    url,
+    logger,
+    warningMessage,
+  });
   for (const stepIndex of indices) {
-    if (screenshot) {
+    if (probe.screenshot) {
       await evidenceCallback({
         type: "screenshot",
         stepIndex,
-        screenshot,
-        url: probeUrl,
+        screenshot: probe.screenshot,
+        url: probe.url,
         evidenceRole: "probe",
       });
     }
     await evidenceCallback({
       type: "step_observed",
       stepIndex,
-      url: probeUrl,
-      ariaTree,
+      url: probe.url,
+      ariaTree: probe.ariaTree,
     });
   }
 }
diff --git a/packages/core/lib/v3/agent/utils/toolOutputEvidence.ts b/packages/core/lib/v3/agent/utils/toolOutputEvidence.ts
index b4be376757..9718181479 100644
--- a/packages/core/lib/v3/agent/utils/toolOutputEvidence.ts
+++ b/packages/core/lib/v3/agent/utils/toolOutputEvidence.ts
@@ -1,25 +1,76 @@
 import type { AgentStepFinishedEvent } from "../../types/public/agentEvidenceEvents.js";
 import type { ActionExecutionResult } from "../../types/public/agent.js";
 
+const ERROR_STRING_LIMIT = 1000;
+
+function isRecord(value: unknown): value is Record<string, unknown> {
+  return value !== null && typeof value === "object" && !Array.isArray(value);
+}
+
+function hasOwn(value: Record<string, unknown>, key: string): boolean {
+  return Object.prototype.hasOwnProperty.call(value, key);
+}
+
+function normalizeError(value: unknown): string | undefined {
+  if (value === undefined || value === null || value === false) {
+    return undefined;
+  }
+  if (value instanceof Error) {
+    return value.message;
+  }
+  if (typeof value === "string") {
+    return value;
+  }
+  if (
+    typeof value === "number" ||
+    typeof value === "boolean" ||
+    typeof value === "bigint"
+  ) {
+    return String(value);
+  }
+
+  let serialized: string;
+  try {
+    serialized = JSON.stringify(value) ?? String(value);
+  } catch {
+    serialized = String(value);
+  }
+  if (serialized.length <= ERROR_STRING_LIMIT) {
+    return serialized;
+  }
+  return `${serialized.slice(0, ERROR_STRING_LIMIT)}... [truncated]`;
+}
+
+function statusCandidates(toolResult: unknown): Record<string, unknown>[] {
+  if (!isRecord(toolResult)) {
+    return [];
+  }
+
+  const candidates = [toolResult];
+  const output = toolResult.output;
+  if (isRecord(output)) {
+    candidates.push(output);
+  }
+  return candidates;
+}
+
 export function inferToolOutput(
   toolResult: unknown,
 ): AgentStepFinishedEvent["toolOutput"] {
-  const error =
-    toolResult &&
-    typeof toolResult === "object" &&
-    "error" in toolResult &&
-    typeof (toolResult as { error?: unknown }).error === "string"
-      ? (toolResult as { error: string }).error
-      : undefined;
-
-  const isError =
-    toolResult &&
-    typeof toolResult === "object" &&
-    "isError" in toolResult &&
-    Boolean((toolResult as { isError?: unknown }).isError);
+  const candidates = statusCandidates(toolResult);
+  const error = candidates
+    .map((candidate) =>
+      hasOwn(candidate, "error") ? normalizeError(candidate.error) : undefined,
+    )
+    .find((message): message is string => message !== undefined);
+
+  const successFalse = candidates.some(
+    (candidate) => candidate.success === false,
+  );
+  const isError = candidates.some((candidate) => Boolean(candidate.isError));
 
   return {
-    ok: error === undefined && !isError,
+    ok: error === undefined && !isError && !successFalse,
     result: toolResult,
     error,
   };
diff --git a/packages/core/lib/v3/handlers/v3AgentHandler.ts b/packages/core/lib/v3/handlers/v3AgentHandler.ts
index 5281c1a70f..4481c3dc68 100644
--- a/packages/core/lib/v3/handlers/v3AgentHandler.ts
+++ b/packages/core/lib/v3/handlers/v3AgentHandler.ts
@@ -42,7 +42,10 @@ import {
   AgentAbortError,
 } from "../types/public/sdkErrors.js";
 import { handleDoneToolCall } from "../agent/utils/handleDoneToolCall.js";
-import { emitPostStepProbeEvidence } from "../agent/utils/postStepProbeEvidence.js";
+import {
+  captureProbeEvidence,
+  emitPostStepProbeEvidence,
+} from "../agent/utils/postStepProbeEvidence.js";
 import { wrapEvidenceCallback } from "../agent/utils/wrapEvidenceCallback.js";
 import { inferToolOutput } from "../agent/utils/toolOutputEvidence.js";
 import {
@@ -55,6 +58,19 @@ function getErrorMessage(error: unknown): string {
   return error instanceof Error ? error.message : String(error);
 }
 
+type FinalAnswerDraft = {
+  message: string;
+  output?: Record<string, unknown>;
+};
+
+interface StepHandlerOptions {
+  userCallback?:
+    | GenerateTextOnStepFinishCallback<ToolSet>
+    | StreamTextOnStepFinishCallback<ToolSet>;
+  evidenceCallback?: AgentEvidenceCallback;
+  onFinalAnswer?: (answer: FinalAnswerDraft) => void;
+}
+
 /**
  * Prepends a system message with cache control to the messages array.
  * The cache control providerOptions are used by Anthropic and ignored by other providers.
@@ -248,10 +264,7 @@ export class V3AgentHandler {
 
   private createStepHandler(
     state: AgentState,
-    userCallback?:
-      | GenerateTextOnStepFinishCallback<ToolSet>
-      | StreamTextOnStepFinishCallback<ToolSet>,
-    evidenceCallback?: AgentEvidenceCallback,
+    { userCallback, evidenceCallback, onFinalAnswer }: StepHandlerOptions,
   ) {
     // Monotonic step counter scoped to this execute() call. Each tool call in
     // the agent loop becomes one trajectory step. The counter feeds stepIndex
@@ -265,9 +278,7 @@ export class V3AgentHandler {
       });
 
       const stepIndicesInTurn: number[] = [];
-      let lastFinalAnswer:
-        | { message: string; output?: Record<string, unknown> }
-        | undefined;
+      let lastFinalAnswer: FinalAnswerDraft | undefined;
 
       if (event.toolCalls && event.toolCalls.length > 0) {
         for (let i = 0; i < event.toolCalls.length; i++) {
@@ -316,6 +327,7 @@ export class V3AgentHandler {
 
           const stepIndex = stepCounter++;
           stepIndicesInTurn.push(stepIndex);
+          const finishedAt = new Date().toISOString();
           await evidenceCallback?.({
             type: "step_finished",
             stepIndex,
@@ -326,7 +338,7 @@ export class V3AgentHandler {
                 : {},
             reasoning: event.text ?? "",
             toolOutput: inferToolOutput(toolResult),
-            finishedAt: new Date().toISOString(),
+            finishedAt,
           });
         }
         state.currentPageUrl = (await this.v3.context.awaitActivePage()).url();
@@ -347,10 +359,7 @@ export class V3AgentHandler {
       }
 
       if (lastFinalAnswer) {
-        await evidenceCallback?.({
-          type: "final_answer",
-          ...lastFinalAnswer,
-        });
+        onFinalAnswer?.(lastFinalAnswer);
       }
 
       if (userCallback) {
@@ -378,6 +387,7 @@ export class V3AgentHandler {
       completed: false,
       currentPageUrl: "",
     };
+    let finalAnswerFromDoneTool: FinalAnswerDraft | undefined;
 
     let messages: ModelMessage[] = [];
     let captchaSolver: CaptchaSolver | undefined;
@@ -425,6 +435,11 @@ export class V3AgentHandler {
         }
       }
 
+      const evidenceCallback = wrapEvidenceCallback(
+        callbacks?.onEvidence,
+        this.logger,
+      );
+
       const result = await this.llmClient.generateText({
         model: wrappedModel,
         messages: prependSystemMessage(systemPrompt, messages),
@@ -436,11 +451,13 @@ export class V3AgentHandler {
           callbacks?.prepareStep,
           captchaSolver,
         ),
-        onStepFinish: this.createStepHandler(
-          state,
-          callbacks?.onStepFinish,
-          wrapEvidenceCallback(callbacks?.onEvidence, this.logger),
-        ),
+        onStepFinish: this.createStepHandler(state, {
+          userCallback: callbacks?.onStepFinish,
+          evidenceCallback,
+          onFinalAnswer: (answer) => {
+            finalAnswerFromDoneTool = answer;
+          },
+        }),
         abortSignal: preparedOptions.signal,
         providerOptions: {
           google: { mediaResolution: "MEDIA_RESOLUTION_HIGH" },
@@ -457,6 +474,15 @@ export class V3AgentHandler {
         preparedOptions.output,
         this.logger,
       );
+      const output = doneResult.output ?? finalAnswerFromDoneTool?.output;
+      await this.emitFinalEvidence(
+        state,
+        {
+          message: state.finalMessage,
+          output,
+        },
+        evidenceCallback,
+      );
 
       return this.consolidateMetricsAndResult(
         startTime,
@@ -464,7 +490,7 @@ export class V3AgentHandler {
         doneResult.messages,
         result,
         maxSteps,
-        doneResult.output,
+        output,
       );
     } catch (error) {
       // Re-throw validation errors that should propagate to the caller
@@ -510,6 +536,7 @@ export class V3AgentHandler {
     // Highlight cursor defaults to true for hybrid mode, can be overridden
     const shouldHighlightCursor =
       streamOptions?.highlightCursor ?? this.mode === "hybrid";
+    let finalAnswerFromDoneTool: FinalAnswerDraft | undefined;
 
     const {
       options,
@@ -564,6 +591,11 @@ export class V3AgentHandler {
       rejectResult(error);
     };
 
+    const evidenceCallback = wrapEvidenceCallback(
+      callbacks?.onEvidence,
+      this.logger,
+    );
+
     let streamResult: ReturnType<typeof this.llmClient.streamText>;
     try {
       streamResult = this.llmClient.streamText({
@@ -576,11 +608,13 @@ export class V3AgentHandler {
           callbacks?.prepareStep,
           captchaSolver,
         ),
-        onStepFinish: this.createStepHandler(
-          state,
-          callbacks?.onStepFinish,
-          wrapEvidenceCallback(callbacks?.onEvidence, this.logger),
-        ),
+        onStepFinish: this.createStepHandler(state, {
+          userCallback: callbacks?.onStepFinish,
+          evidenceCallback,
+          onFinalAnswer: (answer) => {
+            finalAnswerFromDoneTool = answer;
+          },
+        }),
         onError: (event) => {
           captchaSolver?.dispose();
           if (callbacks?.onError) {
@@ -606,17 +640,29 @@ export class V3AgentHandler {
             options.instruction,
             options.output,
             this.logger,
-          ).then((doneResult) => {
-            const result = this.consolidateMetricsAndResult(
-              startTime,
-              state,
-              doneResult.messages,
-              event,
-              maxSteps,
-              doneResult.output,
-            );
-            resolveResult(result);
-          });
+          )
+            .then(async (doneResult) => {
+              const output =
+                doneResult.output ?? finalAnswerFromDoneTool?.output;
+              await this.emitFinalEvidence(
+                state,
+                {
+                  message: state.finalMessage,
+                  output,
+                },
+                evidenceCallback,
+              );
+              const result = this.consolidateMetricsAndResult(
+                startTime,
+                state,
+                doneResult.messages,
+                event,
+                maxSteps,
+                output,
+              );
+              resolveResult(result);
+            })
+            .catch(handleError);
         },
         onAbort: (event) => {
           captchaSolver?.dispose();
@@ -645,6 +691,26 @@ export class V3AgentHandler {
     return agentStreamResult;
   }
 
+  /** Emits `final_answer` evidence with a `captureProbeEvidence` observation. */
+  private async emitFinalEvidence(
+    state: AgentState,
+    finalAnswer: FinalAnswerDraft,
+    evidenceCallback?: AgentEvidenceCallback,
+  ): Promise<void> {
+    // Without a listener there is nothing to emit, so skip the probe too.
+    if (!evidenceCallback) {
+      return;
+    }
+    const observation = await captureProbeEvidence({
+      v3: this.v3,
+      url: state.currentPageUrl,
+      logger: this.logger,
+      warningMessage: "Warning: final harness probe failed",
+    });
+    const answer = { ...finalAnswer, observation };
+    await evidenceCallback({ type: "final_answer", ...answer });
+  }
+
   private consolidateMetricsAndResult(
     startTime: number,
     state: AgentState,
diff --git a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts
index bcb92f9a00..a41aa7fd07 100644
--- a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts
+++ b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts
@@ -7,10 +7,13 @@ import { GoogleCUAClient } from "../agent/GoogleCUAClient.js";
 import { OpenAICUAClient } from "../agent/OpenAICUAClient.js";
 import { mapKeyToPlaywright } from "../agent/utils/cuaKeyMapping.js";
 import { ensureXPath } from "../agent/utils/xpath.js";
-import { emitPostStepProbeEvidence } from "../agent/utils/postStepProbeEvidence.js";
+import {
+  captureProbeEvidence,
+  emitPostStepProbeEvidence,
+} from "../agent/utils/postStepProbeEvidence.js";
 import { wrapEvidenceCallback } from "../agent/utils/wrapEvidenceCallback.js";
-import { inferCuaToolOutput } from "../agent/utils/toolOutputEvidence.js";
 import { CuaEvidenceStepTracker } from "../agent/utils/cuaEvidenceStepTracker.js";
+import { inferToolOutput } from "../agent/utils/toolOutputEvidence.js";
 import {
   ActionExecutionResult,
   AgentAction,
@@ -131,6 +134,7 @@ export class V3CuaAgentHandler {
         defaultDelay;
       try {
         let executionResult: ActionExecutionResult | undefined;
+        const startedAt = new Date().toISOString();
         // Try to inject cursor before each action if enabled
         if (this.highlightCursor) {
           try {
@@ -161,7 +165,7 @@ export class V3CuaAgentHandler {
 
         action.timestamp = Date.now();
         if (shouldLog) {
-          await this.emitCuaActionStep(action, executionResult);
+          await this.emitCuaActionStep(action, executionResult, startedAt);
         }
 
         await new Promise((r) => setTimeout(r, waitBetween));
@@ -258,11 +262,26 @@ export class V3CuaAgentHandler {
     let result: AgentResult;
     try {
       result = await this.agent.execute({ options, logger: this.logger });
-      await this.evidenceCallback?.({
-        type: "final_answer",
-        message: result.message,
-        output: result.output,
-      });
+      if (this.evidenceCallback) {
+        let finalUrl = "";
+        try {
+          finalUrl = (await this.v3.context.awaitActivePage()).url();
+        } catch {
+          finalUrl = this.cuaEvidenceSteps.latestScreenshotUrl ?? "";
+        }
+        const observation = await captureProbeEvidence({
+          v3: this.v3,
+          url: finalUrl,
+          logger: this.logger,
+          warningMessage: "Warning: CUA final probe failed",
+        });
+        await this.evidenceCallback({
+          type: "final_answer",
+          message: result.message,
+          output: result.output,
+          observation,
+        });
+      }
     } finally {
       this.evidenceCallback = undefined;
       this.captchaSolver?.dispose();
@@ -815,6 +834,7 @@ export class V3CuaAgentHandler {
   private async emitCuaActionStep(
     action: AgentAction,
     result: ActionExecutionResult | undefined,
+    startedAt: string,
   ): Promise<void> {
     let pageUrl =
       typeof action.pageUrl === "string"
@@ -846,7 +866,8 @@ export class V3CuaAgentHandler {
       actionName: String(action.type),
       actionArgs,
       reasoning,
-      toolOutput: inferCuaToolOutput(result),
+      toolOutput: inferToolOutput(result ?? { success: true }),
+      startedAt,
       finishedAt: new Date().toISOString(),
     });
 
diff --git a/packages/core/lib/v3/types/public/agentEvidenceEvents.ts b/packages/core/lib/v3/types/public/agentEvidenceEvents.ts
index cf8e560779..23f90a5ef2 100644
--- a/packages/core/lib/v3/types/public/agentEvidenceEvents.ts
+++ b/packages/core/lib/v3/types/public/agentEvidenceEvents.ts
@@ -52,6 +52,8 @@ export interface AgentStepFinishedEvent {
     result: unknown;
     error?: string;
   };
+  /** ISO 8601 timestamp at which the step's tool execution started, when available. */
+  startedAt?: string;
   /** ISO 8601 timestamp at which the step finished. */
   finishedAt: string;
 }
@@ -70,6 +72,17 @@ export interface AgentStepObservedEvent {
   scroll?: { top: number; pageHeight: number };
 }
 
+export interface AgentFinalObservation {
+  /** Page URL at the time of terminal capture. */
+  url: string;
+  /** PNG bytes from page.screenshot(), when capture succeeds. */
+  screenshot?: Buffer;
+  /** Accessibility tree snapshot, when captured. */
+  ariaTree?: string;
+  /** Viewport scroll context, when captured. */
+  scroll?: { top: number; pageHeight: number };
+}
+
 /** Final answer emitted by the agent, when available. */
 export interface AgentFinalAnswerEvent {
   type: "final_answer";
@@ -77,6 +90,14 @@ export interface AgentFinalAnswerEvent {
   message: string;
   /** Optional structured output if the agent's output schema was set. */
   output?: Record<string, unknown>;
+  /**
+   * Independent terminal browser observation captured after the agent finishes.
+   *
+   * This preserves the legacy verifier behavior of evaluating against a final
+   * page screenshot even when the last agent output is a final answer rather
+   * than a browser action.
+   */
+  observation?: AgentFinalObservation;
 }
 
 export type AgentEvidenceCallback = (
diff --git a/packages/core/lib/v3/verifier/evidenceNormalization.ts b/packages/core/lib/v3/verifier/evidenceNormalization.ts
index 0012e84d6e..486ca68e63 100644
--- a/packages/core/lib/v3/verifier/evidenceNormalization.ts
+++ b/packages/core/lib/v3/verifier/evidenceNormalization.ts
@@ -81,6 +81,12 @@ export function buildAgentEvidenceFromStepFinished(
 
   if (typeof result === "string") {
     modalities.push({ type: "text", content: result });
+  } else if (
+    typeof result === "number" ||
+    typeof result === "boolean" ||
+    typeof result === "bigint"
+  ) {
+    modalities.push({ type: "text", content: String(result) });
   } else if (Buffer.isBuffer(result)) {
     modalities.push({
       type: "image",
diff --git a/packages/core/lib/v3/verifier/trajectory.ts b/packages/core/lib/v3/verifier/trajectory.ts
index 75e1372bbe..413ecc15fd 100644
--- a/packages/core/lib/v3/verifier/trajectory.ts
+++ b/packages/core/lib/v3/verifier/trajectory.ts
@@ -21,6 +21,10 @@ type RawRubric = {
   items?: unknown;
 };
 
+type PersistedProbeEvidence = ProbeEvidence & {
+  screenshotPath?: string;
+};
+
 /**
  * Convert dataset or generated rubric JSON into the public Stagehand shape.
  * Snake-case dataset fields are accepted here so serialized quirks do not leak
@@ -109,6 +113,7 @@ export async function loadTrajectoryFromDisk(dir: string): Promise<Trajectory> {
   const trajectoryPath = path.join(trajectoryDir, "trajectory.json");
   const raw = await fs.readFile(trajectoryPath, "utf8");
   const parsed = JSON.parse(raw) as Trajectory & {
+    finalObservation?: PersistedProbeEvidence;
     steps: Array<
       TrajectoryStep & {
         agentEvidence: {
@@ -126,7 +131,7 @@ export async function loadTrajectoryFromDisk(dir: string): Promise<Trajectory> {
             | { type: "json"; content: unknown }
           >;
         };
-        probeEvidence: ProbeEvidence;
+        probeEvidence: PersistedProbeEvidence;
       }
     >;
   };
@@ -151,9 +156,9 @@ export async function loadTrajectoryFromDisk(dir: string): Promise<Trajectory> {
     return resolved;
   };
 
-  for (const step of parsed.steps) {
-    // Rehydrate tier-2 probe screenshot from its on-disk file reference.
-    const probe = step.probeEvidence;
+  const hydrateProbeScreenshot = async (
+    probe: PersistedProbeEvidence | undefined,
+  ): Promise<void> => {
     if (probe?.screenshotPath && !probe.screenshot) {
       const resolved = resolveWithinTrajectoryDir(probe.screenshotPath);
       try {
@@ -163,6 +168,11 @@ export async function loadTrajectoryFromDisk(dir: string): Promise<Trajectory> {
         // evidence_insufficient path will handle it.
       }
     }
+  };
+
+  for (const step of parsed.steps) {
+    // Rehydrate tier-2 probe screenshot from its on-disk file reference.
+    await hydrateProbeScreenshot(step.probeEvidence);
 
     // Decode image modalities from disk references back to Buffer.
     if (step.agentEvidence?.modalities) {
@@ -205,6 +215,8 @@ export async function loadTrajectoryFromDisk(dir: string): Promise<Trajectory> {
     }
   }
 
+  await hydrateProbeScreenshot(parsed.finalObservation);
+
   return parsed;
 }
 
@@ -308,11 +320,23 @@ export async function writeTrajectoryDir(
     });
   }
 
+  const finalObservation: ProbeEvidence | undefined =
+    trajectory.finalObservation === undefined
+      ? undefined
+      : { ...trajectory.finalObservation };
+  if (finalObservation?.screenshot) {
+    const relPath = "screenshots/probe/final.png";
+    await fs.writeFile(path.join(dir, relPath), finalObservation.screenshot);
+    finalObservation.screenshotPath = relPath;
+    delete finalObservation.screenshot;
+  }
+
   // Image modalities carry imagePath instead of raw bytes on disk; cast
   // through unknown rather than widen Trajectory's type contract.
   const serialized = {
     ...trajectory,
     steps: serializableSteps,
+    ...(finalObservation ? { finalObservation } : {}),
   } as unknown;
 
   await fs.writeFile(
diff --git a/packages/core/lib/v3/verifier/types.ts b/packages/core/lib/v3/verifier/types.ts
index 04addaf9a4..c51ea2d47e 100644
--- a/packages/core/lib/v3/verifier/types.ts
+++ b/packages/core/lib/v3/verifier/types.ts
@@ -136,7 +136,7 @@ export type TrajectoryStatus = "complete" | "aborted" | "stalled" | "error";
  *   .trajectories/<run-id>/<task-id>/
  *     ├── task_data.json    — TaskSpec + result metadata
  *     ├── trajectory.json   — this object, with screenshotPath instead of bytes
- *     ├── screenshot_1.png  — probeEvidence.screenshot for step 1, etc.
+ *     ├── screenshots/      — step probe/agent images plus final observation
  *     ├── scores/
  *     │   └── result.json       — Result from V3Evaluator.verify()
  *     ├── core.log          — captured action log
@@ -146,6 +146,8 @@ export interface Trajectory {
   task: TaskSpec;
   steps: TrajectoryStep[];
   finalAnswer?: string;
+  /** Terminal page observation captured after the agent finishes. */
+  finalObservation?: ProbeEvidence;
   status: TrajectoryStatus;
   usage: TrajectoryUsage;
   timing: { startedAt: string; endedAt: string };
diff --git a/packages/core/lib/v3Evaluator.ts b/packages/core/lib/v3Evaluator.ts
index 5294f6c508..ee1bbc6d35 100644
--- a/packages/core/lib/v3Evaluator.ts
+++ b/packages/core/lib/v3Evaluator.ts
@@ -227,6 +227,10 @@ function collectLegacyScreenshots(trajectory: Trajectory): Buffer[] {
     }
   }
 
+  if (Buffer.isBuffer(trajectory.finalObservation?.screenshot)) {
+    screenshots.push(trajectory.finalObservation.screenshot);
+  }
+
   return screenshots;
 }
 
@@ -234,12 +238,14 @@ function renderLegacyAgentReasoning(
   trajectory: Trajectory,
 ): string | undefined {
   const stepLines = (trajectory.steps ?? []).map((step) => {
+    const status = step.toolOutput?.ok === false ? "Tool status: failed" : "";
     const output = step.toolOutput?.error
       ? `Tool error: ${step.toolOutput.error}`
       : `Tool output: ${stringifyForPrompt(step.toolOutput?.result)}`;
     return [
       `Step ${step.index}: ${step.actionName}`,
       step.reasoning ? `Reasoning: ${step.reasoning}` : undefined,
+      status || undefined,
       output,
     ]
       .filter(Boolean)
diff --git a/packages/core/tests/unit/tool-output-evidence.test.ts b/packages/core/tests/unit/tool-output-evidence.test.ts
new file mode 100644
index 0000000000..87b01d7529
--- /dev/null
+++ b/packages/core/tests/unit/tool-output-evidence.test.ts
@@ -0,0 +1,58 @@
+import { describe, expect, it } from "vitest";
+
+import { inferToolOutput } from "../../lib/v3/agent/utils/toolOutputEvidence.js";
+
+describe("inferToolOutput", () => {
+  it("preserves raw results while normalizing top-level failure status", () => {
+    const result = { success: false };
+    // `success: false` alone marks the step failed; no error text is invented.
+    expect(inferToolOutput(result)).toEqual({
+      ok: false,
+      result,
+      error: undefined,
+    });
+  });
+
+  it("normalizes one-level AI SDK output wrappers", () => {
+    const result = {
+      toolCallId: "call-1",
+      output: { success: false, error: { message: "not found" } },
+    };
+    // The `output` wrapper supplies the status; its error is JSON-serialized.
+    expect(inferToolOutput(result)).toEqual({
+      ok: false,
+      result,
+      error: '{"message":"not found"}',
+    });
+  });
+
+  it("handles isError and non-string errors", () => {
+    const result = { isError: true, error: new Error("bad input") };
+    // An Error instance is reported through its message.
+    expect(inferToolOutput(result)).toEqual({
+      ok: false,
+      result,
+      error: "bad input",
+    });
+  });
+
+  it("normalizes non-json error values", () => {
+    const result = { error: Symbol("bad input") };
+    // A symbol has no JSON form, so its String() rendering is reported.
+    expect(inferToolOutput(result)).toEqual({
+      ok: false,
+      result,
+      error: "Symbol(bad input)",
+    });
+  });
+
+  it("does not recursively treat page data as tool status", () => {
+    const result = { data: { success: false, error: "page field" } };
+    // Only the result and its `output` wrapper are inspected, not nested data.
+    expect(inferToolOutput(result)).toEqual({
+      ok: true,
+      result,
+      error: undefined,
+    });
+  });
+});
diff --git a/packages/core/tests/unit/v3-evaluator.test.ts b/packages/core/tests/unit/v3-evaluator.test.ts
index 1e9e3a0f19..2f488e5a8d 100644
--- a/packages/core/tests/unit/v3-evaluator.test.ts
+++ b/packages/core/tests/unit/v3-evaluator.test.ts
@@ -82,6 +82,39 @@ describe("V3Evaluator verifier facade", () => {
     expect(result.perCriterion).toBeUndefined();
   });
 
+  it("passes final observation screenshots to the legacy verifier adapter", async () => {
+    const taskSpec: TaskSpec = {
+      id: "final-observation",
+      instruction: "Complete the task",
+    };
+    const finalScreenshot = Buffer.from("final screenshot");
+    const trajectory = {
+      ...makeTrajectory(taskSpec),
+      finalObservation: {
+        url: "https://example.com/done",
+        screenshot: finalScreenshot,
+      },
+    };
+    const ask = vi.fn().mockResolvedValue({
+      evaluation: "YES",
+      reasoning: "The final screenshot shows completion.",
+    });
+    const evaluator = new V3Evaluator({} as V3, {
+      backend: "legacy",
+    });
+    Object.defineProperty(evaluator, "legacyEvaluator", {
+      value: { ask },
+    });
+
+    await evaluator.verify(trajectory);
+    // The final observation screenshot must be the only image handed to `ask`.
+    expect(ask).toHaveBeenCalledWith(
+      expect.objectContaining({
+        screenshot: [finalScreenshot],
+      }),
+    );
+  });
+
   it("keeps legacy tool output detail until the overall reasoning budget is reached", async () => {
     const taskSpec: TaskSpec = {
       id: "reasoning-budget",
diff --git a/packages/core/tests/unit/verifier-evidence-normalization.test.ts b/packages/core/tests/unit/verifier-evidence-normalization.test.ts
new file mode 100644
index 0000000000..7bf0d59b5a
--- /dev/null
+++ b/packages/core/tests/unit/verifier-evidence-normalization.test.ts
@@ -0,0 +1,20 @@
+import { describe, expect, it } from "vitest";
+
+import { buildAgentEvidenceFromStepFinished } from "../../lib/v3/verifier/evidenceNormalization.js";
+
+describe("buildAgentEvidenceFromStepFinished", () => {
+  it("captures primitive tool results as text evidence", () => {
+    const evidence = buildAgentEvidenceFromStepFinished({
+      type: "step_finished",
+      stepIndex: 0,
+      actionName: "check",
+      actionArgs: {},
+      reasoning: "",
+      toolOutput: { ok: true, result: false },
+      startedAt: new Date(0).toISOString(),
+      finishedAt: new Date(1).toISOString(),
+    });
+    // The boolean result becomes a text modality holding its String() form.
+    expect(evidence.modalities).toEqual([{ type: "text", content: "false" }]);
+  });
+});
diff --git a/packages/core/tests/unit/verifier-trajectory.test.ts b/packages/core/tests/unit/verifier-trajectory.test.ts
index 7c9351d135..752d01beed 100644
--- a/packages/core/tests/unit/verifier-trajectory.test.ts
+++ b/packages/core/tests/unit/verifier-trajectory.test.ts
@@ -64,8 +64,10 @@ describe("verifier trajectory utilities", () => {
   it("loads trajectory screenshots and image modalities from disk", async () => {
     const dir = await mkdtemp(path.join(tmpdir(), "stagehand-verifier-"));
     const screenshot = Buffer.from("probe screenshot");
+    const finalScreenshot = Buffer.from("final screenshot");
     const agentImage = Buffer.from("agent image");
     await writeFile(path.join(dir, "screenshot_1.png"), screenshot);
+    await writeFile(path.join(dir, "final.png"), finalScreenshot);
     await mkdir(path.join(dir, "screenshots", "agent"), { recursive: true });
     await writeFile(
       path.join(dir, "screenshots", "agent", "1.png"),
@@ -102,6 +104,10 @@ describe("verifier trajectory utilities", () => {
             finishedAt: new Date(0).toISOString(),
           },
         ],
+        finalObservation: {
+          url: "https://example.com/done",
+          screenshotPath: "final.png",
+        },
       }),
     );
 
@@ -109,6 +115,7 @@ describe("verifier trajectory utilities", () => {
     const modality = trajectory.steps[0].agentEvidence.modalities[0];
 
     expect(trajectory.steps[0].probeEvidence.screenshot).toEqual(screenshot);
+    expect(trajectory.finalObservation?.screenshot).toEqual(finalScreenshot);
     expect(modality.type).toBe("image");
     if (modality.type === "image") {
       expect(modality.bytes).toEqual(agentImage);
@@ -174,6 +181,10 @@ describe("verifier trajectory utilities", () => {
         startedAt: new Date(0).toISOString(),
         endedAt: new Date(0).toISOString(),
       },
+      finalObservation: {
+        url: "https://example.com/done",
+        screenshot: Buffer.from("final screenshot"),
+      },
       steps: [
         {
           index: 0,
@@ -220,6 +231,12 @@ describe("verifier trajectory utilities", () => {
     expect(trajectory.steps[0].toolOutput.result.output.screenshotBase64).toBe(
       "[redacted inline image payload]",
     );
+    expect(trajectory.finalObservation.screenshotPath).toBe(
+      "screenshots/probe/final.png",
+    );
+    await expect(
+      readFile(path.join(dir, "screenshots", "probe", "final.png")),
+    ).resolves.toEqual(Buffer.from("final screenshot"));
   });
 
   it("rejects screenshot paths outside the trajectory directory", async () => {
diff --git a/packages/evals/framework/trajectoryRecorder.ts b/packages/evals/framework/trajectoryRecorder.ts
index da2b9b5da8..c48450908a 100644
--- a/packages/evals/framework/trajectoryRecorder.ts
+++ b/packages/evals/framework/trajectoryRecorder.ts
@@ -31,6 +31,7 @@ interface PartialStep {
   agentEvidence: AgentEvidence;
   probeEvidence: ProbeEvidence;
   toolOutput: { ok: boolean; result: unknown; error?: string };
+  startedAt: string;
   finishedAt: string;
 }
 
@@ -72,6 +73,7 @@ export class TrajectoryRecorder {
   // fire in one microtask.
   private readonly partialSteps = new Map<number, Partial<PartialStep>>();
   private finalAnswerEvent?: AgentFinalAnswerEvent;
+  private finalObservation?: ProbeEvidence;
   private startedAt = "";
   private endedAt = "";
 
@@ -115,6 +117,7 @@ export class TrajectoryRecorder {
       ...e.toolOutput,
       result: redactInlineImagePayloads(e.toolOutput.result, e.actionName),
     };
+    partial.startedAt = e.startedAt ?? e.finishedAt;
     partial.finishedAt = e.finishedAt;
     partial.agentEvidence = mergeAgentEvidence(
       partial.agentEvidence,
@@ -133,6 +136,20 @@ export class TrajectoryRecorder {
 
   private onFinalAnswer(e: AgentFinalAnswerEvent): void {
     this.finalAnswerEvent = e;
+    const { observation } = e;
+    if (!observation) {
+      return;
+    }
+
+    // Carry over only the fields the probe actually captured, so that a
+    // field which was not captured stays an absent key instead of being
+    // stored as an explicit `undefined`.
+    const { url, screenshot, ariaTree, scroll } = observation;
+    const captured: ProbeEvidence = { url };
+    if (screenshot) captured.screenshot = screenshot;
+    if (ariaTree !== undefined) captured.ariaTree = ariaTree;
+    if (scroll !== undefined) captured.scroll = scroll;
+    this.finalObservation = captured;
   }
 
   constructor(opts: TrajectoryRecorderOptions) {
@@ -181,6 +198,9 @@ export class TrajectoryRecorder {
       task: this.taskSpec,
       steps,
       finalAnswer: opts.finalAnswer ?? this.finalAnswerEvent?.message,
+      ...(this.finalObservation
+        ? { finalObservation: this.finalObservation }
+        : {}),
       status: opts.status,
       usage: { ...ZERO_USAGE, ...(opts.usage ?? {}) },
       timing: { startedAt: this.startedAt, endedAt: this.endedAt },
@@ -197,6 +217,7 @@ export class TrajectoryRecorder {
   cancel(): void {
     this.partialSteps.clear();
     this.finalAnswerEvent = undefined;
+    this.finalObservation = undefined;
   }
 
   /** Where the trajectory dir lives (whether or not it was persisted). */
@@ -259,6 +280,7 @@ export class TrajectoryRecorder {
       if (
         p.actionName === undefined ||
         p.toolOutput === undefined ||
+        p.startedAt === undefined ||
         p.finishedAt === undefined
       ) {
         // Provider-only screenshot refreshes are transport evidence for the
@@ -274,7 +296,7 @@ export class TrajectoryRecorder {
         agentEvidence: p.agentEvidence ?? { modalities: [] },
         probeEvidence: p.probeEvidence ?? {},
         toolOutput: p.toolOutput,
-        startedAt: this.startedAt,
+        startedAt: p.startedAt,
         finishedAt: p.finishedAt,
       });
     }
diff --git a/packages/evals/tests/framework/trajectoryRecorder.test.ts b/packages/evals/tests/framework/trajectoryRecorder.test.ts
index 38443c5dc0..743d3b4ecd 100644
--- a/packages/evals/tests/framework/trajectoryRecorder.test.ts
+++ b/packages/evals/tests/framework/trajectoryRecorder.test.ts
@@ -75,6 +75,7 @@ describe("TrajectoryRecorder", () => {
         ok: true,
         result: { economy: "$100", business: "$250" },
       },
+      startedAt: new Date(0).toISOString(),
       finishedAt: new Date(0).toISOString(),
     });
     recorder.record({
@@ -86,6 +87,11 @@ describe("TrajectoryRecorder", () => {
     recorder.record({
       type: "final_answer",
       message: "Business is $150 more than economy.",
+      observation: {
+        url: "https://example.com/checkout",
+        screenshot: Buffer.from("final-screen"),
+        ariaTree: "RootWebArea\nStaticText: Complete",
+      },
     });
 
     const trajectory = await recorder.finish({
@@ -109,6 +115,7 @@ describe("TrajectoryRecorder", () => {
       },
     });
     expect(trajectory.steps[0].probeEvidence.screenshot).toEqual(screenshot);
+    expect(trajectory.steps[0].startedAt).toBe(new Date(0).toISOString());
     expect(trajectory.steps[0].agentEvidence.modalities).toEqual(
       expect.arrayContaining([
         { type: "image", bytes: screenshot, mediaType: "image/png" },
@@ -117,6 +124,13 @@ describe("TrajectoryRecorder", () => {
       ]),
     );
     expect(trajectory.finalAnswer).toBe("Business is $150 more than economy.");
+    expect(trajectory.finalObservation).toMatchObject({
+      url: "https://example.com/checkout",
+      ariaTree: "RootWebArea\nStaticText: Complete",
+    });
+    expect(trajectory.finalObservation?.screenshot).toEqual(
+      Buffer.from("final-screen"),
+    );
   });
 
   it("persists trajectory files and evaluator results", async () => {
@@ -151,6 +165,7 @@ describe("TrajectoryRecorder", () => {
       actionArgs: { instruction: "Search fares" },
       reasoning: "Search for fares.",
       toolOutput: { ok: true, result: "done" },
+      startedAt: new Date(0).toISOString(),
       finishedAt: new Date(0).toISOString(),
     });
     recorder.record({
@@ -158,6 +173,14 @@ describe("TrajectoryRecorder", () => {
       stepIndex: 0,
       url: "https://example.com/search",
     });
+    recorder.record({
+      type: "final_answer",
+      message: "Complete.",
+      observation: {
+        url: "https://example.com/complete",
+        screenshot: Buffer.from("final-screen"),
+      },
+    });
 
     await recorder.finish({ status: "complete" });
     await recorder.persistResult({
@@ -179,6 +202,9 @@ describe("TrajectoryRecorder", () => {
     await expect(
       fs.readFile(path.join(taskDir, "screenshots", "probe", "1.png")),
     ).resolves.toEqual(screenshot);
+    await expect(
+      fs.readFile(path.join(taskDir, "screenshots", "probe", "final.png")),
+    ).resolves.toEqual(Buffer.from("final-screen"));
     await expect(
       fs.readFile(path.join(taskDir, "screenshots", "agent", "1.png")),
     ).resolves.toEqual(screenshot);
@@ -192,6 +218,9 @@ describe("TrajectoryRecorder", () => {
     expect(trajectory.steps[0].probeEvidence.screenshotPath).toBe(
       "screenshots/probe/1.png",
     );
+    expect(trajectory.finalObservation.screenshotPath).toBe(
+      "screenshots/probe/final.png",
+    );
     expect(trajectory.steps[0].agentEvidence.modalities).toContainEqual({
       type: "image",
       imagePath: "screenshots/agent/1.png",
@@ -207,6 +236,29 @@ describe("TrajectoryRecorder", () => {
     });
   });
 
+  it("normalizes missing step startedAt to finishedAt", async () => {
+    const recorder = new TrajectoryRecorder({
+      taskSpec: makeTaskSpec(),
+      persist: false,
+    });
+    const finishedAt = new Date(1).toISOString();
+
+    recorder.record({
+      type: "step_finished",
+      stepIndex: 0,
+      actionName: "extract",
+      actionArgs: { instruction: "Read fares" },
+      reasoning: "",
+      toolOutput: { ok: true, result: false },
+      finishedAt,
+    });
+
+    const trajectory = await recorder.finish({ status: "complete" });
+
+    expect(trajectory.steps[0].startedAt).toBe(finishedAt);
+    expect(trajectory.steps[0].finishedAt).toBe(finishedAt);
+  });
+
   it("lifts inline screenshot payloads into image evidence and redacts JSON", async () => {
     const inlineScreenshot =
       Buffer.from("inline screenshot").toString("base64");
@@ -231,6 +283,7 @@ describe("TrajectoryRecorder", () => {
           },
         },
       },
+      startedAt: new Date(0).toISOString(),
       finishedAt: new Date(0).toISOString(),
     });
 

From 125246268e9b79b66aea483b9d3e6c97ab6c9d4d Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Fri, 22 May 2026 20:10:39 -0700
Subject: [PATCH 20/27] Remove verifier trajectory timestamps

---
 .../core/lib/v3/handlers/v3AgentHandler.ts    |  2 --
 .../core/lib/v3/handlers/v3CuaAgentHandler.ts |  6 +---
 .../v3/types/public/agentEvidenceEvents.ts    |  4 ---
 packages/core/lib/v3/verifier/trajectory.ts   | 16 ----------
 packages/core/lib/v3/verifier/types.ts        |  8 +----
 packages/core/tests/unit/v3-evaluator.test.ts |  6 ----
 .../verifier-evidence-normalization.test.ts   |  2 --
 .../tests/unit/verifier-trajectory.test.ts    | 24 --------------
 .../evals/framework/trajectoryRecorder.ts     | 25 ++-------------
 .../framework/trajectoryRecorder.test.ts      | 31 -------------------
 10 files changed, 5 insertions(+), 119 deletions(-)

diff --git a/packages/core/lib/v3/handlers/v3AgentHandler.ts b/packages/core/lib/v3/handlers/v3AgentHandler.ts
index 4481c3dc68..68858f41be 100644
--- a/packages/core/lib/v3/handlers/v3AgentHandler.ts
+++ b/packages/core/lib/v3/handlers/v3AgentHandler.ts
@@ -327,7 +327,6 @@ export class V3AgentHandler {
 
           const stepIndex = stepCounter++;
           stepIndicesInTurn.push(stepIndex);
-          const finishedAt = new Date().toISOString();
           await evidenceCallback?.({
             type: "step_finished",
             stepIndex,
@@ -338,7 +337,6 @@ export class V3AgentHandler {
                 : {},
             reasoning: event.text ?? "",
             toolOutput: inferToolOutput(toolResult),
-            finishedAt,
           });
         }
         state.currentPageUrl = (await this.v3.context.awaitActivePage()).url();
diff --git a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts
index a41aa7fd07..5c495b5c2f 100644
--- a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts
+++ b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts
@@ -134,7 +134,6 @@ export class V3CuaAgentHandler {
         defaultDelay;
       try {
         let executionResult: ActionExecutionResult | undefined;
-        const startedAt = new Date().toISOString();
         // Try to inject cursor before each action if enabled
         if (this.highlightCursor) {
           try {
@@ -165,7 +164,7 @@ export class V3CuaAgentHandler {
 
         action.timestamp = Date.now();
         if (shouldLog) {
-          await this.emitCuaActionStep(action, executionResult, startedAt);
+          await this.emitCuaActionStep(action, executionResult);
         }
 
         await new Promise((r) => setTimeout(r, waitBetween));
@@ -834,7 +833,6 @@ export class V3CuaAgentHandler {
   private async emitCuaActionStep(
     action: AgentAction,
     result: ActionExecutionResult | undefined,
-    startedAt: string,
   ): Promise<void> {
     let pageUrl =
       typeof action.pageUrl === "string"
@@ -867,8 +865,6 @@ export class V3CuaAgentHandler {
       actionArgs,
       reasoning,
       toolOutput: inferToolOutput(result ?? { success: true }),
-      startedAt,
-      finishedAt: new Date().toISOString(),
     });
 
     // Post-action tier-2 probe. The pre-action screenshot from
diff --git a/packages/core/lib/v3/types/public/agentEvidenceEvents.ts b/packages/core/lib/v3/types/public/agentEvidenceEvents.ts
index 23f90a5ef2..dcd0e89e70 100644
--- a/packages/core/lib/v3/types/public/agentEvidenceEvents.ts
+++ b/packages/core/lib/v3/types/public/agentEvidenceEvents.ts
@@ -52,10 +52,6 @@ export interface AgentStepFinishedEvent {
     result: unknown;
     error?: string;
   };
-  /** ISO 8601 timestamp at which the step's tool execution started, when available. */
-  startedAt?: string;
-  /** ISO 8601 timestamp at which the step finished. */
-  finishedAt: string;
 }
 
 /**
diff --git a/packages/core/lib/v3/verifier/trajectory.ts b/packages/core/lib/v3/verifier/trajectory.ts
index 413ecc15fd..b8722a64df 100644
--- a/packages/core/lib/v3/verifier/trajectory.ts
+++ b/packages/core/lib/v3/verifier/trajectory.ts
@@ -253,7 +253,6 @@ export function shouldPersistTrajectory(
  *     ├── screenshots/
  *     │   ├── probe/<N>.png
  *     │   └── agent/<N>[_M].png
- *     ├── times.json
  *     ├── scores/            (empty; populated separately)
  *     └── core.log
  *
@@ -357,19 +356,6 @@ export async function writeTrajectoryDir(
     ),
   );
 
-  await fs.writeFile(
-    path.join(dir, "times.json"),
-    JSON.stringify(
-      {
-        timing: trajectory.timing,
-        usage: trajectory.usage,
-        stepCount: trajectory.steps.length,
-      },
-      null,
-      2,
-    ),
-  );
-
   await fs.mkdir(path.join(dir, "scores"), { recursive: true });
   await fs.writeFile(path.join(dir, "core.log"), coreLog(trajectory));
 }
@@ -384,8 +370,6 @@ function coreLog(trajectory: Trajectory): string {
           url: step.probeEvidence.url ?? null,
           ok: step.toolOutput.ok,
           reasoning: step.reasoning || undefined,
-          startedAt: step.startedAt,
-          finishedAt: step.finishedAt,
         }),
       )
       .join("\n") + "\n"
diff --git a/packages/core/lib/v3/verifier/types.ts b/packages/core/lib/v3/verifier/types.ts
index c51ea2d47e..5431397092 100644
--- a/packages/core/lib/v3/verifier/types.ts
+++ b/packages/core/lib/v3/verifier/types.ts
@@ -119,10 +119,6 @@ export interface TrajectoryStep {
   agentEvidence: AgentEvidence;
   probeEvidence: ProbeEvidence;
   toolOutput: ToolOutput;
-  /** ISO 8601 timestamp when the step's tool execution started. */
-  startedAt: string;
-  /** ISO 8601 timestamp when the step's tool execution finished. */
-  finishedAt: string;
 }
 
 /** Terminal status of the agent run. */
@@ -139,8 +135,7 @@ export type TrajectoryStatus = "complete" | "aborted" | "stalled" | "error";
  *     ├── screenshots/      — step probe/agent images plus final observation
  *     ├── scores/
  *     │   └── result.json       — Result from V3Evaluator.verify()
- *     ├── core.log          — captured action log
- *     └── times.json        — step timing + token usage
+ *     └── core.log          — captured action log
  */
 export interface Trajectory {
   task: TaskSpec;
@@ -150,7 +145,6 @@ export interface Trajectory {
   finalObservation?: ProbeEvidence;
   status: TrajectoryStatus;
   usage: TrajectoryUsage;
-  timing: { startedAt: string; endedAt: string };
 }
 
 /** Score for a single rubric criterion after evidence analysis + rescoring. */
diff --git a/packages/core/tests/unit/v3-evaluator.test.ts b/packages/core/tests/unit/v3-evaluator.test.ts
index 2f488e5a8d..b18650d8ea 100644
--- a/packages/core/tests/unit/v3-evaluator.test.ts
+++ b/packages/core/tests/unit/v3-evaluator.test.ts
@@ -197,10 +197,6 @@ function makeEmptyTrajectory(taskSpec: TaskSpec): Trajectory {
       input_tokens: 0,
       output_tokens: 0,
     },
-    timing: {
-      startedAt: new Date(0).toISOString(),
-      endedAt: new Date(0).toISOString(),
-    },
   };
 }
 
@@ -228,8 +224,6 @@ function makeTrajectory(
           ok: true,
           result: options.toolResult ?? "done",
         },
-        startedAt: new Date(0).toISOString(),
-        finishedAt: new Date(0).toISOString(),
       },
     ],
     finalAnswer: options.finalAnswer,
diff --git a/packages/core/tests/unit/verifier-evidence-normalization.test.ts b/packages/core/tests/unit/verifier-evidence-normalization.test.ts
index 7bf0d59b5a..5b6ee249b2 100644
--- a/packages/core/tests/unit/verifier-evidence-normalization.test.ts
+++ b/packages/core/tests/unit/verifier-evidence-normalization.test.ts
@@ -11,8 +11,6 @@ describe("buildAgentEvidenceFromStepFinished", () => {
       actionArgs: {},
       reasoning: "",
       toolOutput: { ok: true, result: false },
-      startedAt: new Date(0).toISOString(),
-      finishedAt: new Date(1).toISOString(),
     });
 
     expect(evidence.modalities).toEqual([{ type: "text", content: "false" }]);
diff --git a/packages/core/tests/unit/verifier-trajectory.test.ts b/packages/core/tests/unit/verifier-trajectory.test.ts
index 752d01beed..e57f2bb3c4 100644
--- a/packages/core/tests/unit/verifier-trajectory.test.ts
+++ b/packages/core/tests/unit/verifier-trajectory.test.ts
@@ -79,10 +79,6 @@ describe("verifier trajectory utilities", () => {
         task: { id: "task", instruction: "Do the task" },
         status: "complete",
         usage: { input_tokens: 0, output_tokens: 0 },
-        timing: {
-          startedAt: new Date(0).toISOString(),
-          endedAt: new Date(0).toISOString(),
-        },
         steps: [
           {
             index: 0,
@@ -100,8 +96,6 @@ describe("verifier trajectory utilities", () => {
             },
             probeEvidence: { screenshotPath: "screenshot_1.png" },
             toolOutput: { ok: true, result: null },
-            startedAt: new Date(0).toISOString(),
-            finishedAt: new Date(0).toISOString(),
           },
         ],
         finalObservation: {
@@ -131,10 +125,6 @@ describe("verifier trajectory utilities", () => {
         task: { id: "task", instruction: "Do the task" },
         status: "complete",
         usage: { input_tokens: 0, output_tokens: 0 },
-        timing: {
-          startedAt: new Date(0).toISOString(),
-          endedAt: new Date(0).toISOString(),
-        },
         steps: [
           {
             index: 0,
@@ -152,8 +142,6 @@ describe("verifier trajectory utilities", () => {
             },
             probeEvidence: {},
             toolOutput: { ok: true, result: null },
-            startedAt: new Date(0).toISOString(),
-            finishedAt: new Date(0).toISOString(),
           },
         ],
       }),
@@ -177,10 +165,6 @@ describe("verifier trajectory utilities", () => {
       task: { id: "task", instruction: "Do the task" },
       status: "complete",
       usage: { input_tokens: 0, output_tokens: 0 },
-      timing: {
-        startedAt: new Date(0).toISOString(),
-        endedAt: new Date(0).toISOString(),
-      },
       finalObservation: {
         url: "https://example.com/done",
         screenshot: Buffer.from("final screenshot"),
@@ -214,8 +198,6 @@ describe("verifier trajectory utilities", () => {
               },
             },
           },
-          startedAt: new Date(0).toISOString(),
-          finishedAt: new Date(0).toISOString(),
         },
       ],
     });
@@ -247,10 +229,6 @@ describe("verifier trajectory utilities", () => {
         task: { id: "task", instruction: "Do the task" },
         status: "complete",
         usage: { input_tokens: 0, output_tokens: 0 },
-        timing: {
-          startedAt: new Date(0).toISOString(),
-          endedAt: new Date(0).toISOString(),
-        },
         steps: [
           {
             index: 0,
@@ -260,8 +238,6 @@ describe("verifier trajectory utilities", () => {
             agentEvidence: { modalities: [] },
             probeEvidence: { screenshotPath: "../../../etc/passwd" },
             toolOutput: { ok: true, result: null },
-            startedAt: new Date(0).toISOString(),
-            finishedAt: new Date(0).toISOString(),
           },
         ],
       }),
diff --git a/packages/evals/framework/trajectoryRecorder.ts b/packages/evals/framework/trajectoryRecorder.ts
index c48450908a..84b57b43a4 100644
--- a/packages/evals/framework/trajectoryRecorder.ts
+++ b/packages/evals/framework/trajectoryRecorder.ts
@@ -31,8 +31,6 @@ interface PartialStep {
   agentEvidence: AgentEvidence;
   probeEvidence: ProbeEvidence;
   toolOutput: { ok: boolean; result: unknown; error?: string };
-  startedAt: string;
-  finishedAt: string;
 }
 
 export interface TrajectoryRecorderOptions {
@@ -74,8 +72,6 @@ export class TrajectoryRecorder {
   private readonly partialSteps = new Map<number, Partial<PartialStep>>();
   private finalAnswerEvent?: AgentFinalAnswerEvent;
   private finalObservation?: ProbeEvidence;
-  private startedAt = "";
-  private endedAt = "";
 
   private onScreenshot(e: AgentScreenshotEvidenceEvent): void {
     const partial = this.ensurePartial(e.stepIndex);
@@ -117,8 +113,6 @@ export class TrajectoryRecorder {
       ...e.toolOutput,
       result: redactInlineImagePayloads(e.toolOutput.result, e.actionName),
     };
-    partial.startedAt = e.startedAt ?? e.finishedAt;
-    partial.finishedAt = e.finishedAt;
     partial.agentEvidence = mergeAgentEvidence(
       partial.agentEvidence,
       buildAgentEvidenceFromStepFinished(e),
@@ -160,15 +154,13 @@ export class TrajectoryRecorder {
     this.persistEnabled = shouldPersistTrajectory(opts.persist);
   }
 
-  /** Mark the beginning of collection. Call once before agent.execute(). */
+  /** Mark the beginning of collection. Retained as a no-op for compatibility. */
   start(): void {
-    if (this.startedAt) return;
-    this.startedAt = new Date().toISOString();
+    return;
   }
 
   /** Ingest an evidence callback event from agent.execute(). */
   record(event: AgentEvidenceEvent): void {
-    if (!this.startedAt) this.start();
     switch (event.type) {
       case "screenshot":
         this.onScreenshot(event);
@@ -190,9 +182,6 @@ export class TrajectoryRecorder {
    * write the on-disk layout. Idempotent.
    */
   async finish(opts: TrajectoryFinishOptions): Promise<Trajectory> {
-    if (!this.startedAt) this.start();
-    this.endedAt = new Date().toISOString();
-
     const steps = this.assembleSteps();
     const trajectory: Trajectory = {
       task: this.taskSpec,
@@ -203,7 +192,6 @@ export class TrajectoryRecorder {
         : {}),
       status: opts.status,
       usage: { ...ZERO_USAGE, ...(opts.usage ?? {}) },
-      timing: { startedAt: this.startedAt, endedAt: this.endedAt },
     };
 
     if (this.persistEnabled) {
@@ -277,12 +265,7 @@ export class TrajectoryRecorder {
     const indices = [...this.partialSteps.keys()].sort((a, b) => a - b);
     for (const i of indices) {
       const p = this.partialSteps.get(i)!;
-      if (
-        p.actionName === undefined ||
-        p.toolOutput === undefined ||
-        p.startedAt === undefined ||
-        p.finishedAt === undefined
-      ) {
+      if (p.actionName === undefined || p.toolOutput === undefined) {
         // Provider-only screenshot refreshes are transport evidence for the
         // next CUA action. If no action arrives for this index, there is no
         // completed trajectory step to persist.
@@ -296,8 +279,6 @@ export class TrajectoryRecorder {
         agentEvidence: p.agentEvidence ?? { modalities: [] },
         probeEvidence: p.probeEvidence ?? {},
         toolOutput: p.toolOutput,
-        startedAt: p.startedAt,
-        finishedAt: p.finishedAt,
       });
     }
     return out;
diff --git a/packages/evals/tests/framework/trajectoryRecorder.test.ts b/packages/evals/tests/framework/trajectoryRecorder.test.ts
index 743d3b4ecd..57f4c93c55 100644
--- a/packages/evals/tests/framework/trajectoryRecorder.test.ts
+++ b/packages/evals/tests/framework/trajectoryRecorder.test.ts
@@ -75,8 +75,6 @@ describe("TrajectoryRecorder", () => {
         ok: true,
         result: { economy: "$100", business: "$250" },
       },
-      startedAt: new Date(0).toISOString(),
-      finishedAt: new Date(0).toISOString(),
     });
     recorder.record({
       type: "step_observed",
@@ -115,7 +113,6 @@ describe("TrajectoryRecorder", () => {
       },
     });
     expect(trajectory.steps[0].probeEvidence.screenshot).toEqual(screenshot);
-    expect(trajectory.steps[0].startedAt).toBe(new Date(0).toISOString());
     expect(trajectory.steps[0].agentEvidence.modalities).toEqual(
       expect.arrayContaining([
         { type: "image", bytes: screenshot, mediaType: "image/png" },
@@ -165,8 +162,6 @@ describe("TrajectoryRecorder", () => {
       actionArgs: { instruction: "Search fares" },
       reasoning: "Search for fares.",
       toolOutput: { ok: true, result: "done" },
-      startedAt: new Date(0).toISOString(),
-      finishedAt: new Date(0).toISOString(),
     });
     recorder.record({
       type: "step_observed",
@@ -195,7 +190,6 @@ describe("TrajectoryRecorder", () => {
         "scores",
         "screenshots",
         "task_data.json",
-        "times.json",
         "trajectory.json",
       ]),
     );
@@ -236,29 +230,6 @@ describe("TrajectoryRecorder", () => {
     });
   });
 
-  it("normalizes missing step startedAt to finishedAt", async () => {
-    const recorder = new TrajectoryRecorder({
-      taskSpec: makeTaskSpec(),
-      persist: false,
-    });
-    const finishedAt = new Date(1).toISOString();
-
-    recorder.record({
-      type: "step_finished",
-      stepIndex: 0,
-      actionName: "extract",
-      actionArgs: { instruction: "Read fares" },
-      reasoning: "",
-      toolOutput: { ok: true, result: false },
-      finishedAt,
-    });
-
-    const trajectory = await recorder.finish({ status: "complete" });
-
-    expect(trajectory.steps[0].startedAt).toBe(finishedAt);
-    expect(trajectory.steps[0].finishedAt).toBe(finishedAt);
-  });
-
   it("lifts inline screenshot payloads into image evidence and redacts JSON", async () => {
     const inlineScreenshot =
       Buffer.from("inline screenshot").toString("base64");
@@ -283,8 +254,6 @@ describe("TrajectoryRecorder", () => {
           },
         },
       },
-      startedAt: new Date(0).toISOString(),
-      finishedAt: new Date(0).toISOString(),
     });
 
     const trajectory = await recorder.finish({ status: "complete" });

From b4a1537e4faca2eaa361fe4ac190469042029521 Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Sat, 23 May 2026 19:06:36 -0700
Subject: [PATCH 21/27] refactor(verifier): simplify evidence event sequencing

---
 .../v3/agent/utils/cuaEvidenceStepTracker.ts  |  55 -----
 .../v3/agent/utils/postStepProbeEvidence.ts   |  27 +--
 .../core/lib/v3/handlers/v3AgentHandler.ts    |  18 +-
 .../core/lib/v3/handlers/v3CuaAgentHandler.ts |  25 +-
 .../v3/types/public/agentEvidenceEvents.ts    |  14 +-
 .../unit/cua-evidence-step-tracker.test.ts    |  55 -----
 .../tests/unit/tool-output-evidence.test.ts   |  85 +++----
 .../verifier-evidence-normalization.test.ts   |  49 +++-
 .../evals/framework/trajectoryRecorder.ts     | 150 ++++++------
 .../framework/trajectoryRecorder.test.ts      | 214 +++++++-----------
 10 files changed, 260 insertions(+), 432 deletions(-)
 delete mode 100644 packages/core/lib/v3/agent/utils/cuaEvidenceStepTracker.ts
 delete mode 100644 packages/core/tests/unit/cua-evidence-step-tracker.test.ts

diff --git a/packages/core/lib/v3/agent/utils/cuaEvidenceStepTracker.ts b/packages/core/lib/v3/agent/utils/cuaEvidenceStepTracker.ts
deleted file mode 100644
index 356cc6a98c..0000000000
--- a/packages/core/lib/v3/agent/utils/cuaEvidenceStepTracker.ts
+++ /dev/null
@@ -1,55 +0,0 @@
-import type { AgentScreenshotEvidenceEvent } from "../../types/public/agentEvidenceEvents.js";
-
-export interface PairedCuaActionStep {
-  stepIndex: number;
-  replayScreenshot?: AgentScreenshotEvidenceEvent;
-}
-
-export class CuaEvidenceStepTracker {
-  private nextStepIndex = 0;
-  private latestScreenshot?: AgentScreenshotEvidenceEvent;
-  private latestScreenshotConsumed = true;
-
-  reset(): void {
-    this.nextStepIndex = 0;
-    this.latestScreenshot = undefined;
-    this.latestScreenshotConsumed = true;
-  }
-
-  recordScreenshot(
-    screenshot: Buffer,
-    url: string,
-  ): AgentScreenshotEvidenceEvent {
-    const event: AgentScreenshotEvidenceEvent = {
-      type: "screenshot",
-      stepIndex: this.nextStepIndex++,
-      screenshot,
-      url,
-      evidenceRole: "agent",
-    };
-    this.latestScreenshot = event;
-    this.latestScreenshotConsumed = false;
-    return event;
-  }
-
-  pairAction(): PairedCuaActionStep {
-    if (this.latestScreenshot && !this.latestScreenshotConsumed) {
-      this.latestScreenshotConsumed = true;
-      return { stepIndex: this.latestScreenshot.stepIndex };
-    }
-
-    const stepIndex = this.nextStepIndex++;
-    if (this.latestScreenshot) {
-      return {
-        stepIndex,
-        replayScreenshot: { ...this.latestScreenshot, stepIndex },
-      };
-    }
-
-    return { stepIndex };
-  }
-
-  get latestScreenshotUrl(): string | undefined {
-    return this.latestScreenshot?.url;
-  }
-}
diff --git a/packages/core/lib/v3/agent/utils/postStepProbeEvidence.ts b/packages/core/lib/v3/agent/utils/postStepProbeEvidence.ts
index 10889d6cf8..f68315dbad 100644
--- a/packages/core/lib/v3/agent/utils/postStepProbeEvidence.ts
+++ b/packages/core/lib/v3/agent/utils/postStepProbeEvidence.ts
@@ -11,7 +11,6 @@ interface CaptureProbeEvidenceOptions {
 }
 
 interface EmitPostStepProbeEvidenceOptions extends CaptureProbeEvidenceOptions {
-  stepIndices: number | number[];
   evidenceCallback?: AgentEvidenceCallback;
 }
 
@@ -53,7 +52,6 @@ export async function captureProbeEvidence({
 
 export async function emitPostStepProbeEvidence({
   v3,
-  stepIndices,
   url,
   evidenceCallback,
   logger,
@@ -61,30 +59,23 @@ export async function emitPostStepProbeEvidence({
 }: EmitPostStepProbeEvidenceOptions): Promise<void> {
   if (!evidenceCallback) return;
 
-  const indices = Array.isArray(stepIndices) ? stepIndices : [stepIndices];
-  if (indices.length === 0) return;
-
   const probe = await captureProbeEvidence({
     v3,
     url,
     logger,
     warningMessage,
   });
-  for (const stepIndex of indices) {
-    if (probe.screenshot) {
-      await evidenceCallback({
-        type: "screenshot",
-        stepIndex,
-        screenshot: probe.screenshot,
-        url: probe.url,
-        evidenceRole: "probe",
-      });
-    }
+  if (probe.screenshot) {
     await evidenceCallback({
-      type: "step_observed",
-      stepIndex,
+      type: "screenshot",
+      screenshot: probe.screenshot,
       url: probe.url,
-      ariaTree: probe.ariaTree,
+      evidenceRole: "probe",
     });
   }
+  await evidenceCallback({
+    type: "step_observed",
+    url: probe.url,
+    ariaTree: probe.ariaTree,
+  });
 }
diff --git a/packages/core/lib/v3/handlers/v3AgentHandler.ts b/packages/core/lib/v3/handlers/v3AgentHandler.ts
index 68858f41be..c3b6a5892e 100644
--- a/packages/core/lib/v3/handlers/v3AgentHandler.ts
+++ b/packages/core/lib/v3/handlers/v3AgentHandler.ts
@@ -266,10 +266,6 @@ export class V3AgentHandler {
     state: AgentState,
     { userCallback, evidenceCallback, onFinalAnswer }: StepHandlerOptions,
   ) {
-    // Monotonic step counter scoped to this execute() call. Each tool call in
-    // the agent loop becomes one trajectory step. The counter feeds stepIndex
-    // on evidence callback events.
-    let stepCounter = 0;
     return async (event: StepResult<ToolSet>) => {
       this.logger({
         category: "agent",
@@ -277,7 +273,6 @@ export class V3AgentHandler {
         level: 2,
       });
 
-      const stepIndicesInTurn: number[] = [];
       let lastFinalAnswer: FinalAnswerDraft | undefined;
 
       if (event.toolCalls && event.toolCalls.length > 0) {
@@ -325,11 +320,8 @@ export class V3AgentHandler {
             state.actions.push(action);
           }
 
-          const stepIndex = stepCounter++;
-          stepIndicesInTurn.push(stepIndex);
           await evidenceCallback?.({
             type: "step_finished",
-            stepIndex,
             actionName: toolCall.toolName,
             actionArgs:
               typeof args === "object" && args !== null
@@ -341,14 +333,12 @@ export class V3AgentHandler {
         }
         state.currentPageUrl = (await this.v3.context.awaitActivePage()).url();
 
-        // Harness probe — take a single screenshot / a11y snapshot per AI SDK
-        // step and attach it to every tool call in that turn. The observation
-        // reflects the settled page state after the batch of tool calls; this
-        // is more faithful than dropping probe evidence for all but the last
-        // tool call, while still avoiding per-tool screenshot overhead.
+        // Harness probe — one screenshot / a11y snapshot per AI SDK step.
+        // The recorder applies the probe to every step_finished received
+        // since the previous probe, so a multi-tool turn shares the same
+        // post-turn observation.
         await emitPostStepProbeEvidence({
           v3: this.v3,
-          stepIndices: stepIndicesInTurn,
           url: state.currentPageUrl,
           evidenceCallback,
           logger: this.logger,
diff --git a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts
index 5c495b5c2f..901dce71da 100644
--- a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts
+++ b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts
@@ -12,7 +12,6 @@ import {
   emitPostStepProbeEvidence,
 } from "../agent/utils/postStepProbeEvidence.js";
 import { wrapEvidenceCallback } from "../agent/utils/wrapEvidenceCallback.js";
-import { CuaEvidenceStepTracker } from "../agent/utils/cuaEvidenceStepTracker.js";
 import { inferToolOutput } from "../agent/utils/toolOutputEvidence.js";
 import {
   ActionExecutionResult,
@@ -45,7 +44,7 @@ export class V3CuaAgentHandler {
   private captchaSolver: CaptchaSolver | null = null;
   private captchaClickGuardRemaining = 0;
   private currentInstruction = "";
-  private readonly cuaEvidenceSteps = new CuaEvidenceStepTracker();
+  private lastAgentScreenshotUrl?: string;
   private evidenceCallback?: AgentEvidenceCallback;
 
   constructor(
@@ -205,7 +204,7 @@ export class V3CuaAgentHandler {
       options.callbacks?.onEvidence,
       this.logger,
     );
-    this.cuaEvidenceSteps.reset();
+    this.lastAgentScreenshotUrl = undefined;
 
     this.highlightCursor = options.highlightCursor !== false;
     this.currentInstruction = options.instruction;
@@ -266,7 +265,7 @@ export class V3CuaAgentHandler {
         try {
           finalUrl = (await this.v3.context.awaitActivePage()).url();
         } catch {
-          finalUrl = this.cuaEvidenceSteps.latestScreenshotUrl ?? "";
+          finalUrl = this.lastAgentScreenshotUrl ?? "";
         }
         const observation = await captureProbeEvidence({
           v3: this.v3,
@@ -825,9 +824,13 @@ export class V3CuaAgentHandler {
     screenshot: Buffer,
     url: string,
   ): Promise<void> {
-    await this.evidenceCallback?.(
-      this.cuaEvidenceSteps.recordScreenshot(screenshot, url),
-    );
+    this.lastAgentScreenshotUrl = url;
+    await this.evidenceCallback?.({
+      type: "screenshot",
+      screenshot,
+      url,
+      evidenceRole: "agent",
+    });
   }
 
   private async emitCuaActionStep(
@@ -837,16 +840,12 @@ export class V3CuaAgentHandler {
     let pageUrl =
       typeof action.pageUrl === "string"
         ? action.pageUrl
-        : (this.cuaEvidenceSteps.latestScreenshotUrl ?? "");
+        : (this.lastAgentScreenshotUrl ?? "");
     try {
       pageUrl = (await this.v3.context.awaitActivePage()).url();
     } catch {
       // Keep the best pre-action URL fallback.
     }
-    const { stepIndex, replayScreenshot } = this.cuaEvidenceSteps.pairAction();
-    if (replayScreenshot) {
-      await this.evidenceCallback?.(replayScreenshot);
-    }
 
     const actionArgs = Object.fromEntries(
       Object.entries(action).filter(([key]) => key !== "screenshot"),
@@ -860,7 +859,6 @@ export class V3CuaAgentHandler {
 
     await this.evidenceCallback?.({
       type: "step_finished",
-      stepIndex,
       actionName: String(action.type),
       actionArgs,
       reasoning,
@@ -874,7 +872,6 @@ export class V3CuaAgentHandler {
     // has to trust the action history alone.
     await emitPostStepProbeEvidence({
       v3: this.v3,
-      stepIndices: stepIndex,
       url: pageUrl,
       evidenceCallback: this.evidenceCallback,
       logger: this.logger,
diff --git a/packages/core/lib/v3/types/public/agentEvidenceEvents.ts b/packages/core/lib/v3/types/public/agentEvidenceEvents.ts
index dcd0e89e70..25e2bd51f4 100644
--- a/packages/core/lib/v3/types/public/agentEvidenceEvents.ts
+++ b/packages/core/lib/v3/types/public/agentEvidenceEvents.ts
@@ -2,8 +2,11 @@
  * Evidence events emitted through AgentExecuteOptions.callbacks.onEvidence.
  *
  * These events describe observations made by Stagehand during an agent run.
- * They are intentionally transport-level callback payloads; verifier-specific
- * storage and normalization live in the evals/verifier layers.
+ * They are emitted in temporal order; consumers should treat the stream as
+ * sequential (pair an agent-role screenshot with the next step_finished;
+ * apply a step_observed/probe to every step_finished since the last probe).
+ * Verifier-specific storage and normalization live in the evals/verifier
+ * layers.
  */
 
 export type AgentEvidenceRole = "probe" | "agent";
@@ -23,8 +26,6 @@ export type AgentEvidenceEvent =
  */
 export interface AgentScreenshotEvidenceEvent {
   type: "screenshot";
-  /** Zero-based index of the step this screenshot corresponds to. */
-  stepIndex: number;
   /** PNG bytes from page.screenshot(). */
   screenshot: Buffer;
   /** Page URL at the time of capture. */
@@ -38,7 +39,6 @@ export interface AgentScreenshotEvidenceEvent {
  */
 export interface AgentStepFinishedEvent {
   type: "step_finished";
-  stepIndex: number;
   /** Name of the tool/action that ran, e.g. "act", "extract", "click". */
   actionName: string;
   /** Arguments passed to the tool/action. */
@@ -55,11 +55,11 @@ export interface AgentStepFinishedEvent {
 }
 
 /**
- * Independent post-step browser observation.
+ * Independent post-step browser observation. Emitted once per agent turn;
+ * consumers apply it to every step_finished received since the previous probe.
  */
 export interface AgentStepObservedEvent {
   type: "step_observed";
-  stepIndex: number;
   /** Page URL after the step's tool/action execution. */
   url: string;
   /** Accessibility tree snapshot, when captured. */
diff --git a/packages/core/tests/unit/cua-evidence-step-tracker.test.ts b/packages/core/tests/unit/cua-evidence-step-tracker.test.ts
deleted file mode 100644
index 112c820a97..0000000000
--- a/packages/core/tests/unit/cua-evidence-step-tracker.test.ts
+++ /dev/null
@@ -1,55 +0,0 @@
-import { describe, expect, it } from "vitest";
-
-import { CuaEvidenceStepTracker } from "../../lib/v3/agent/utils/cuaEvidenceStepTracker.js";
-
-describe("CuaEvidenceStepTracker", () => {
-  it("pairs a fresh provider screenshot with the next action", () => {
-    const tracker = new CuaEvidenceStepTracker();
-    const screenshot = Buffer.from("screen");
-
-    const event = tracker.recordScreenshot(screenshot, "https://example.com");
-    const paired = tracker.pairAction();
-
-    expect(event).toMatchObject({
-      type: "screenshot",
-      stepIndex: 0,
-      evidenceRole: "agent",
-      url: "https://example.com",
-    });
-    expect(paired).toEqual({ stepIndex: 0 });
-  });
-
-  it("allocates an action step without screenshot evidence", () => {
-    const tracker = new CuaEvidenceStepTracker();
-
-    expect(tracker.pairAction()).toEqual({ stepIndex: 0 });
-  });
-
-  it("replays the latest consumed screenshot for later actions", () => {
-    const tracker = new CuaEvidenceStepTracker();
-    const screenshot = Buffer.from("screen");
-
-    tracker.recordScreenshot(screenshot, "https://example.com/start");
-    tracker.pairAction();
-    const paired = tracker.pairAction();
-
-    expect(paired.stepIndex).toBe(1);
-    expect(paired.replayScreenshot).toMatchObject({
-      type: "screenshot",
-      stepIndex: 1,
-      evidenceRole: "agent",
-      url: "https://example.com/start",
-    });
-    expect(paired.replayScreenshot?.screenshot).toEqual(screenshot);
-  });
-
-  it("resets step allocation and pending screenshot state", () => {
-    const tracker = new CuaEvidenceStepTracker();
-
-    tracker.recordScreenshot(Buffer.from("screen"), "https://example.com");
-    tracker.reset();
-
-    expect(tracker.pairAction()).toEqual({ stepIndex: 0 });
-    expect(tracker.latestScreenshotUrl).toBeUndefined();
-  });
-});
diff --git a/packages/core/tests/unit/tool-output-evidence.test.ts b/packages/core/tests/unit/tool-output-evidence.test.ts
index 87b01d7529..fd7c2aabde 100644
--- a/packages/core/tests/unit/tool-output-evidence.test.ts
+++ b/packages/core/tests/unit/tool-output-evidence.test.ts
@@ -3,56 +3,45 @@ import { describe, expect, it } from "vitest";
 import { inferToolOutput } from "../../lib/v3/agent/utils/toolOutputEvidence.js";
 
 describe("inferToolOutput", () => {
-  it("preserves raw results while normalizing top-level failure status", () => {
-    const result = { success: false };
-
-    expect(inferToolOutput(result)).toEqual({
-      ok: false,
-      result,
-      error: undefined,
-    });
-  });
-
-  it("normalizes one-level AI SDK output wrappers", () => {
-    const result = {
-      toolCallId: "call-1",
-      output: { success: false, error: { message: "not found" } },
-    };
-
-    expect(inferToolOutput(result)).toEqual({
-      ok: false,
-      result,
-      error: '{"message":"not found"}',
-    });
-  });
-
-  it("handles isError and non-string errors", () => {
-    const result = { isError: true, error: new Error("bad input") };
-
-    expect(inferToolOutput(result)).toEqual({
-      ok: false,
-      result,
-      error: "bad input",
-    });
-  });
-
-  it("normalizes non-json error values", () => {
-    const result = { error: Symbol("bad input") };
-
-    expect(inferToolOutput(result)).toEqual({
-      ok: false,
-      result,
-      error: "Symbol(bad input)",
-    });
-  });
-
-  it("does not recursively treat page data as tool status", () => {
-    const result = { data: { success: false, error: "page field" } };
-
+  it.each<[string, unknown, boolean, string | undefined]>([
+    [
+      "preserves raw results while normalizing top-level failure status",
+      { success: false },
+      false,
+      undefined,
+    ],
+    [
+      "normalizes one-level AI SDK output wrappers",
+      {
+        toolCallId: "call-1",
+        output: { success: false, error: { message: "not found" } },
+      },
+      false,
+      '{"message":"not found"}',
+    ],
+    [
+      "handles isError and non-string errors",
+      { isError: true, error: new Error("bad input") },
+      false,
+      "bad input",
+    ],
+    [
+      "normalizes non-json error values",
+      { error: Symbol("bad input") },
+      false,
+      "Symbol(bad input)",
+    ],
+    [
+      "does not recursively treat page data as tool status",
+      { data: { success: false, error: "page field" } },
+      true,
+      undefined,
+    ],
+  ])("%s", (_, result, ok, error) => {
     expect(inferToolOutput(result)).toEqual({
-      ok: true,
+      ok,
       result,
-      error: undefined,
+      error,
     });
   });
 });
diff --git a/packages/core/tests/unit/verifier-evidence-normalization.test.ts b/packages/core/tests/unit/verifier-evidence-normalization.test.ts
index 5b6ee249b2..174b0a87f5 100644
--- a/packages/core/tests/unit/verifier-evidence-normalization.test.ts
+++ b/packages/core/tests/unit/verifier-evidence-normalization.test.ts
@@ -1,12 +1,14 @@
 import { describe, expect, it } from "vitest";
 
-import { buildAgentEvidenceFromStepFinished } from "../../lib/v3/verifier/evidenceNormalization.js";
+import {
+  buildAgentEvidenceFromStepFinished,
+  REDACTED_INLINE_IMAGE,
+} from "../../lib/v3/verifier/evidenceNormalization.js";
 
 describe("buildAgentEvidenceFromStepFinished", () => {
   it("captures primitive tool results as text evidence", () => {
     const evidence = buildAgentEvidenceFromStepFinished({
       type: "step_finished",
-      stepIndex: 0,
       actionName: "check",
       actionArgs: {},
       reasoning: "",
@@ -15,4 +17,47 @@ describe("buildAgentEvidenceFromStepFinished", () => {
 
     expect(evidence.modalities).toEqual([{ type: "text", content: "false" }]);
   });
+
+  it("lifts inline screenshot payloads into image evidence and redacts JSON", () => {
+    const inlineScreenshot =
+      Buffer.from("inline screenshot").toString("base64");
+
+    const evidence = buildAgentEvidenceFromStepFinished({
+      type: "step_finished",
+      actionName: "click",
+      actionArgs: { describe: "Open fare details" },
+      reasoning: "",
+      toolOutput: {
+        ok: true,
+        result: {
+          output: {
+            success: true,
+            describe: "Open fare details",
+            screenshotBase64: inlineScreenshot,
+          },
+        },
+      },
+    });
+
+    const [imageModality, jsonModality] = evidence.modalities;
+
+    expect(JSON.stringify(evidence)).not.toContain(inlineScreenshot);
+    expect(jsonModality).toMatchObject({
+      type: "json",
+      content: {
+        output: {
+          screenshotBase64: REDACTED_INLINE_IMAGE,
+        },
+      },
+    });
+    expect(imageModality).toMatchObject({
+      type: "image",
+      mediaType: "image/png",
+    });
+    if (imageModality?.type === "image") {
+      expect(imageModality.bytes).toEqual(
+        Buffer.from(inlineScreenshot, "base64"),
+      );
+    }
+  });
 });
diff --git a/packages/evals/framework/trajectoryRecorder.ts b/packages/evals/framework/trajectoryRecorder.ts
index 84b57b43a4..5785ca4388 100644
--- a/packages/evals/framework/trajectoryRecorder.ts
+++ b/packages/evals/framework/trajectoryRecorder.ts
@@ -23,14 +23,9 @@ import type {
   EvaluationResult,
 } from "@browserbasehq/stagehand";
 
-interface PartialStep {
-  index: number;
-  actionName: string;
-  actionArgs: Record<string, unknown>;
-  reasoning: string;
-  agentEvidence: AgentEvidence;
-  probeEvidence: ProbeEvidence;
-  toolOutput: { ok: boolean; result: unknown; error?: string };
+interface PendingScreenshot {
+  screenshot: Buffer;
+  url: string;
 }
 
 export interface TrajectoryRecorderOptions {
@@ -67,65 +62,79 @@ export class TrajectoryRecorder {
   private readonly outputDir: string;
   private readonly persistEnabled: boolean;
 
-  // Events can arrive out-of-order across step indices; same-step events all
-  // fire in one microtask.
-  private readonly partialSteps = new Map<number, Partial<PartialStep>>();
+  // Steps are appended in arrival order on each step_finished event.
+  private readonly steps: TrajectoryStep[] = [];
+  // The most recent agent-role screenshot is held until the next step_finished
+  // consumes it. A second agent-role screenshot before any step_finished
+  // overwrites the first — that's the desired behavior when a turn is skipped
+  // (e.g., captcha guard short-circuits before emitting step_finished).
+  private pendingAgentScreenshot?: PendingScreenshot;
+  // The most recent probe-role screenshot waits for the matching step_observed.
+  private pendingProbeScreenshot?: PendingScreenshot;
+  // Steps that haven't yet had a probe attached. The next step_observed fans
+  // out to all of them (one probe per agent turn, N tool calls per turn).
+  private stepsAwaitingProbe: number[] = [];
   private finalAnswerEvent?: AgentFinalAnswerEvent;
   private finalObservation?: ProbeEvidence;
 
   private onScreenshot(e: AgentScreenshotEvidenceEvent): void {
-    const partial = this.ensurePartial(e.stepIndex);
-
-    // Default to probe when the emit site doesn't tag a role: matches
-    // v3AgentHandler's post-step screenshot. For CUA the pre-action shot is
-    // NOT a probe — emitCuaActionStep fills that role post-action.
     const role = e.evidenceRole ?? "probe";
-
-    if (role === "probe") {
-      const probe: ProbeEvidence = { ...(partial.probeEvidence ?? {}) };
-      probe.screenshot = e.screenshot;
-      probe.url = e.url;
-      partial.probeEvidence = probe;
-    } else if (!partial.probeEvidence?.url) {
-      // Capture URL even for tier-1-only events; a later post-action URL
-      // can still overwrite it.
-      partial.probeEvidence = {
-        ...(partial.probeEvidence ?? {}),
-        url: e.url,
-      };
-    }
-
     if (role === "agent") {
-      partial.agentEvidence = mergeAgentEvidence(partial.agentEvidence, {
-        modalities: [
-          { type: "image", bytes: e.screenshot, mediaType: "image/png" },
-        ],
-      });
+      this.pendingAgentScreenshot = { screenshot: e.screenshot, url: e.url };
+    } else {
+      this.pendingProbeScreenshot = { screenshot: e.screenshot, url: e.url };
     }
   }
 
   private onStepFinished(e: AgentStepFinishedEvent): void {
-    const partial = this.ensurePartial(e.stepIndex);
-    partial.actionName = e.actionName;
-    partial.actionArgs = e.actionArgs;
-    partial.reasoning = e.reasoning;
-    partial.toolOutput = {
-      ...e.toolOutput,
-      result: redactInlineImagePayloads(e.toolOutput.result, e.actionName),
-    };
-    partial.agentEvidence = mergeAgentEvidence(
-      partial.agentEvidence,
+    const agentEvidence: AgentEvidence = this.pendingAgentScreenshot
+      ? mergeAgentEvidence(
+          { modalities: [] },
+          {
+            modalities: [
+              {
+                type: "image",
+                bytes: this.pendingAgentScreenshot.screenshot,
+                mediaType: "image/png",
+              },
+            ],
+          },
+        )
+      : { modalities: [] };
+    const merged = mergeAgentEvidence(
+      agentEvidence,
       buildAgentEvidenceFromStepFinished(e),
     );
+
+    const step: TrajectoryStep = {
+      index: this.steps.length,
+      actionName: e.actionName,
+      actionArgs: e.actionArgs,
+      reasoning: e.reasoning,
+      agentEvidence: merged,
+      probeEvidence: {},
+      toolOutput: {
+        ...e.toolOutput,
+        result: redactInlineImagePayloads(e.toolOutput.result, e.actionName),
+      },
+    };
+    this.pendingAgentScreenshot = undefined;
+    this.steps.push(step);
+    this.stepsAwaitingProbe.push(step.index);
   }
 
   private onStepObserved(e: AgentStepObservedEvent): void {
-    const partial = this.ensurePartial(e.stepIndex);
-    const probe: ProbeEvidence = { ...(partial.probeEvidence ?? {}) };
-    probe.url = e.url;
+    if (this.stepsAwaitingProbe.length === 0) return;
+    const probe: ProbeEvidence = { url: e.url };
+    if (this.pendingProbeScreenshot)
+      probe.screenshot = this.pendingProbeScreenshot.screenshot;
     if (e.ariaTree !== undefined) probe.ariaTree = e.ariaTree;
     if (e.scroll !== undefined) probe.scroll = e.scroll;
-    partial.probeEvidence = probe;
+    for (const idx of this.stepsAwaitingProbe) {
+      this.steps[idx].probeEvidence = probe;
+    }
+    this.stepsAwaitingProbe = [];
+    this.pendingProbeScreenshot = undefined;
   }
 
   private onFinalAnswer(e: AgentFinalAnswerEvent): void {
@@ -182,10 +191,9 @@ export class TrajectoryRecorder {
    * write the on-disk layout. Idempotent.
    */
   async finish(opts: TrajectoryFinishOptions): Promise<Trajectory> {
-    const steps = this.assembleSteps();
     const trajectory: Trajectory = {
       task: this.taskSpec,
-      steps,
+      steps: this.steps,
       finalAnswer: opts.finalAnswer ?? this.finalAnswerEvent?.message,
       ...(this.finalObservation
         ? { finalObservation: this.finalObservation }
@@ -203,7 +211,10 @@ export class TrajectoryRecorder {
 
   /** Throw away in-memory state without writing to disk. Used on early abort. */
   cancel(): void {
-    this.partialSteps.clear();
+    this.steps.length = 0;
+    this.pendingAgentScreenshot = undefined;
+    this.pendingProbeScreenshot = undefined;
+    this.stepsAwaitingProbe = [];
     this.finalAnswerEvent = undefined;
     this.finalObservation = undefined;
   }
@@ -250,37 +261,4 @@ export class TrajectoryRecorder {
       JSON.stringify({ ...taskData, result }, null, 2),
     );
   }
-
-  private ensurePartial(stepIndex: number): Partial<PartialStep> {
-    let p = this.partialSteps.get(stepIndex);
-    if (!p) {
-      p = { index: stepIndex };
-      this.partialSteps.set(stepIndex, p);
-    }
-    return p;
-  }
-
-  private assembleSteps(): TrajectoryStep[] {
-    const out: TrajectoryStep[] = [];
-    const indices = [...this.partialSteps.keys()].sort((a, b) => a - b);
-    for (const i of indices) {
-      const p = this.partialSteps.get(i)!;
-      if (p.actionName === undefined || p.toolOutput === undefined) {
-        // Provider-only screenshot refreshes are transport evidence for the
-        // next CUA action. If no action arrives for this index, there is no
-        // completed trajectory step to persist.
-        continue;
-      }
-      out.push({
-        index: i,
-        actionName: p.actionName,
-        actionArgs: p.actionArgs ?? {},
-        reasoning: p.reasoning ?? "",
-        agentEvidence: p.agentEvidence ?? { modalities: [] },
-        probeEvidence: p.probeEvidence ?? {},
-        toolOutput: p.toolOutput,
-      });
-    }
-    return out;
-  }
 }
diff --git a/packages/evals/tests/framework/trajectoryRecorder.test.ts b/packages/evals/tests/framework/trajectoryRecorder.test.ts
index 57f4c93c55..0623d88b89 100644
--- a/packages/evals/tests/framework/trajectoryRecorder.test.ts
+++ b/packages/evals/tests/framework/trajectoryRecorder.test.ts
@@ -30,44 +30,82 @@ function makeTaskSpec(): TaskSpec {
     id: "recorder-task",
     instruction: "Compare economy and business fares.",
     initUrl: "https://example.com",
-    precomputedRubric: {
-      items: [
-        {
-          criterion: "Report fare delta",
-          description: "Report the difference between two fares.",
-          maxPoints: 1,
-        },
-      ],
-    },
   };
 }
 
+function recordSimpleStep(recorder: TrajectoryRecorder, screenshot: Buffer) {
+  recorder.record({
+    type: "screenshot",
+    screenshot,
+    url: "https://example.com/search",
+    evidenceRole: "agent",
+  });
+  recorder.record({
+    type: "step_finished",
+    actionName: "act",
+    actionArgs: { instruction: "Search fares" },
+    reasoning: "Search for fares.",
+    toolOutput: { ok: true, result: "done" },
+  });
+  recorder.record({
+    type: "screenshot",
+    screenshot,
+    url: "https://example.com/search",
+    evidenceRole: "probe",
+  });
+  recorder.record({
+    type: "step_observed",
+    url: "https://example.com/search",
+  });
+}
+
+function recordFinalAnswer(
+  recorder: TrajectoryRecorder,
+  opts: { message: string; screenshot: Buffer; ariaTree?: string },
+): void {
+  recorder.record({
+    type: "final_answer",
+    message: opts.message,
+    observation: {
+      url: "https://example.com/complete",
+      screenshot: opts.screenshot,
+      ...(opts.ariaTree !== undefined ? { ariaTree: opts.ariaTree } : {}),
+    },
+  });
+}
+
 describe("TrajectoryRecorder", () => {
-  it("assembles trajectory evidence from callback events", async () => {
+  it("assembles ordered callback events into trajectory steps", async () => {
     const recorder = new TrajectoryRecorder({
       taskSpec: makeTaskSpec(),
       persist: false,
     });
     const screenshot = Buffer.from("screen-1");
+    const staleScreenshot = Buffer.from("stale-screen");
+    const probeScreenshot = Buffer.from("probe-screen");
 
     recorder.start();
     recorder.record({
       type: "screenshot",
-      stepIndex: 0,
-      screenshot,
-      url: "https://example.com/search",
+      screenshot: staleScreenshot,
+      url: "https://example.com/stale",
       evidenceRole: "agent",
     });
     recorder.record({
       type: "screenshot",
-      stepIndex: 0,
       screenshot,
       url: "https://example.com/search",
-      evidenceRole: "probe",
+      evidenceRole: "agent",
+    });
+    recorder.record({
+      type: "step_finished",
+      actionName: "click",
+      actionArgs: { describe: "Open fares" },
+      reasoning: "Open fare details.",
+      toolOutput: { ok: true, result: "opened" },
     });
     recorder.record({
       type: "step_finished",
-      stepIndex: 0,
       actionName: "extract",
       actionArgs: { instruction: "Read fares" },
       reasoning: "Read visible fare cells.",
@@ -76,20 +114,21 @@ describe("TrajectoryRecorder", () => {
         result: { economy: "$100", business: "$250" },
       },
     });
+    recorder.record({
+      type: "screenshot",
+      screenshot: probeScreenshot,
+      url: "https://example.com/search",
+      evidenceRole: "probe",
+    });
     recorder.record({
       type: "step_observed",
-      stepIndex: 0,
       url: "https://example.com/search",
       ariaTree: "RootWebArea\nStaticText: Economy $100",
     });
-    recorder.record({
-      type: "final_answer",
+    recordFinalAnswer(recorder, {
       message: "Business is $150 more than economy.",
-      observation: {
-        url: "https://example.com/checkout",
-        screenshot: Buffer.from("final-screen"),
-        ariaTree: "RootWebArea\nStaticText: Complete",
-      },
+      screenshot: Buffer.from("final-screen"),
+      ariaTree: "RootWebArea\nStaticText: Complete",
     });
 
     const trajectory = await recorder.finish({
@@ -97,32 +136,35 @@ describe("TrajectoryRecorder", () => {
       usage: { input_tokens: 10, output_tokens: 5 },
     });
 
-    expect(trajectory.steps).toHaveLength(1);
+    expect(trajectory.steps).toHaveLength(2);
     expect(trajectory.steps[0]).toMatchObject({
       index: 0,
-      actionName: "extract",
-      actionArgs: { instruction: "Read fares" },
-      reasoning: "Read visible fare cells.",
-      toolOutput: {
-        ok: true,
-        result: { economy: "$100", business: "$250" },
-      },
+      actionName: "click",
       probeEvidence: {
         url: "https://example.com/search",
         ariaTree: "RootWebArea\nStaticText: Economy $100",
       },
     });
-    expect(trajectory.steps[0].probeEvidence.screenshot).toEqual(screenshot);
+    expect(trajectory.steps[1]).toMatchObject({
+      index: 1,
+      actionName: "extract",
+      toolOutput: { ok: true, result: { economy: "$100", business: "$250" } },
+    });
+    expect(trajectory.steps[0].probeEvidence.screenshot).toEqual(
+      probeScreenshot,
+    );
+    expect(trajectory.steps[1].probeEvidence.screenshot).toEqual(
+      probeScreenshot,
+    );
     expect(trajectory.steps[0].agentEvidence.modalities).toEqual(
       expect.arrayContaining([
         { type: "image", bytes: screenshot, mediaType: "image/png" },
-        { type: "text", content: "Read visible fare cells." },
-        { type: "json", content: { economy: "$100", business: "$250" } },
+        { type: "text", content: "Open fare details." },
       ]),
     );
     expect(trajectory.finalAnswer).toBe("Business is $150 more than economy.");
     expect(trajectory.finalObservation).toMatchObject({
-      url: "https://example.com/checkout",
+      url: "https://example.com/complete",
       ariaTree: "RootWebArea\nStaticText: Complete",
     });
     expect(trajectory.finalObservation?.screenshot).toEqual(
@@ -141,40 +183,10 @@ describe("TrajectoryRecorder", () => {
     const screenshot = Buffer.from("screen-1");
 
     recorder.start();
-    recorder.record({
-      type: "screenshot",
-      stepIndex: 0,
-      screenshot,
-      url: "https://example.com/search",
-      evidenceRole: "agent",
-    });
-    recorder.record({
-      type: "screenshot",
-      stepIndex: 0,
-      screenshot,
-      url: "https://example.com/search",
-      evidenceRole: "probe",
-    });
-    recorder.record({
-      type: "step_finished",
-      stepIndex: 0,
-      actionName: "act",
-      actionArgs: { instruction: "Search fares" },
-      reasoning: "Search for fares.",
-      toolOutput: { ok: true, result: "done" },
-    });
-    recorder.record({
-      type: "step_observed",
-      stepIndex: 0,
-      url: "https://example.com/search",
-    });
-    recorder.record({
-      type: "final_answer",
+    recordSimpleStep(recorder, screenshot);
+    recordFinalAnswer(recorder, {
       message: "Complete.",
-      observation: {
-        url: "https://example.com/complete",
-        screenshot: Buffer.from("final-screen"),
-      },
+      screenshot: Buffer.from("final-screen"),
     });
 
     await recorder.finish({ status: "complete" });
@@ -229,68 +241,4 @@ describe("TrajectoryRecorder", () => {
       explanation: "The task was completed.",
     });
   });
-
-  it("lifts inline screenshot payloads into image evidence and redacts JSON", async () => {
-    const inlineScreenshot =
-      Buffer.from("inline screenshot").toString("base64");
-    const recorder = new TrajectoryRecorder({
-      taskSpec: makeTaskSpec(),
-      persist: false,
-    });
-
-    recorder.record({
-      type: "step_finished",
-      stepIndex: 0,
-      actionName: "click",
-      actionArgs: { describe: "Open fare details" },
-      reasoning: "Click the fare details button.",
-      toolOutput: {
-        ok: true,
-        result: {
-          output: {
-            success: true,
-            describe: "Open fare details",
-            screenshotBase64: inlineScreenshot,
-          },
-        },
-      },
-    });
-
-    const trajectory = await recorder.finish({ status: "complete" });
-    const step = trajectory.steps[0];
-    const rawTrajectory = JSON.stringify(trajectory);
-    const imageModalities = step.agentEvidence.modalities.filter(
-      (m) => m.type === "image",
-    );
-    const jsonModality = step.agentEvidence.modalities.find(
-      (m) => m.type === "json",
-    );
-
-    expect(rawTrajectory).not.toContain(inlineScreenshot);
-    expect(step.toolOutput.result).toMatchObject({
-      output: {
-        success: true,
-        describe: "Open fare details",
-        screenshotBase64: "[redacted inline image payload]",
-      },
-    });
-    expect(jsonModality).toMatchObject({
-      type: "json",
-      content: {
-        output: {
-          screenshotBase64: "[redacted inline image payload]",
-        },
-      },
-    });
-    expect(imageModalities).toHaveLength(1);
-    expect(imageModalities[0]).toMatchObject({
-      type: "image",
-      mediaType: "image/png",
-    });
-    if (imageModalities[0].type === "image") {
-      expect(imageModalities[0].bytes).toEqual(
-        Buffer.from(inlineScreenshot, "base64"),
-      );
-    }
-  });
 });

From d6fb72b8880feae4fbfd6c7450bd8c20c4299d64 Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Sun, 24 May 2026 10:23:59 -0700
Subject: [PATCH 22/27] refactor(verifier): tighten evidence event types and
 recorder
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Follow-up cleanup on the sequential-recorder refactor:

- Drop step.index from TrajectoryStep; array position is the canonical
  index. Trajectory writer and v3Evaluator use entries()/map index.
- Drop unused scroll field from AgentStepObservedEvent, AgentFinalObservation,
  and ProbeEvidence — no producer ever set it.
- Require evidenceRole on AgentScreenshotEvidenceEvent; the role routes the
  event into different recorder slots, so a missing role can't silently
  misroute.
- Flatten the identity mergeAgentEvidence in onStepFinished.
- Drop unused url field from the recorder's pending screenshot slots.
- Remove the no-op TrajectoryRecorder.start() method and test call sites.
- Remove the dead early-return guard in onStepObserved.
---
 .../v3/types/public/agentEvidenceEvents.ts    |  6 +-
 packages/core/lib/v3/verifier/trajectory.ts   | 10 +--
 packages/core/lib/v3/verifier/types.ts        |  3 -
 packages/core/lib/v3Evaluator.ts              |  4 +-
 packages/core/tests/unit/v3-evaluator.test.ts |  1 -
 .../tests/unit/verifier-trajectory.test.ts    |  4 --
 .../evals/framework/trajectoryRecorder.ts     | 62 ++++++-------------
 .../framework/trajectoryRecorder.test.ts      |  4 --
 8 files changed, 27 insertions(+), 67 deletions(-)

diff --git a/packages/core/lib/v3/types/public/agentEvidenceEvents.ts b/packages/core/lib/v3/types/public/agentEvidenceEvents.ts
index 25e2bd51f4..d74b56a67c 100644
--- a/packages/core/lib/v3/types/public/agentEvidenceEvents.ts
+++ b/packages/core/lib/v3/types/public/agentEvidenceEvents.ts
@@ -31,7 +31,7 @@ export interface AgentScreenshotEvidenceEvent {
   /** Page URL at the time of capture. */
   url: string;
   /** Role this screenshot plays in downstream evidence collection. */
-  evidenceRole?: AgentEvidenceRole;
+  evidenceRole: AgentEvidenceRole;
 }
 
 /**
@@ -64,8 +64,6 @@ export interface AgentStepObservedEvent {
   url: string;
   /** Accessibility tree snapshot, when captured. */
   ariaTree?: string;
-  /** Viewport scroll context, when captured. */
-  scroll?: { top: number; pageHeight: number };
 }
 
 export interface AgentFinalObservation {
@@ -75,8 +73,6 @@ export interface AgentFinalObservation {
   screenshot?: Buffer;
   /** Accessibility tree snapshot, when captured. */
   ariaTree?: string;
-  /** Viewport scroll context, when captured. */
-  scroll?: { top: number; pageHeight: number };
 }
 
 /** Final answer emitted by the agent, when available. */
diff --git a/packages/core/lib/v3/verifier/trajectory.ts b/packages/core/lib/v3/verifier/trajectory.ts
index b8722a64df..223e4b1b92 100644
--- a/packages/core/lib/v3/verifier/trajectory.ts
+++ b/packages/core/lib/v3/verifier/trajectory.ts
@@ -268,10 +268,10 @@ export async function writeTrajectoryDir(
   await fs.mkdir(path.join(dir, "screenshots", "agent"), { recursive: true });
 
   const serializableSteps: unknown[] = [];
-  for (const step of trajectory.steps) {
+  for (const [i, step] of trajectory.steps.entries()) {
     const probe: ProbeEvidence = { ...step.probeEvidence };
     if (probe.screenshot) {
-      const relPath = `screenshots/probe/${step.index + 1}.png`;
+      const relPath = `screenshots/probe/${i + 1}.png`;
       await fs.writeFile(path.join(dir, relPath), probe.screenshot);
       probe.screenshotPath = relPath;
       delete probe.screenshot;
@@ -296,7 +296,7 @@ export async function writeTrajectoryDir(
         continue;
       }
       const suffix = multipleImages ? `_${imageSeq}` : "";
-      const relPath = `screenshots/agent/${step.index + 1}${suffix}.png`;
+      const relPath = `screenshots/agent/${i + 1}${suffix}.png`;
       await fs.writeFile(path.join(dir, relPath), m.bytes);
       modalities.push({
         type: "image",
@@ -363,9 +363,9 @@ export async function writeTrajectoryDir(
 function coreLog(trajectory: Trajectory): string {
   return (
     trajectory.steps
-      .map((step) =>
+      .map((step, i) =>
         JSON.stringify({
-          step: step.index,
+          step: i,
           action: step.actionName,
           url: step.probeEvidence.url ?? null,
           ok: step.toolOutput.ok,
diff --git a/packages/core/lib/v3/verifier/types.ts b/packages/core/lib/v3/verifier/types.ts
index 5431397092..4aa76ea6f1 100644
--- a/packages/core/lib/v3/verifier/types.ts
+++ b/packages/core/lib/v3/verifier/types.ts
@@ -89,8 +89,6 @@ export interface ProbeEvidence {
   screenshot?: Buffer;
   /** Reference to the persisted screenshot file under the trajectory dir. */
   screenshotPath?: string;
-  /** Viewport scroll context. Lets the verifier reason about whether the agent saw the full page. */
-  scroll?: { top: number; pageHeight: number };
   /** Accessibility tree snapshot. */
   ariaTree?: string;
   /** Verifier-requested probes, keyed by criterion id. */
@@ -111,7 +109,6 @@ export interface ToolOutput {
 
 /** One step in a trajectory: action + reasoning + evidence + outcome. */
 export interface TrajectoryStep {
-  index: number;
   actionName: string;
   actionArgs: Record<string, unknown>;
   /** From AgentAction.reasoning. May be empty for tools that don't surface reasoning. */
diff --git a/packages/core/lib/v3Evaluator.ts b/packages/core/lib/v3Evaluator.ts
index ee1bbc6d35..5e6e5ee92f 100644
--- a/packages/core/lib/v3Evaluator.ts
+++ b/packages/core/lib/v3Evaluator.ts
@@ -237,13 +237,13 @@ function collectLegacyScreenshots(trajectory: Trajectory): Buffer[] {
 function renderLegacyAgentReasoning(
   trajectory: Trajectory,
 ): string | undefined {
-  const stepLines = (trajectory.steps ?? []).map((step) => {
+  const stepLines = (trajectory.steps ?? []).map((step, i) => {
     const status = step.toolOutput?.ok === false ? "Tool status: failed" : "";
     const output = step.toolOutput?.error
       ? `Tool error: ${step.toolOutput.error}`
       : `Tool output: ${stringifyForPrompt(step.toolOutput?.result)}`;
     return [
-      `Step ${step.index}: ${step.actionName}`,
+      `Step ${i}: ${step.actionName}`,
       step.reasoning ? `Reasoning: ${step.reasoning}` : undefined,
       status || undefined,
       output,
diff --git a/packages/core/tests/unit/v3-evaluator.test.ts b/packages/core/tests/unit/v3-evaluator.test.ts
index b18650d8ea..e6f61d54a8 100644
--- a/packages/core/tests/unit/v3-evaluator.test.ts
+++ b/packages/core/tests/unit/v3-evaluator.test.ts
@@ -212,7 +212,6 @@ function makeTrajectory(
     ...makeEmptyTrajectory(taskSpec),
     steps: [
       {
-        index: 0,
         actionName: "act",
         actionArgs: {},
         reasoning: "I completed the task.",
diff --git a/packages/core/tests/unit/verifier-trajectory.test.ts b/packages/core/tests/unit/verifier-trajectory.test.ts
index e57f2bb3c4..7cd38662df 100644
--- a/packages/core/tests/unit/verifier-trajectory.test.ts
+++ b/packages/core/tests/unit/verifier-trajectory.test.ts
@@ -81,7 +81,6 @@ describe("verifier trajectory utilities", () => {
         usage: { input_tokens: 0, output_tokens: 0 },
         steps: [
           {
-            index: 0,
             actionName: "act",
             actionArgs: {},
             reasoning: "",
@@ -127,7 +126,6 @@ describe("verifier trajectory utilities", () => {
         usage: { input_tokens: 0, output_tokens: 0 },
         steps: [
           {
-            index: 0,
             actionName: "act",
             actionArgs: {},
             reasoning: "",
@@ -171,7 +169,6 @@ describe("verifier trajectory utilities", () => {
       },
       steps: [
         {
-          index: 0,
           actionName: "click",
           actionArgs: {},
           reasoning: "",
@@ -231,7 +228,6 @@ describe("verifier trajectory utilities", () => {
         usage: { input_tokens: 0, output_tokens: 0 },
         steps: [
           {
-            index: 0,
             actionName: "act",
             actionArgs: {},
             reasoning: "",
diff --git a/packages/evals/framework/trajectoryRecorder.ts b/packages/evals/framework/trajectoryRecorder.ts
index 5785ca4388..d24b53ae9e 100644
--- a/packages/evals/framework/trajectoryRecorder.ts
+++ b/packages/evals/framework/trajectoryRecorder.ts
@@ -23,11 +23,6 @@ import type {
   EvaluationResult,
 } from "@browserbasehq/stagehand";
 
-interface PendingScreenshot {
-  screenshot: Buffer;
-  url: string;
-}
-
 export interface TrajectoryRecorderOptions {
   taskSpec: TaskSpec;
   /**
@@ -68,9 +63,9 @@ export class TrajectoryRecorder {
   // consumes it. A second agent-role screenshot before any step_finished
   // overwrites the first — that's the desired behavior when a turn is skipped
   // (e.g., captcha guard short-circuits before emitting step_finished).
-  private pendingAgentScreenshot?: PendingScreenshot;
+  private pendingAgentScreenshot?: Buffer;
   // The most recent probe-role screenshot waits for the matching step_observed.
-  private pendingProbeScreenshot?: PendingScreenshot;
+  private pendingProbeScreenshot?: Buffer;
   // Steps that haven't yet had a probe attached. The next step_observed fans
   // out to all of them (one probe per agent turn, N tool calls per turn).
   private stepsAwaitingProbe: number[] = [];
@@ -78,36 +73,30 @@ export class TrajectoryRecorder {
   private finalObservation?: ProbeEvidence;
 
   private onScreenshot(e: AgentScreenshotEvidenceEvent): void {
-    const role = e.evidenceRole ?? "probe";
-    if (role === "agent") {
-      this.pendingAgentScreenshot = { screenshot: e.screenshot, url: e.url };
+    if (e.evidenceRole === "agent") {
+      this.pendingAgentScreenshot = e.screenshot;
     } else {
-      this.pendingProbeScreenshot = { screenshot: e.screenshot, url: e.url };
+      this.pendingProbeScreenshot = e.screenshot;
     }
   }
 
   private onStepFinished(e: AgentStepFinishedEvent): void {
-    const agentEvidence: AgentEvidence = this.pendingAgentScreenshot
-      ? mergeAgentEvidence(
-          { modalities: [] },
-          {
-            modalities: [
-              {
-                type: "image",
-                bytes: this.pendingAgentScreenshot.screenshot,
-                mediaType: "image/png",
-              },
-            ],
-          },
-        )
-      : { modalities: [] };
+    const modalities: AgentEvidence["modalities"] = [];
+    if (this.pendingAgentScreenshot) {
+      modalities.push({
+        type: "image",
+        bytes: this.pendingAgentScreenshot,
+        mediaType: "image/png",
+      });
+    }
     const merged = mergeAgentEvidence(
-      agentEvidence,
+      { modalities },
       buildAgentEvidenceFromStepFinished(e),
     );
 
-    const step: TrajectoryStep = {
-      index: this.steps.length,
+    this.pendingAgentScreenshot = undefined;
+    this.stepsAwaitingProbe.push(this.steps.length);
+    this.steps.push({
       actionName: e.actionName,
       actionArgs: e.actionArgs,
       reasoning: e.reasoning,
@@ -117,19 +106,14 @@ export class TrajectoryRecorder {
         ...e.toolOutput,
         result: redactInlineImagePayloads(e.toolOutput.result, e.actionName),
       },
-    };
-    this.pendingAgentScreenshot = undefined;
-    this.steps.push(step);
-    this.stepsAwaitingProbe.push(step.index);
+    });
   }
 
   private onStepObserved(e: AgentStepObservedEvent): void {
-    if (this.stepsAwaitingProbe.length === 0) return;
     const probe: ProbeEvidence = { url: e.url };
     if (this.pendingProbeScreenshot)
-      probe.screenshot = this.pendingProbeScreenshot.screenshot;
+      probe.screenshot = this.pendingProbeScreenshot;
     if (e.ariaTree !== undefined) probe.ariaTree = e.ariaTree;
-    if (e.scroll !== undefined) probe.scroll = e.scroll;
     for (const idx of this.stepsAwaitingProbe) {
       this.steps[idx].probeEvidence = probe;
     }
@@ -148,9 +132,6 @@ export class TrajectoryRecorder {
         ...(e.observation.ariaTree !== undefined
           ? { ariaTree: e.observation.ariaTree }
           : {}),
-        ...(e.observation.scroll !== undefined
-          ? { scroll: e.observation.scroll }
-          : {}),
       };
     }
   }
@@ -163,11 +144,6 @@ export class TrajectoryRecorder {
     this.persistEnabled = shouldPersistTrajectory(opts.persist);
   }
 
-  /** Mark the beginning of collection. Retained as a no-op for compatibility. */
-  start(): void {
-    return;
-  }
-
   /** Ingest an evidence callback event from agent.execute(). */
   record(event: AgentEvidenceEvent): void {
     switch (event.type) {
diff --git a/packages/evals/tests/framework/trajectoryRecorder.test.ts b/packages/evals/tests/framework/trajectoryRecorder.test.ts
index 0623d88b89..4dcf379e86 100644
--- a/packages/evals/tests/framework/trajectoryRecorder.test.ts
+++ b/packages/evals/tests/framework/trajectoryRecorder.test.ts
@@ -84,7 +84,6 @@ describe("TrajectoryRecorder", () => {
     const staleScreenshot = Buffer.from("stale-screen");
     const probeScreenshot = Buffer.from("probe-screen");
 
-    recorder.start();
     recorder.record({
       type: "screenshot",
       screenshot: staleScreenshot,
@@ -138,7 +137,6 @@ describe("TrajectoryRecorder", () => {
 
     expect(trajectory.steps).toHaveLength(2);
     expect(trajectory.steps[0]).toMatchObject({
-      index: 0,
       actionName: "click",
       probeEvidence: {
         url: "https://example.com/search",
@@ -146,7 +144,6 @@ describe("TrajectoryRecorder", () => {
       },
     });
     expect(trajectory.steps[1]).toMatchObject({
-      index: 1,
       actionName: "extract",
       toolOutput: { ok: true, result: { economy: "$100", business: "$250" } },
     });
@@ -182,7 +179,6 @@ describe("TrajectoryRecorder", () => {
     });
     const screenshot = Buffer.from("screen-1");
 
-    recorder.start();
     recordSimpleStep(recorder, screenshot);
     recordFinalAnswer(recorder, {
       message: "Complete.",

From 754a54b0f55a7ddac7f964c3efb0438fae6bfe58 Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Sun, 24 May 2026 11:36:27 -0700
Subject: [PATCH 23/27] refactor(verifier): drop unused inferCuaToolOutput

The CUA handler calls inferToolOutput directly now that the general helper
handles the {success: boolean, error?: ...} shape via normalizeError.
---
 .../core/lib/v3/agent/utils/toolOutputEvidence.ts     | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/packages/core/lib/v3/agent/utils/toolOutputEvidence.ts b/packages/core/lib/v3/agent/utils/toolOutputEvidence.ts
index 9718181479..50426b0d12 100644
--- a/packages/core/lib/v3/agent/utils/toolOutputEvidence.ts
+++ b/packages/core/lib/v3/agent/utils/toolOutputEvidence.ts
@@ -1,5 +1,4 @@
 import type { AgentStepFinishedEvent } from "../../types/public/agentEvidenceEvents.js";
-import type { ActionExecutionResult } from "../../types/public/agent.js";
 
 const ERROR_STRING_LIMIT = 1000;
 
@@ -75,13 +74,3 @@ export function inferToolOutput(
     error,
   };
 }
-
-export function inferCuaToolOutput(
-  result: ActionExecutionResult | undefined,
-): AgentStepFinishedEvent["toolOutput"] {
-  return {
-    ok: result?.success !== false,
-    result: result ?? { success: true },
-    error: result?.error,
-  };
-}

From a748399434bf0d83b8559a43a3cc11ab0c423382 Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Sun, 24 May 2026 11:45:41 -0700
Subject: [PATCH 24/27] perf(cua): only emit step when evidenceCallback is provided

---
 packages/core/lib/v3/handlers/v3CuaAgentHandler.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts
index 901dce71da..73e86a9171 100644
--- a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts
+++ b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts
@@ -162,7 +162,7 @@ export class V3CuaAgentHandler {
         }
 
         action.timestamp = Date.now();
-        if (shouldLog) {
+        if (shouldLog && this.evidenceCallback) {
           await this.emitCuaActionStep(action, executionResult);
         }
 

From 61160d9b86ad0acf55efa1314c3abc9db0fa0616 Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Sun, 24 May 2026 11:57:41 -0700
Subject: [PATCH 25/27] perf(verifier): dedupe shared probe screenshots in
 writeTrajectoryDir

A single post-turn probe is fanned across every step of a multi-tool
turn, so those steps share the same screenshot Buffer by reference.
writeTrajectoryDir was writing an identical PNG per step
(probe/1.png, probe/2.png, ...). Dedupe by Buffer identity: write the
PNG once and point every sharing step's screenshotPath at the same
file. Steps that do not share a probe Buffer are written exactly as before.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 packages/core/lib/v3/verifier/trajectory.ts | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/packages/core/lib/v3/verifier/trajectory.ts b/packages/core/lib/v3/verifier/trajectory.ts
index 223e4b1b92..1465d363e2 100644
--- a/packages/core/lib/v3/verifier/trajectory.ts
+++ b/packages/core/lib/v3/verifier/trajectory.ts
@@ -268,11 +268,19 @@ export async function writeTrajectoryDir(
   await fs.mkdir(path.join(dir, "screenshots", "agent"), { recursive: true });
 
   const serializableSteps: unknown[] = [];
+  // A single post-turn probe is fanned across every step of a multi-tool turn,
+  // so the same screenshot Buffer is shared by reference. Dedupe by identity:
+  // write the PNG once and point every sharing step's screenshotPath at it.
+  const probePathByBuffer = new Map<Buffer, string>();
   for (const [i, step] of trajectory.steps.entries()) {
     const probe: ProbeEvidence = { ...step.probeEvidence };
     if (probe.screenshot) {
-      const relPath = `screenshots/probe/${i + 1}.png`;
-      await fs.writeFile(path.join(dir, relPath), probe.screenshot);
+      let relPath = probePathByBuffer.get(probe.screenshot);
+      if (!relPath) {
+        relPath = `screenshots/probe/${i + 1}.png`;
+        await fs.writeFile(path.join(dir, relPath), probe.screenshot);
+        probePathByBuffer.set(probe.screenshot, relPath);
+      }
       probe.screenshotPath = relPath;
       delete probe.screenshot;
     }

From 98bd986e699fc978d6dc108b79a6e241601ece8b Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Mon, 25 May 2026 08:16:12 -0700
Subject: [PATCH 26/27] fix(cua,verifier): record failed actions + share agent
 screenshot across batched steps

Two trajectory-fidelity gaps in CUA runs:

1. Failed actions were dropped. emitCuaActionStep only ran after a successful
   executeAction; a throwing action jumped to catch and rethrew, so no
   step_finished was recorded. Now the catch emits a step_finished
   {ok:false, error} (with a best-effort post-failure probe) before rethrowing,
   in a nested try/catch so evidence emission never masks the original error.
   emitCuaActionStep now takes an explicit toolOutput instead of deriving it
   from `result ?? {success:true}`.

2. Batched actions lost the agent screenshot. A CUA provider can choose several
   actions from one screenshot, but the recorder cleared the pending agent
   screenshot after the first step_finished, so later steps got no tier-1 frame.
   Renamed to latestAgentScreenshot; it now applies to every step until a newer
   agent screenshot replaces it (wiped on cancel()). writeTrajectoryDir dedupes
   the now-shared agent Buffer by identity so it isn't written once per step.
   Public onEvidence contract doc updated to describe the replay semantics.

Tests: failed-action emits step_finished{ok:false} and rethrows; batched
two-action turn shares the agent screenshot across both steps.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../core/lib/v3/handlers/v3CuaAgentHandler.ts | 49 ++++++++++++++---
 .../v3/types/public/agentEvidenceEvents.ts    | 10 ++--
 packages/core/lib/v3/verifier/trajectory.ts   | 16 ++++--
 .../tests/unit/agent-captcha-hooks.test.ts    | 55 +++++++++++++++++++
 .../evals/framework/trajectoryRecorder.ts     | 23 ++++----
 .../framework/trajectoryRecorder.test.ts      |  7 +++
 6 files changed, 133 insertions(+), 27 deletions(-)

diff --git a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts
index 73e86a9171..d4b359f8e3 100644
--- a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts
+++ b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts
@@ -22,7 +22,10 @@ import {
   SafetyConfirmationHandler,
 } from "../types/public/agent.js";
 import { LogLine } from "../types/public/logs.js";
-import type { AgentEvidenceCallback } from "../types/public/agentEvidenceEvents.js";
+import type {
+  AgentEvidenceCallback,
+  AgentStepFinishedEvent,
+} from "../types/public/agentEvidenceEvents.js";
 import { type Action, V3FunctionName } from "../types/public/methods.js";
 import { FlowLogger } from "../flowlogger/FlowLogger.js";
 import { toTitleCase } from "../../utils.js";
@@ -131,6 +134,10 @@ export class V3CuaAgentHandler {
       const waitBetween =
         (this.options.clientOptions?.waitBetweenActions as number) ||
         defaultDelay;
+      // Skip logging for screenshot actions - they're no-ops; the CUA client
+      // takes its own screenshot via screenshotProvider between API turns.
+      // Computed outside the try so the catch can still record a failed step.
+      const shouldLog = action.type !== "screenshot";
       try {
         let executionResult: ActionExecutionResult | undefined;
         // Try to inject cursor before each action if enabled
@@ -142,9 +149,6 @@ export class V3CuaAgentHandler {
           }
         }
         await new Promise((r) => setTimeout(r, 300));
-        // Skip logging for screenshot actions - they're no-ops; the CUA client
-        // takes its own screenshot via screenshotProvider between API turns.
-        const shouldLog = action.type !== "screenshot";
         if (shouldLog) {
           executionResult = await FlowLogger.runWithLogging(
             {
@@ -163,7 +167,10 @@ export class V3CuaAgentHandler {
 
         action.timestamp = Date.now();
         if (shouldLog && this.evidenceCallback) {
-          await this.emitCuaActionStep(action, executionResult);
+          await this.emitCuaActionStep(
+            action,
+            inferToolOutput(executionResult ?? { success: true }),
+          );
         }
 
         await new Promise((r) => setTimeout(r, waitBetween));
@@ -174,6 +181,30 @@ export class V3CuaAgentHandler {
           message: `Error executing action ${action.type}: ${msg}`,
           level: 0,
         });
+        // Record the failed action as an ok:false step (with a best-effort
+        // post-failure probe, since a throwing action can still partially
+        // mutate the page) before rethrowing — otherwise the failure is
+        // dropped from the persisted trajectory. Evidence emission must never
+        // mask the original action error.
+        if (shouldLog && this.evidenceCallback) {
+          try {
+            await this.emitCuaActionStep(action, {
+              ok: false,
+              result: undefined,
+              error: msg,
+            });
+          } catch (evidenceError) {
+            this.logger({
+              category: "agent",
+              message: `Failed to record failed-action evidence: ${
+                evidenceError instanceof Error
+                  ? evidenceError.message
+                  : String(evidenceError)
+              }`,
+              level: 1,
+            });
+          }
+        }
         throw error;
       }
     });
@@ -835,7 +866,7 @@ export class V3CuaAgentHandler {
 
   private async emitCuaActionStep(
     action: AgentAction,
-    result: ActionExecutionResult | undefined,
+    toolOutput: AgentStepFinishedEvent["toolOutput"],
   ): Promise<void> {
     let pageUrl =
       typeof action.pageUrl === "string"
@@ -848,7 +879,9 @@ export class V3CuaAgentHandler {
     }
 
     const actionArgs = Object.fromEntries(
-      Object.entries(action).filter(([key]) => key !== "screenshot"),
+      Object.entries(action).filter(
+        ([key]) => key !== "screenshot" && key !== "timestamp",
+      ),
     );
     const reasoning =
       typeof action.reasoning === "string"
@@ -862,7 +895,7 @@ export class V3CuaAgentHandler {
       actionName: String(action.type),
       actionArgs,
       reasoning,
-      toolOutput: inferToolOutput(result ?? { success: true }),
+      toolOutput,
     });
 
     // Post-action tier-2 probe. The pre-action screenshot from
diff --git a/packages/core/lib/v3/types/public/agentEvidenceEvents.ts b/packages/core/lib/v3/types/public/agentEvidenceEvents.ts
index d74b56a67c..30c4c2f2a2 100644
--- a/packages/core/lib/v3/types/public/agentEvidenceEvents.ts
+++ b/packages/core/lib/v3/types/public/agentEvidenceEvents.ts
@@ -3,10 +3,12 @@
  *
  * These events describe observations made by Stagehand during an agent run.
  * They are emitted in temporal order; consumers should treat the stream as
- * sequential (pair an agent-role screenshot with the next step_finished,
- * apply a step_observed/probe to all steps_finished since the last probe).
- * Verifier-specific storage and normalization live in the evals/verifier
- * layers.
+ * sequential. An agent-role screenshot applies to every subsequent
+ * step_finished until a newer agent-role screenshot replaces it — a CUA
+ * provider may choose multiple actions from a single screenshot, so each of
+ * those steps shares that frame. A step_observed/probe applies to all
+ * step_finished events received since the last probe. Verifier-specific
+ * storage and normalization live in the evals/verifier layers.
  */
 
 export type AgentEvidenceRole = "probe" | "agent";
diff --git a/packages/core/lib/v3/verifier/trajectory.ts b/packages/core/lib/v3/verifier/trajectory.ts
index 1465d363e2..4902f1f63c 100644
--- a/packages/core/lib/v3/verifier/trajectory.ts
+++ b/packages/core/lib/v3/verifier/trajectory.ts
@@ -269,9 +269,11 @@ export async function writeTrajectoryDir(
 
   const serializableSteps: unknown[] = [];
   // A single post-turn probe is fanned across every step of a multi-tool turn,
-  // so the same screenshot Buffer is shared by reference. Dedupe by identity:
-  // write the PNG once and point every sharing step's screenshotPath at it.
+  // and a single agent screenshot is shared across every action a CUA provider
+  // chose from it, so the same Buffer is shared by reference. Dedupe by
+  // identity: write the PNG once and point every sharing step at the same file.
   const probePathByBuffer = new Map<Buffer, string>();
+  const agentPathByBuffer = new Map<Buffer, string>();
   for (const [i, step] of trajectory.steps.entries()) {
     const probe: ProbeEvidence = { ...step.probeEvidence };
     if (probe.screenshot) {
@@ -303,9 +305,13 @@ export async function writeTrajectoryDir(
         );
         continue;
       }
-      const suffix = multipleImages ? `_${imageSeq}` : "";
-      const relPath = `screenshots/agent/${i + 1}${suffix}.png`;
-      await fs.writeFile(path.join(dir, relPath), m.bytes);
+      let relPath = agentPathByBuffer.get(m.bytes);
+      if (!relPath) {
+        const suffix = multipleImages ? `_${imageSeq}` : "";
+        relPath = `screenshots/agent/${i + 1}${suffix}.png`;
+        await fs.writeFile(path.join(dir, relPath), m.bytes);
+        agentPathByBuffer.set(m.bytes, relPath);
+      }
       modalities.push({
         type: "image",
         imagePath: relPath,
diff --git a/packages/core/tests/unit/agent-captcha-hooks.test.ts b/packages/core/tests/unit/agent-captcha-hooks.test.ts
index 4789fb5c63..62b8d38246 100644
--- a/packages/core/tests/unit/agent-captcha-hooks.test.ts
+++ b/packages/core/tests/unit/agent-captcha-hooks.test.ts
@@ -557,4 +557,59 @@ describe("v3 cua handler screenshot behavior", () => {
       ),
     ).toBe(true);
   });
+
+  it("records a failed action as step_finished {ok:false} and rethrows the original error", async () => {
+    const events: Array<{ type: string; [k: string]: unknown }> = [];
+    const onEvidence = vi.fn(async (event: { type: string }) => {
+      events.push(event as { type: string });
+    });
+
+    fakeCuaClient.executeImpl = vi.fn(async () => {
+      await fakeCuaClient.actionHandler?.({
+        type: "click",
+        button: "left",
+        x: 5,
+        y: 9,
+      });
+      return { success: true, message: "ok", actions: [], completed: true };
+    });
+
+    const handler = new V3CuaAgentHandler(
+      {
+        context: {
+          awaitActivePage: async () => page,
+        },
+        isCaptchaAutoSolveEnabled: false,
+        isAdvancedStealth: false,
+        configuredViewport: { width: 1288, height: 711 },
+        isAgentReplayActive: () => false,
+        updateMetrics: vi.fn(),
+      } as never,
+      logger,
+      {
+        modelName: "openai/gpt-5.4",
+        clientOptions: { waitBetweenActions: 1 },
+      } as never,
+    );
+    vi.spyOn(
+      handler as unknown as {
+        executeAction: (action: Record<string, unknown>) => Promise<unknown>;
+      },
+      "executeAction",
+    ).mockRejectedValue(new Error("click failed"));
+
+    await expect(
+      handler.execute({
+        instruction: "click the thing",
+        highlightCursor: false,
+        callbacks: { onEvidence },
+      }),
+    ).rejects.toThrow("click failed");
+
+    const stepFinished = events.find((e) => e.type === "step_finished");
+    expect(stepFinished).toMatchObject({
+      actionName: "click",
+      toolOutput: { ok: false, error: "click failed" },
+    });
+  });
 });
diff --git a/packages/evals/framework/trajectoryRecorder.ts b/packages/evals/framework/trajectoryRecorder.ts
index d24b53ae9e..789bb72f4b 100644
--- a/packages/evals/framework/trajectoryRecorder.ts
+++ b/packages/evals/framework/trajectoryRecorder.ts
@@ -59,11 +59,12 @@ export class TrajectoryRecorder {
 
   // Steps are appended in arrival order on each step_finished event.
   private readonly steps: TrajectoryStep[] = [];
-  // The most recent agent-role screenshot is held until the next step_finished
-  // consumes it. A second agent-role screenshot before any step_finished
-  // overwrites the first — that's the desired behavior when a turn is skipped
-  // (e.g., captcha guard short-circuits before emitting step_finished).
-  private pendingAgentScreenshot?: Buffer;
+  // The most recent agent-role screenshot. It applies to every step_finished
+  // until a newer agent-role screenshot replaces it — a CUA provider can pick
+  // multiple actions from one screenshot, so each of those steps must carry
+  // that same tier-1 frame. (It is NOT cleared on consume; it is only replaced
+  // by a newer screenshot, or wiped on cancel().)
+  private latestAgentScreenshot?: Buffer;
   // The most recent probe-role screenshot waits for the matching step_observed.
   private pendingProbeScreenshot?: Buffer;
   // Steps that haven't yet had a probe attached. The next step_observed fans
@@ -74,7 +75,7 @@ export class TrajectoryRecorder {
 
   private onScreenshot(e: AgentScreenshotEvidenceEvent): void {
     if (e.evidenceRole === "agent") {
-      this.pendingAgentScreenshot = e.screenshot;
+      this.latestAgentScreenshot = e.screenshot;
     } else {
       this.pendingProbeScreenshot = e.screenshot;
     }
@@ -82,10 +83,10 @@ export class TrajectoryRecorder {
 
   private onStepFinished(e: AgentStepFinishedEvent): void {
     const modalities: AgentEvidence["modalities"] = [];
-    if (this.pendingAgentScreenshot) {
+    if (this.latestAgentScreenshot) {
       modalities.push({
         type: "image",
-        bytes: this.pendingAgentScreenshot,
+        bytes: this.latestAgentScreenshot,
         mediaType: "image/png",
       });
     }
@@ -94,7 +95,9 @@ export class TrajectoryRecorder {
       buildAgentEvidenceFromStepFinished(e),
     );
 
-    this.pendingAgentScreenshot = undefined;
+    // Intentionally not cleared here: the same agent screenshot applies to
+    // every step in a batched CUA turn. It's replaced when a newer agent
+    // screenshot arrives (onScreenshot) or wiped on cancel().
     this.stepsAwaitingProbe.push(this.steps.length);
     this.steps.push({
       actionName: e.actionName,
@@ -188,7 +191,7 @@ export class TrajectoryRecorder {
   /** Throw away in-memory state without writing to disk. Used on early abort. */
   cancel(): void {
     this.steps.length = 0;
-    this.pendingAgentScreenshot = undefined;
+    this.latestAgentScreenshot = undefined;
     this.pendingProbeScreenshot = undefined;
     this.stepsAwaitingProbe = [];
     this.finalAnswerEvent = undefined;
diff --git a/packages/evals/tests/framework/trajectoryRecorder.test.ts b/packages/evals/tests/framework/trajectoryRecorder.test.ts
index 4dcf379e86..320c4a5259 100644
--- a/packages/evals/tests/framework/trajectoryRecorder.test.ts
+++ b/packages/evals/tests/framework/trajectoryRecorder.test.ts
@@ -159,6 +159,13 @@ describe("TrajectoryRecorder", () => {
         { type: "text", content: "Open fare details." },
       ]),
     );
+    // Both actions were chosen from the same agent screenshot (one screenshot,
+    // two step_finished), so the second step must carry that frame too.
+    expect(trajectory.steps[1].agentEvidence.modalities).toEqual(
+      expect.arrayContaining([
+        { type: "image", bytes: screenshot, mediaType: "image/png" },
+      ]),
+    );
     expect(trajectory.finalAnswer).toBe("Business is $150 more than economy.");
     expect(trajectory.finalObservation).toMatchObject({
       url: "https://example.com/complete",

From 2c95836395410e258e18de6cb1597cea6cfee8b7 Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Mon, 25 May 2026 08:41:23 -0700
Subject: [PATCH 27/27] perf(cua): gate emitCuaScreenshot on evidenceCallback

screenshotProvider/captureAndSendScreenshot call emitCuaScreenshot
unconditionally; early-return when no recorder is attached so a plain CUA
run does no extra work (the lastAgentScreenshotUrl bookkeeping is only read
by evidence-gated code). Mirrors the emitCuaActionStep call-site gating.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 packages/core/lib/v3/handlers/v3CuaAgentHandler.ts | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts
index d4b359f8e3..22513339c0 100644
--- a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts
+++ b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts
@@ -855,8 +855,9 @@ export class V3CuaAgentHandler {
     screenshot: Buffer,
     url: string,
   ): Promise<void> {
+    if (!this.evidenceCallback) return;
     this.lastAgentScreenshotUrl = url;
-    await this.evidenceCallback?.({
+    await this.evidenceCallback({
       type: "screenshot",
       screenshot,
       url,