Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
6f27af2
feat(verifier): record agent trajectories
miguelg719 May 15, 2026
40e7ab3
fix(verifier): align trajectory naming
miguelg719 May 15, 2026
c25367b
chore(evals): remove upstream trajectory references
miguelg719 May 15, 2026
8e9962c
docs(verifier): remove rollout comments from trajectory capture
miguelg719 May 15, 2026
bb514e3
test(evals): cover trajectory recorder in vitest
miguelg719 May 16, 2026
9138ddf
docs(verifier): trim trajectory event comments
miguelg719 May 16, 2026
1303315
refactor(verifier): extract writeTrajectoryDir + shouldPersistTrajectory
miguelg719 May 19, 2026
10b03ca
style(recorder): prettier — collapse runId fallback onto one line
miguelg719 May 19, 2026
6caeb1b
fix(verifier): guard bus.listenerCount and align export-surface snapshot
miguelg719 May 19, 2026
16669e8
refactor(verifier): collect evidence via agent callbacks
miguelg719 May 21, 2026
b493fa7
fix(cua): keep screenshot provider evidence non-fatal
miguelg719 May 21, 2026
043b3e1
fix(verifier): hydrate persisted agent image paths
miguelg719 May 21, 2026
8596d2a
fix(evals): avoid useless task data assignment
miguelg719 May 21, 2026
2ba6c1f
test(agent): drop stale bus mocks
miguelg719 May 21, 2026
2780db2
fix(verifier): redact inline screenshot payloads
miguelg719 May 22, 2026
25fadb1
refactor(verifier): centralize trajectory evidence handling
miguelg719 May 22, 2026
3dfa861
fix(agent): make onEvidence non-fatal by wrapping at boundary
miguelg719 May 22, 2026
4d203ca
test(agent): update warning-message assertion to generic onEvidence l…
miguelg719 May 23, 2026
2418db3
fix(verifier): preserve final evidence observations
miguelg719 May 22, 2026
1252462
Remove verifier trajectory timestamps
miguelg719 May 23, 2026
b4a1537
refactor(verifier): simplify evidence event sequencing
miguelg719 May 24, 2026
d6fb72b
refactor(verifier): tighten evidence event types and recorder
miguelg719 May 24, 2026
754a54b
refactor(verifier): drop unused inferCuaToolOutput
miguelg719 May 24, 2026
a748399
only emit step when evidenceCallback is provided
miguelg719 May 24, 2026
61160d9
perf(verifier): dedupe shared probe screenshots in writeTrajectoryDir
miguelg719 May 24, 2026
98bd986
fix(cua,verifier): record failed actions + share agent screenshot acr…
miguelg719 May 25, 2026
2c95836
perf(cua): gate emitCuaScreenshot on evidenceCallback
miguelg719 May 25, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .changeset/verifier-trajectory-events.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"@browserbasehq/stagehand": patch
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need these three different patch files or just 1?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can consolidate at the end if we want to have all be 1 patch

---

Capture verifier trajectory evidence from agent evidence callbacks for offline scoring.
73 changes: 73 additions & 0 deletions packages/core/lib/v3/agent/utils/captureAriaTreeProbe.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
/**
* captureAriaTreeProbe — capture a truncated accessibility tree of the active
* page for use as tier-2 evidence in the trajectory recorder.
*
* Shared by v3AgentHandler and v3CuaAgentHandler. Callers gate the probe on
* an evidence callback being supplied, so ordinary agent runs (no
* TrajectoryRecorder attached) don't pay the cost.
*
* The a11y tree is the same payload the agent's `ariaTree` tool sees, but
* captured by the harness (not the agent) so the verifier has independent
* textual ground truth for grounding non-visual claims — prices, names,
* dates, list contents — without OCR'ing screenshots.
*
* Budget: defaults to ~8000 tokens (32k chars). Per-step a11y captures
* across a ~30-step trajectory at that cap sum to ~240k tokens total,
* which the verifier handles via per-criterion top-K selection. The cap
* is configurable via VERIFIER_ARIATREE_TOKEN_BUDGET so consumers can
* trade RAM/disk for fidelity. Truncated content is marked explicitly so
* the verifier knows it was clipped.
*/
import type { V3 } from "../../v3.js";

/** Rough characters-per-token ratio used to turn the token budget into a character cap. */
const APPROX_CHARS_PER_TOKEN = 4;
/** Token budget used when neither the caller nor VERIFIER_ARIATREE_TOKEN_BUDGET supplies a usable one. */
const DEFAULT_TOKEN_BUDGET = 8_000;
/** Timeout, in milliseconds, forwarded to v3.extract() when opts.timeoutMs is not set. */
const DEFAULT_TIMEOUT_MS = 5_000;

/** Optional overrides for captureAriaTreeProbe; omitted fields fall back to module defaults. */
interface CaptureAriaTreeOptions {
/**
 * Soft cap on token count (chars/4 approximation). Default 8000, unless the
 * VERIFIER_ARIATREE_TOKEN_BUDGET environment variable holds a positive integer.
 */
tokenBudget?: number;
/** Hard timeout on the capture, passed as the `timeout` option of v3.extract(). Default 5s. */
timeoutMs?: number;
}

/**
 * Captures the active page's accessibility tree as plain text, clipped to a
 * token budget.
 *
 * Budget resolution order: `opts.tokenBudget`, then the
 * VERIFIER_ARIATREE_TOKEN_BUDGET environment variable, then
 * DEFAULT_TOKEN_BUDGET. A candidate is only accepted when it is a finite
 * number greater than zero; anything else falls through to the next source,
 * so a zero, negative or NaN budget can never reach the slice below.
 *
 * @param v3 - Instance whose schema-less `extract()` call supplies `pageText`.
 * @param opts - Optional budget / timeout overrides.
 * @returns The (possibly truncated) tree text, or `undefined` when the capture
 *   throws or yields no text. Never throws — a11y capture is best-effort
 *   tier-2 evidence, not a hard requirement, so failures are deliberately
 *   absorbed here rather than propagated to the agent loop.
 */
export async function captureAriaTreeProbe(
  v3: V3,
  opts: CaptureAriaTreeOptions = {},
): Promise<string | undefined> {
  const isUsableBudget = (candidate: number | undefined): candidate is number =>
    typeof candidate === "number" &&
    Number.isFinite(candidate) &&
    candidate > 0;

  const envBudget = parseInt(
    process.env.VERIFIER_ARIATREE_TOKEN_BUDGET ?? "",
    10,
  );
  let tokenBudget = DEFAULT_TOKEN_BUDGET;
  if (isUsableBudget(opts.tokenBudget)) {
    tokenBudget = opts.tokenBudget;
  } else if (isUsableBudget(envBudget)) {
    tokenBudget = envBudget;
  }
  const timeoutMs = opts.timeoutMs ?? DEFAULT_TIMEOUT_MS;
  const maxChars = Math.floor(tokenBudget * APPROX_CHARS_PER_TOKEN);

  try {
    // v3.extract() without a schema returns { pageText } where pageText is the
    // rendered accessibility tree — same path the agent's ariaTree tool uses.
    const result = await v3.extract({ timeout: timeoutMs });
    const pageText = result?.pageText;
    if (typeof pageText !== "string" || pageText.length === 0) return undefined;

    if (pageText.length <= maxChars) return pageText;

    // Don't cut a surrogate pair in half: when the last kept code unit is a
    // high surrogate, drop it so the clipped text remains well-formed UTF-16.
    const lastKept = pageText.charCodeAt(maxChars - 1);
    const cutAt =
      lastKept >= 0xd800 && lastKept <= 0xdbff ? maxChars - 1 : maxChars;

    return (
      pageText.slice(0, cutAt) +
      `\n\n[CONTENT TRUNCATED at ~${tokenBudget} tokens — set VERIFIER_ARIATREE_TOKEN_BUDGET to raise]`
    );
  } catch {
    // Best-effort probe: a failed capture is reported as "no tree".
    return undefined;
  }
}
81 changes: 81 additions & 0 deletions packages/core/lib/v3/agent/utils/postStepProbeEvidence.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
import type { AgentEvidenceCallback } from "../../types/public/agentEvidenceEvents.js";
import type { LogLine } from "../../types/public/logs.js";
import type { V3 } from "../../v3.js";
import { captureAriaTreeProbe } from "./captureAriaTreeProbe.js";

/** Inputs for captureProbeEvidence. */
interface CaptureProbeEvidenceOptions {
/** Instance used to resolve the active page and to capture the accessibility tree. */
v3: V3;
/** URL reported for the probe when the active page's own URL could not be read. */
url: string;
/** Sink for the warning emitted when resolving the page or taking the screenshot fails. */
logger: (message: LogLine) => void;
/** Prefix of that warning; the underlying error text is appended after ": ". */
warningMessage: string;
}

/** Inputs for emitPostStepProbeEvidence: the probe options plus the evidence sink. */
interface EmitPostStepProbeEvidenceOptions extends CaptureProbeEvidenceOptions {
/** Receiver of the probe events; when absent, emitPostStepProbeEvidence returns without probing. */
evidenceCallback?: AgentEvidenceCallback;
}

/** Turns any thrown value into text: an Error's `message`, otherwise `String(value)`. */
function errorMessage(error: unknown): string {
  if (error instanceof Error) {
    return error.message;
  }
  return String(error);
}

/**
 * Gathers one probe of the active page: its URL, a viewport-only screenshot,
 * and an accessibility-tree snapshot.
 *
 * Resolving the page or taking the screenshot is allowed to fail: the failure
 * is logged at level 1 with `warningMessage` as prefix and the probe carries
 * on. The returned `url` is the caller-supplied one unless the active page
 * was resolved, and the `screenshot` / `ariaTree` keys are only present when
 * the corresponding capture produced a value.
 */
export async function captureProbeEvidence(
  options: CaptureProbeEvidenceOptions,
): Promise<{
  url: string;
  screenshot?: Buffer;
  ariaTree?: string;
}> {
  const { v3, logger, warningMessage } = options;
  const probe: { url: string; screenshot?: Buffer; ariaTree?: string } = {
    url: options.url,
  };

  try {
    const activePage = await v3.context.awaitActivePage();
    // Prefer the live URL as soon as the page is known, even if the
    // screenshot below fails afterwards.
    probe.url = activePage.url();
    const shot = await activePage.screenshot({ fullPage: false });
    if (shot) {
      probe.screenshot = shot;
    }
  } catch (failure) {
    logger({
      category: "agent",
      message: `${warningMessage}: ${errorMessage(failure)}`,
      level: 1,
    });
  }

  // The tree capture runs regardless of how the screenshot attempt went.
  const tree = await captureAriaTreeProbe(v3);
  if (tree !== undefined) {
    probe.ariaTree = tree;
  }
  return probe;
}

/**
 * Probes the page after a step and forwards the result to the evidence
 * callback.
 *
 * Does nothing when no callback was supplied, so the probe is never captured
 * for runs that do not collect evidence. Otherwise it emits, in order, a
 * "screenshot" event with `evidenceRole: "probe"` (only when a screenshot was
 * captured) followed by a "step_observed" event carrying the probe URL and the
 * accessibility tree, which may be undefined.
 */
export async function emitPostStepProbeEvidence(
  options: EmitPostStepProbeEvidenceOptions,
): Promise<void> {
  const emit = options.evidenceCallback;
  if (!emit) return;

  const { url, screenshot, ariaTree } = await captureProbeEvidence({
    v3: options.v3,
    url: options.url,
    logger: options.logger,
    warningMessage: options.warningMessage,
  });

  if (screenshot) {
    await emit({
      type: "screenshot",
      screenshot,
      url,
      evidenceRole: "probe",
    });
  }

  await emit({
    type: "step_observed",
    url,
    ariaTree,
  });
}
76 changes: 76 additions & 0 deletions packages/core/lib/v3/agent/utils/toolOutputEvidence.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
import type { AgentStepFinishedEvent } from "../../types/public/agentEvidenceEvents.js";

/**
 * Maximum number of characters kept from a serialized (non-string, non-Error)
 * error value before normalizeError cuts it and appends "... [truncated]".
 */
const ERROR_STRING_LIMIT = 1000;

/** Type guard: true for any non-null object that is not an array. */
function isRecord(value: unknown): value is Record<string, unknown> {
  if (typeof value !== "object" || value === null) {
    return false;
  }
  return !Array.isArray(value);
}

/** True when `key` is an own (non-inherited) property of `value`, even if its value is undefined. */
function hasOwn(value: Record<string, unknown>, key: string): boolean {
  const descriptor = Object.getOwnPropertyDescriptor(value, key);
  return descriptor !== undefined;
}

/**
 * Converts an arbitrary `error` field into a message string.
 *
 * - `undefined`, `null` and `false` mean "no error" and yield `undefined`.
 * - Error instances yield their `message`; strings are returned unchanged.
 * - Numbers, booleans (i.e. `true`) and bigints are stringified.
 * - Anything else is JSON-serialized, falling back to `String(value)` when
 *   serialization throws or produces nothing, and is clipped to
 *   ERROR_STRING_LIMIT characters with a "... [truncated]" suffix.
 */
function normalizeError(value: unknown): string | undefined {
  if (value === null || value === undefined || value === false) {
    return undefined;
  }
  if (value instanceof Error) {
    return value.message;
  }

  switch (typeof value) {
    case "string":
      return value;
    case "number":
    case "boolean":
    case "bigint":
      return String(value);
    default:
      break;
  }

  let text: string;
  try {
    text = JSON.stringify(value) ?? String(value);
  } catch {
    text = String(value);
  }

  return text.length > ERROR_STRING_LIMIT
    ? `${text.slice(0, ERROR_STRING_LIMIT)}... [truncated]`
    : text;
}

/**
 * Lists the records that may carry status fields for a tool result: the
 * result itself and, when it is also a record, its `output` property.
 * Returns an empty list when the result is not a record at all.
 */
function statusCandidates(toolResult: unknown): Record<string, unknown>[] {
  if (!isRecord(toolResult)) return [];

  const nested = toolResult.output;
  return isRecord(nested) ? [toolResult, nested] : [toolResult];
}

/**
 * Derives the `toolOutput` summary of a finished step from a raw tool result.
 *
 * Inspects the result itself and, when present, its nested `output` record.
 * The step is reported as not ok when any of those records has an own `error`
 * property that normalizes to a message, has `success === false`, or has a
 * truthy `isError`.
 *
 * @param toolResult - Raw value returned by the tool; passed through untouched
 *   as `result`.
 * @returns `ok`, the original `result`, and `error` — the first normalized
 *   error message found (top-level record first), or `undefined` when none.
 */
export function inferToolOutput(
  toolResult: unknown,
): AgentStepFinishedEvent["toolOutput"] {
  const candidates = statusCandidates(toolResult);

  // Only an own `error` key counts, so inherited properties are never
  // mistaken for a reported failure.
  const error = candidates
    .map((candidate) =>
      hasOwn(candidate, "error") ? normalizeError(candidate.error) : undefined,
    )
    .find((message): message is string => message !== undefined);

  const successFalse = candidates.some(
    (candidate) => candidate.success === false,
  );
  const isError = candidates.some((candidate) => Boolean(candidate.isError));

  return {
    ok: error === undefined && !isError && !successFalse,
    result: toolResult,
    error,
  };
}
27 changes: 27 additions & 0 deletions packages/core/lib/v3/agent/utils/wrapEvidenceCallback.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import type { AgentEvidenceCallback } from "../../types/public/agentEvidenceEvents.js";
import type { LogLine } from "../../types/public/logs.js";

/**
 * Makes a user-supplied onEvidence hook safe to call from the agent loop.
 *
 * onEvidence is an observability hook (trajectory recording, verifier
 * capture, etc.). It is wrapped once, at the boundary where the handler
 * receives it, so that a throwing or rejecting callback is logged at level 1
 * instead of aborting the run — emit sites can then call the wrapped callback
 * directly without their own try/catch.
 *
 * @returns `undefined` when no callback was given, otherwise the guarded
 *   callback.
 */
export function wrapEvidenceCallback(
  callback: AgentEvidenceCallback | undefined,
  logger: (message: LogLine) => void,
): AgentEvidenceCallback | undefined {
  if (!callback) {
    return undefined;
  }

  const guarded: AgentEvidenceCallback = async (event) => {
    try {
      await callback(event);
    } catch (failure) {
      const reason =
        failure instanceof Error ? failure.message : String(failure);
      logger({
        category: "agent",
        message: `Warning: onEvidence callback failed for ${event.type}: ${reason}`,
        level: 1,
      });
    }
  };
  return guarded;
}
Loading
Loading