Skip to content
Open
37 changes: 37 additions & 0 deletions packages/core/lib/v3/verifier/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -494,3 +494,40 @@ export interface ParseFailureStepNumbersOptions {
/** Optional inclusive upper bound for accepted step numbers. */
maxStep?: number;
}

/** Construction options for the rubric verifier. */
export interface RubricVerifierOptions {
  /**
   * Factory producing a configured LLMClient. It is invoked once per pipeline
   * step, which lets callers hand back a different client for each step.
   */
  getClient: () => LLMClient;
  /**
   * Sink for log lines. When omitted, a no-op is used so the verifier stays
   * quiet inside V3Evaluator.
   */
  logger?: (line: LogLine) => void;
}

/** One sub-category entry of the error taxonomy. */
export interface ErrorTaxonomySubCategory {
  /**
   * Sub-code identifying the entry, e.g. "2.3".
   */
  code: string;
  /**
   * Human-readable name, e.g. "Output fabrication".
   */
  name: string;
  /**
   * Detailed description ported from the .md source; Markdown formatting is
   * kept as-is.
   */
  description: string;
}

/** One top-level category of the error taxonomy. */
export interface ErrorTaxonomyCategory {
  /**
   * Top-level category number (1-8).
   */
  number: number;
  /**
   * Top-level category name, e.g. "Hallucination Errors".
   */
  name: string;
  /**
   * One-sentence summary of what the category covers.
   */
  summary: string;
  /**
   * Sub-categories of this category. The final entry is always an "Other"
   * catch-all.
   */
  subCategories: ErrorTaxonomySubCategory[];
}

/** Limits applied while parsing failure step numbers. */
export interface ParseFailureStepNumbersOptions {
  /**
   * Cap on how many unique step numbers may be expanded from ranges. Guards
   * the verifier against malformed model output such as "0-2147483647".
   */
  maxExpandedSteps?: number;
  /**
   * Optional upper bound (inclusive) on the step numbers that are accepted.
   */
  maxStep?: number;
}
159 changes: 159 additions & 0 deletions packages/evals/framework/verifierAdapter.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
import {
V3Evaluator,
normalizeRubric,
type AgentInstance,
type AgentExecuteOptions,
type AgentResult,
type EvaluationResult,
type Rubric,
type TaskSpec,
type Trajectory,
type V3,
} from "@browserbasehq/stagehand";

import { RubricCache } from "./rubricCache.js";
import { TrajectoryRecorder } from "./trajectoryRecorder.js";

/** Inputs accepted by `runWithVerifier`. */
export interface RunWithVerifierOptions {
  /** V3 instance the verifier-backed evaluator is built on. */
  v3: V3;
  /** Agent whose `execute()` call is recorded. */
  agent: AgentInstance;
  /** Task to run; its `instruction` is what the agent is asked to do. */
  taskSpec: TaskSpec;
  /**
   * Name of the dataset, used to partition the rubric cache. Every task is
   * stored under `.rubric-cache/<dataset>/<task-id>.json`.
   */
  dataset: string;
  /**
   * Options forwarded to the agent's execute call. `instruction` is excluded
   * because it is always taken from taskSpec.instruction.
   */
  agentOptions?: Omit<AgentExecuteOptions, "instruction">;
  /**
   * Explicit run id; when omitted an ISO timestamp is used.
   */
  runId?: string;
  /**
   * Alternative root directory for trajectory persistence.
   */
  trajectoryRoot?: string;
}

/** Everything `runWithVerifier` hands back after a completed run. */
export interface RunWithVerifierResult {
  /** Trajectory recorded around the agent's execution. */
  trajectory: Trajectory;
  /** Verifier verdict for that trajectory. */
  evaluationResult: EvaluationResult;
  /** Value returned by the agent's execute call. */
  agentResult: AgentResult;
  /**
   * The rubric that was actually used: precomputed, cached, or freshly
   * generated.
   */
  rubric: Rubric;
  /**
   * Directory the trajectory was persisted to (or would have been, had
   * persistence not been disabled).
   */
  trajectoryDir: string;
}

/**
 * Runs `agent` on `taskSpec` while recording its trajectory, then scores the
 * recorded trajectory with a verifier-backed V3Evaluator.
 *
 * Rubric resolution order:
 *   1. `taskSpec.precomputedRubric`, when it normalizes to a usable rubric;
 *   2. fresh generation, when `VERIFIER_DISABLE_RUBRIC_CACHE === "1"`;
 *   3. the per-dataset RubricCache (get or generate) otherwise.
 *
 * @param opts See {@link RunWithVerifierOptions}.
 * @returns The trajectory, verifier result, agent result, resolved rubric and
 *   the recorder's directory.
 * @throws Re-throws whatever `agent.execute()` threw (non-Error values are
 *   wrapped in an Error). The thrown error is decorated with `trajectoryDir`,
 *   with `trajectory` when the failed run could still be finalized, and with
 *   `trajectoryPersistError` when finalizing the recorder itself failed.
 */
export async function runWithVerifier(
  opts: RunWithVerifierOptions,
): Promise<RunWithVerifierResult> {
  const { v3, agent, taskSpec, dataset, agentOptions, runId, trajectoryRoot } =
    opts;
  const evaluator = new V3Evaluator(v3, { backend: "verifier" });

  // ── Resolve rubric ──────────────────────────────────────────────────────
  // Normalization may yield nothing for a malformed rubric; in that case fall
  // through to generation instead of asserting a value that is not there.
  const precomputedRubric = taskSpec.precomputedRubric
    ? normalizeRubric(taskSpec.precomputedRubric)
    : undefined;
  let resolvedRubric: Rubric;
  if (precomputedRubric) {
    resolvedRubric = precomputedRubric;
  } else if (process.env.VERIFIER_DISABLE_RUBRIC_CACHE === "1") {
    resolvedRubric = await evaluator.generateRubric(taskSpec);
  } else {
    const cache = new RubricCache({ dataset });
    resolvedRubric = await cache.getOrGenerate(taskSpec, evaluator);
  }

  // Hand a fully-hydrated TaskSpec to the verifier so it doesn't regenerate.
  const hydratedTaskSpec: TaskSpec = {
    ...taskSpec,
    precomputedRubric: resolvedRubric,
  };

  // ── Record trajectory around agent.execute() ───────────────────────────
  const recorder = new TrajectoryRecorder({
    taskSpec: hydratedTaskSpec,
    runId,
    outputRoot: trajectoryRoot,
  });
  const { callbacks: userCallbacks, ...restAgentOptions } = agentOptions ?? {};

  let agentResult: AgentResult;
  try {
    agentResult = await agent.execute({
      ...restAgentOptions,
      instruction: taskSpec.instruction,
      callbacks: {
        ...userCallbacks,
        // Record first, then forward to the caller's own handler (if any).
        onEvidence: async (event) => {
          recorder.record(event);
          await userCallbacks?.onEvidence?.(event);
        },
      },
    });
  } catch (e) {
    const wrapped = e instanceof Error ? e : new Error(String(e));

    // Persisting the failed run is best-effort: if finish() rejects, the
    // agent's error must still be the one that propagates, so the persistence
    // failure is attached to it rather than thrown in its place.
    const failureContext: Record<string, unknown> = {};
    try {
      failureContext.trajectory = await recorder.finish({ status: "error" });
    } catch (persistError) {
      failureContext.trajectoryPersistError = persistError;
    }
    failureContext.trajectoryDir = recorder.directory;

    // Re-throw after persisting so the bench task can decide how to report.
    Object.assign(wrapped, failureContext);
    throw wrapped;
  }

  const trajectory = await recorder.finish({
    status: "complete",
    finalAnswer: agentResult.message,
    usage: agentResult.usage,
  });

  // ── Verify ──────────────────────────────────────────────────────────────
  const evaluationResult = await evaluator.verify(trajectory);
  await recorder.persistResult(evaluationResult);

  return {
    trajectory,
    evaluationResult,
    agentResult,
    rubric: resolvedRubric,
    trajectoryDir: recorder.directory,
  };
}

/**
 * Values accepted by the --success flag, which selects how bench task success
 * is derived from an EvaluationResult.
 *
 * `outcome` (default) — strict binary outcome.
 * `process` — rubric process score ≥ threshold (default 0.8).
 * `both` — both conditions must hold.
 */
export type EvalSuccessMode = "outcome" | "process" | "both";

/**
 * Coerces an arbitrary value into an EvalSuccessMode.
 *
 * Strings are trimmed and lower-cased before matching. Anything that is not a
 * string, or does not match a known mode, resolves to "outcome".
 *
 * @param mode Raw value, typically straight from the --success flag.
 * @returns The matching mode, or "outcome" as the fallback.
 */
export function resolveEvalSuccessMode(mode: unknown): EvalSuccessMode {
  if (typeof mode === "string") {
    const candidate = mode.trim().toLowerCase();
    switch (candidate) {
      case "outcome":
      case "process":
      case "both":
        return candidate;
      default:
        break;
    }
  }
  return "outcome";
}

/**
 * Turns an EvaluationResult into a single pass/fail for a bench task.
 *
 * @param result Verifier result to judge.
 * @param mode Raw success mode; resolved via resolveEvalSuccessMode, so
 *   unknown values behave like "outcome".
 * @param processThreshold Minimum process score (inclusive) for the process
 *   condition to hold. Defaults to 0.8.
 * @returns For "outcome", `result.outcomeSuccess`; for "process", whether
 *   `result.processScore` is a number at or above the threshold; for "both",
 *   the conjunction of the two.
 */
export function evaluationResultToSuccess(
  result: EvaluationResult,
  mode: unknown = "outcome",
  processThreshold = 0.8,
): boolean {
  const successMode = resolveEvalSuccessMode(mode);

  const passedOutcome = result.outcomeSuccess;
  // A missing (non-numeric) process score never satisfies the process check.
  const passedProcess =
    typeof result.processScore === "number" &&
    result.processScore >= processThreshold;

  if (successMode === "outcome") return passedOutcome;
  if (successMode === "process") return passedProcess;
  return passedOutcome && passedProcess;
}
10 changes: 5 additions & 5 deletions packages/evals/scripts/backfill-webtailbench-rubrics.ts
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ const JSONL_PATH = path.join(
"WebTailBench_data.jsonl",
);

interface Rubric {
interface RawRubric {
items: Array<Record<string, unknown>>;
}

Expand All @@ -38,7 +38,7 @@ interface LocalRow {
category?: string;
ques: string;
web?: string;
precomputed_rubric?: Rubric;
precomputed_rubric?: RawRubric;
}

/**
Expand Down Expand Up @@ -114,12 +114,12 @@ async function main(): Promise<void> {
);
}

const rubricsById = new Map<string, Rubric>();
const rubricsById = new Map<string, RawRubric>();
for (let i = 1; i < rows.length; i++) {
const cols = rows[i];
if (!cols[idIdx]) continue;
try {
const parsed = JSON.parse(cols[rubricIdx]) as Rubric;
const parsed = JSON.parse(cols[rubricIdx]) as RawRubric;
rubricsById.set(cols[idIdx], parsed);
} catch (e) {
console.warn(
Expand Down Expand Up @@ -149,7 +149,7 @@ async function main(): Promise<void> {
}

console.log(
` ✓ matched ${matched}/${inLines.length} rows; ${missing} unmatched (will fall back to Step 0a generation)`,
` ✓ matched ${matched}/${inLines.length} rows; ${missing} unmatched (will fall back to generated rubrics)`,
);

await fs.writeFile(JSONL_PATH, out.join("\n") + "\n", "utf8");
Expand Down
27 changes: 25 additions & 2 deletions packages/evals/suites/webtailbench.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import type { Testcase, EvalInput, AgentModelEntry } from "../types/evals.js";
import type { AvailableModel } from "@browserbasehq/stagehand";
import { normalizeRubric, type AvailableModel } from "@browserbasehq/stagehand";
import { tasksConfig } from "../taskConfig.js";
import { getPackageRootDir } from "../runtimePaths.js";
import {
Expand Down Expand Up @@ -32,6 +32,12 @@ export const buildWebTailBenchTestcases = (
ques: string;
category?: string;
web?: string;
/**
* Per-task rubric ported from microsoft/WebTailBench-v1-rubrics.tsv
* via packages/evals/scripts/backfill-webtailbench-rubrics.ts.
* When present, the verifier uses these upstream criteria directly.
*/
precomputed_rubric?: unknown;
[key: string]: unknown;
};

Expand All @@ -42,7 +48,23 @@ export const buildWebTailBenchTestcases = (
}

const candidates = parseJsonlRows(lines, isWebTailBenchRow);
const rows = applySampling(candidates, sampleCount, maxCases);

// EVAL_WEBTAILBENCH_IDS restricts the suite to exactly those task IDs,
// preserving the order given and ignoring sampling / limit knobs.
const explicitIds = process.env.EVAL_WEBTAILBENCH_IDS
? process.env.EVAL_WEBTAILBENCH_IDS.split(",")
.map((s) => s.trim())
.filter(Boolean)
: null;
let rows: WebTailBenchRow[];
if (explicitIds && explicitIds.length > 0) {
const byId = new Map(candidates.map((r) => [r.id, r]));
rows = explicitIds
.map((id) => byId.get(id))
.filter((r): r is WebTailBenchRow => Boolean(r));
} else {
rows = applySampling(candidates, sampleCount, maxCases);
}

const allTestcases: Testcase[] = [];
for (const modelEntry of normalizeAgentModelEntries(models)) {
Expand All @@ -57,6 +79,7 @@ export const buildWebTailBenchTestcases = (
category: row.category,
ques: row.ques,
web: row.web,
precomputed_rubric: normalizeRubric(row.precomputed_rubric),
},
};
const taskCategories =
Expand Down
Loading
Loading