From c7c3883a2f1b467d6c2ba8f59659e084147463eb Mon Sep 17 00:00:00 2001
From: jarugupj <121142710+jarugupj@users.noreply.github.com>
Date: Fri, 26 Jun 2026 14:32:12 +0000
Subject: [PATCH 1/4] Add packages/bench with a single-task CUA model runner

Introduce a private @onkernel/cua-bench workspace that runs one task on one
model against a fresh Kernel browser via CuaAgentHarness, capturing
wall-clock, turn count, and token totals. Accuracy scoring and cost
conversion are left unscored for follow-up work. Includes a spike entrypoint
for a manual run.
---
 package-lock.json                  |  20 ++++-
 package.json                       |   3 +-
 packages/bench/package.json        |  26 ++++++
 packages/bench/src/index.ts        |   2 +
 packages/bench/src/runTask.ts      | 141 +++++++++++++++++++++++++++++
 packages/bench/src/spike.ts        |  21 +++++
 packages/bench/src/types.ts        |  31 +++++++
 packages/bench/tsconfig.build.json |  13 +++
 packages/bench/tsconfig.json       |   3 +
 tsconfig.json                      |   3 +-
 10 files changed, 260 insertions(+), 3 deletions(-)
 create mode 100644 packages/bench/package.json
 create mode 100644 packages/bench/src/index.ts
 create mode 100644 packages/bench/src/runTask.ts
 create mode 100644 packages/bench/src/spike.ts
 create mode 100644 packages/bench/src/types.ts
 create mode 100644 packages/bench/tsconfig.build.json
 create mode 100644 packages/bench/tsconfig.json

diff --git a/package-lock.json b/package-lock.json
index 3e439a7..5383681 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -11,7 +11,8 @@
         "packages/ai",
         "packages/agent",
         "packages/ptywright",
-        "packages/cli"
+        "packages/cli",
+        "packages/bench"
       ],
       "devDependencies": {
         "@types/node": "22.18.4",
@@ -3455,6 +3456,10 @@
       "resolved": "packages/ai",
       "link": true
     },
+    "node_modules/@onkernel/cua-bench": {
+      "resolved": "packages/bench",
+      "link": true
+    },
     "node_modules/@onkernel/cua-cli": {
       "resolved": "packages/cli",
       "link": true
@@ -6100,6 +6105,19 @@
         "vitest": "^3.2.4"
       }
     },
+    "packages/bench": {
+      "name": "@onkernel/cua-bench",
+      "version": "0.0.0",
+      "license": "MIT",
+      "dependencies": {
+        "@onkernel/cua-agent": "*",
+        "@onkernel/cua-ai": "*",
+        "@onkernel/sdk": "0.49.0"
+      },
+      "devDependencies": {
+        "tsx": "^4.21.0"
+      }
+    },
     "packages/cli": {
       "name": "@onkernel/cua-cli",
       "version": "0.1.4",
diff --git a/package.json b/package.json
index 1f5397e..7c2266d 100644
--- a/package.json
+++ b/package.json
@@ -8,7 +8,8 @@
     "packages/ai",
     "packages/agent",
     "packages/ptywright",
-    "packages/cli"
+    "packages/cli",
+    "packages/bench"
   ],
   "scripts": {
     "build": "npm run build --workspace @onkernel/cua-ai && npm run build --workspace @onkernel/cua-agent && tsc -b && npm run build --workspace @onkernel/cua-cli && npm run build:native --workspace @onkernel/ptywright --if-present",
diff --git a/packages/bench/package.json b/packages/bench/package.json
new file mode 100644
index 0000000..a67fb8b
--- /dev/null
+++ b/packages/bench/package.json
@@ -0,0 +1,26 @@
+{
+  "name": "@onkernel/cua-bench",
+  "version": "0.0.0",
+  "description": "Benchmark runner for CUA models on Kernel cloud browsers",
+  "license": "MIT",
+  "type": "module",
+  "private": true,
+  "exports": {
+    ".": {
+      "types": "./dist-tsc/index.d.ts",
+      "source": "./src/index.ts"
+    }
+  },
+  "scripts": {
+    "spike": "NODE_OPTIONS=--conditions=source tsx src/spike.ts",
+    "typecheck": "tsc -b"
+  },
+  "dependencies": {
+    "@onkernel/cua-agent": "*",
+    "@onkernel/cua-ai": "*",
+    "@onkernel/sdk": "0.49.0"
+  },
+  "devDependencies": {
+    "tsx": "^4.21.0"
+  }
+}
diff --git a/packages/bench/src/index.ts b/packages/bench/src/index.ts
new file mode 100644
index 0000000..1b7489e
--- /dev/null
+++ b/packages/bench/src/index.ts
@@ -0,0 +1,2 @@
+export { runTask, type RunTaskOptions } from "./runTask";
+export type { Task, TaskResult, TokenTotals } from "./types";
diff --git a/packages/bench/src/runTask.ts b/packages/bench/src/runTask.ts
new file mode 100644
index 0000000..32356e2
--- /dev/null
+++ b/packages/bench/src/runTask.ts
@@ -0,0 +1,141 @@
+import { type AgentHarnessEvent, CuaAgentHarness, JsonlSessionRepo, NodeExecutionEnv } from "@onkernel/cua-agent";
+import { type CuaModelRef, getCuaEnvApiKey, type ImageContent, resolveCuaRuntimeSpec } from "@onkernel/cua-ai";
+import Kernel from "@onkernel/sdk";
+import { tmpdir } from "node:os";
+import { join } from "node:path";
+import type { Task, TaskResult, TokenTotals } from "./types";
+
+export interface RunTaskOptions {
+  /** Kernel API key. Defaults to `KERNEL_API_KEY`. */
+  kernelApiKey?: string;
+  /** Kernel browser session lifetime in seconds. Defaults to 300. */
+  timeoutSeconds?: number;
+  /** Root directory for jsonl transcripts. Defaults to a temp dir. */
+  sessionsRoot?: string;
+}
+
+/**
+ * Run a single benchmark task on a single model against a fresh Kernel
+ * browser. Returns timing and token totals; `success` is always null here and
+ * `costUsd` is only the sum of costs the provider reported (null if none).
+ *
+ * The browser is deleted even when session or harness setup throws, and
+ * `wallClockMs` stops before that deletion so teardown latency is not
+ * attributed to the model.
+ *
+ * @throws Error when no Kernel API key is available.
+ */
+export async function runTask(
+  modelRef: CuaModelRef,
+  task: Task,
+  options: RunTaskOptions = {},
+): Promise<TaskResult> {
+  const kernelApiKey = options.kernelApiKey ?? process.env.KERNEL_API_KEY;
+  if (!kernelApiKey) throw new Error("KERNEL_API_KEY is required to run a benchmark task");
+
+  const client = new Kernel({ apiKey: kernelApiKey });
+  const browser = await client.browsers.create({
+    stealth: true,
+    timeout_seconds: options.timeoutSeconds && options.timeoutSeconds > 0 ? options.timeoutSeconds : 300,
+  });
+
+  const tokens: TokenTotals = { input: 0, output: 0, total: 0 };
+  let costUsd: number | null = null;
+  let steps = 0;
+  let stopReason = "completed";
+  let finalText = "";
+  let errorMessage: string | undefined;
+  let wallClockMs = 0;
+  try {
+    // Setup happens inside the try so a failure here cannot leak the browser.
+    const cwd = process.cwd();
+    const repo = new JsonlSessionRepo({
+      fs: new NodeExecutionEnv({ cwd }),
+      sessionsRoot: options.sessionsRoot ?? join(tmpdir(), "cua-bench", "sessions"),
+    });
+    const session = await repo.create({ cwd });
+
+    const harness = new CuaAgentHarness({
+      env: new NodeExecutionEnv({ cwd }),
+      session,
+      model: modelRef,
+      browser,
+      client,
+      systemPrompt: ({ model }) => resolveCuaRuntimeSpec(model).defaultSystemPrompt,
+      getApiKeyAndHeaders: async (resolved) => {
+        const apiKey = getCuaEnvApiKey(resolved.provider);
+        return apiKey ? { apiKey } : undefined;
+      },
+    });
+
+    const unsubscribe = harness.subscribe((event: AgentHarnessEvent) => {
+      if (event.type === "turn_start") {
+        steps += 1;
+        return;
+      }
+      if (event.type === "message_end" && event.message.role === "assistant") {
+        const { usage } = event.message;
+        tokens.input += usage.input;
+        tokens.output += usage.output;
+        tokens.total += usage.totalTokens;
+        if (usage.cost.total > 0) costUsd = (costUsd ?? 0) + usage.cost.total;
+      }
+    });
+
+    const startedAt = Date.now();
+    try {
+      const screenshot = await captureScreenshot(client, browser.session_id);
+      const images: ImageContent[] | undefined = screenshot
+        ? [{ type: "image", data: screenshot, mimeType: "image/png" }]
+        : undefined;
+      const assistant = await harness.prompt(task.prompt, images ? { images } : undefined);
+      stopReason = assistant.stopReason;
+      finalText = textOf(assistant.content);
+      if (assistant.stopReason === "error" || assistant.stopReason === "aborted") {
+        errorMessage = assistant.errorMessage ?? `agent stopped with ${assistant.stopReason}`;
+      }
+    } finally {
+      unsubscribe();
+      // Stop the clock before the browser is deleted.
+      wallClockMs = Date.now() - startedAt;
+    }
+  } finally {
+    await client.browsers.deleteByID(browser.session_id).catch(() => {});
+  }
+
+  return {
+    model: modelRef,
+    taskId: task.id,
+    success: null,
+    stopReason,
+    finalText,
+    errorMessage,
+    wallClockMs,
+    steps,
+    tokens,
+    costUsd,
+  };
+}
+
+/** Capture the current browser screenshot as base64; undefined if the capture fails. */
+async function captureScreenshot(client: Kernel, sessionId: string): Promise<string | undefined> {
+  try {
+    const response = await client.browsers.computer.captureScreenshot(sessionId);
+    const arrayBuffer = await response.arrayBuffer();
+    return Buffer.from(arrayBuffer).toString("base64");
+  } catch {
+    return undefined;
+  }
+}
+
+/** Join the text parts of a message content value (string or array of parts) with newlines. */
+function textOf(content: unknown): string {
+  if (typeof content === "string") return content;
+  if (!Array.isArray(content)) return "";
+  const parts: string[] = [];
+  for (const c of content) {
+    if (c && typeof c === "object" && (c as { type?: unknown }).type === "text" && typeof (c as { text?: unknown }).text === "string") {
+      parts.push((c as { text: string }).text);
+    }
+  }
+  return parts.join("\n");
+}
diff --git a/packages/bench/src/spike.ts b/packages/bench/src/spike.ts
new file mode 100644
index 0000000..efd28dd
--- /dev/null
+++ b/packages/bench/src/spike.ts
@@ -0,0 +1,21 @@
+import type { CuaModelRef } from "@onkernel/cua-ai";
+import { runTask } from "./runTask";
+import type { Task } from "./types";
+
+const TASK: Task = {
+  id: "hn-top-story",
+  prompt: "Go to https://news.ycombinator.com and tell me the title of the current top story.",
+};
+
+const MODEL: CuaModelRef = "anthropic:claude-opus-4-6";
+
+async function main(): Promise<void> {
+  console.log(`[bench] running task "${TASK.id}" on ${MODEL}`);
+  const result = await runTask(MODEL, TASK);
+  console.log(JSON.stringify(result, null, 2));
+}
+
+main().catch((err) => {
+  console.error(err);
+  process.exit(1);
+});
diff --git a/packages/bench/src/types.ts b/packages/bench/src/types.ts
new file mode 100644
index 0000000..55c1a59
--- /dev/null
+++
b/packages/bench/src/types.ts @@ -0,0 +1,31 @@ +import type { CuaModelRef } from "@onkernel/cua-ai"; + +/** A single benchmark task to run against a model. */ +export interface Task { + id: string; + prompt: string; +} + +/** Token totals summed across every model call in a run. */ +export interface TokenTotals { + input: number; + output: number; + total: number; +} + +/** Outcome of running one task on one model. */ +export interface TaskResult { + model: CuaModelRef; + taskId: string; + /** null until an accuracy judge scores the run. */ + success: boolean | null; + stopReason: string; + finalText: string; + errorMessage?: string; + wallClockMs: number; + /** Number of agent turns taken. */ + steps: number; + tokens: TokenTotals; + /** null when the provider doesn't report a cost. */ + costUsd: number | null; +} diff --git a/packages/bench/tsconfig.build.json b/packages/bench/tsconfig.build.json new file mode 100644 index 0000000..7802bbd --- /dev/null +++ b/packages/bench/tsconfig.build.json @@ -0,0 +1,13 @@ +{ + "extends": "../../tsconfig.base.json", + "compilerOptions": { + "outDir": "./dist-tsc", + "rootDir": "./src", + "emitDeclarationOnly": true, + "sourceMap": false, + "declarationMap": false + }, + "include": ["src/**/*.ts"], + "exclude": ["node_modules", "dist", "**/*.d.ts", "src/**/*.d.ts"], + "references": [{ "path": "../ai" }, { "path": "../agent" }] +} diff --git a/packages/bench/tsconfig.json b/packages/bench/tsconfig.json new file mode 100644 index 0000000..d8faaf5 --- /dev/null +++ b/packages/bench/tsconfig.json @@ -0,0 +1,3 @@ +{ + "extends": "./tsconfig.build.json" +} diff --git a/tsconfig.json b/tsconfig.json index aad9af1..6166f1e 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -4,6 +4,7 @@ { "path": "./packages/ai" }, { "path": "./packages/agent" }, { "path": "./packages/ptywright" }, - { "path": "./packages/cli" } + { "path": "./packages/cli" }, + { "path": "./packages/bench" } ] } From 6e4941aae4d6c19ad451216a6fc6f2d915426f95 Mon Sep 17 
00:00:00 2001 From: jarugupj <121142710+jarugupj@users.noreply.github.com> Date: Fri, 26 Jun 2026 15:58:11 +0000 Subject: [PATCH 2/4] Add Online-Mind2Web benchmark harness to packages/bench Build the full standard-benchmark pipeline on top of the single-task runner: load the osunlp/Online-Mind2Web tasks, run them across models on Kernel browsers (stealth, fresh profile, 600s, concurrency-capped, resumable), and emit official online-mind2web-v2 trajectories plus a cost/speed sidecar per task. Accuracy is scored by the official WebJudge (via scripts/run-webjudge.sh) rather than a reimplementation; aggregate.ts rolls results into the per-model accuracy/cost/speed table. fetch-tasks.py loads the gated dataset. --- packages/bench/.gitignore | 2 + packages/bench/package.json | 2 + packages/bench/scripts/fetch-tasks.py | 41 +++++++++ packages/bench/scripts/run-webjudge.sh | 36 ++++++++ packages/bench/src/aggregate.ts | 99 +++++++++++++++++++++ packages/bench/src/benchmark.ts | 99 +++++++++++++++++++++ packages/bench/src/browser.ts | 53 +++++++++++ packages/bench/src/index.ts | 22 ++++- packages/bench/src/pool.ts | 16 ++++ packages/bench/src/runOne.ts | 117 +++++++++++++++++++++++++ packages/bench/src/tasks.ts | 18 ++++ packages/bench/src/trajectory.ts | 100 +++++++++++++++++++++ packages/bench/src/types.ts | 49 +++++++++++ 13 files changed, 653 insertions(+), 1 deletion(-) create mode 100644 packages/bench/.gitignore create mode 100755 packages/bench/scripts/fetch-tasks.py create mode 100755 packages/bench/scripts/run-webjudge.sh create mode 100644 packages/bench/src/aggregate.ts create mode 100644 packages/bench/src/benchmark.ts create mode 100644 packages/bench/src/browser.ts create mode 100644 packages/bench/src/pool.ts create mode 100644 packages/bench/src/runOne.ts create mode 100644 packages/bench/src/tasks.ts create mode 100644 packages/bench/src/trajectory.ts diff --git a/packages/bench/.gitignore b/packages/bench/.gitignore new file mode 100644 index 
0000000..ef55106 --- /dev/null +++ b/packages/bench/.gitignore @@ -0,0 +1,2 @@ +results/ +tasks/ diff --git a/packages/bench/package.json b/packages/bench/package.json index a67fb8b..0bcc229 100644 --- a/packages/bench/package.json +++ b/packages/bench/package.json @@ -13,6 +13,8 @@ }, "scripts": { "spike": "NODE_OPTIONS=--conditions=source tsx src/spike.ts", + "bench": "NODE_OPTIONS=--conditions=source tsx src/benchmark.ts", + "aggregate": "NODE_OPTIONS=--conditions=source tsx src/aggregate.ts", "typecheck": "tsc -b" }, "dependencies": { diff --git a/packages/bench/scripts/fetch-tasks.py b/packages/bench/scripts/fetch-tasks.py new file mode 100755 index 0000000..8403649 --- /dev/null +++ b/packages/bench/scripts/fetch-tasks.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python3 +"""Fetch Online-Mind2Web tasks into the JSON the TS harness reads. + +The dataset is gated, so this needs an HF token (HF_TOKEN env, or `huggingface-cli login`). + + pip install datasets + HF_TOKEN=hf_... python scripts/fetch-tasks.py --out tasks/online-mind2web-test.json +""" +import argparse +import json +import os +from pathlib import Path + +from datasets import load_dataset + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("--out", default="tasks/online-mind2web-test.json") + parser.add_argument("--split", default="test") + args = parser.parse_args() + + ds = load_dataset("osunlp/Online-Mind2Web", split=args.split, token=os.environ.get("HF_TOKEN")) + tasks = [ + { + "task_id": row["task_id"], + "website": row["website"], + "confirmed_task": row["confirmed_task"], + "reference_length": int(row["reference_length"]) if row.get("reference_length") is not None else 1, + } + for row in ds + ] + + out = Path(args.out) + out.parent.mkdir(parents=True, exist_ok=True) + out.write_text(json.dumps(tasks, indent=2)) + print(f"wrote {len(tasks)} tasks to {out}") + + +if __name__ == "__main__": + main() diff --git a/packages/bench/scripts/run-webjudge.sh 
b/packages/bench/scripts/run-webjudge.sh
new file mode 100755
index 0000000..f56801f
--- /dev/null
+++ b/packages/bench/scripts/run-webjudge.sh
@@ -0,0 +1,41 @@
+#!/usr/bin/env bash
+# Score benchmark trajectories with the OFFICIAL Online-Mind2Web WebJudge.
+#
+# Clones the upstream OSU-NLP repo and runs its WebJudge over each model's
+# trajectories (which the harness already wrote in the official v2 schema),
+# then normalizes the output to $MODEL_DIR/webjudge.jsonl for the aggregator.
+#
+#   OPENAI_API_KEY=... scripts/run-webjudge.sh results [judge-model] [score-threshold]
+set -euo pipefail
+
+RESULTS_DIR="$(cd "${1:-results}" && pwd)"
+JUDGE_MODEL="${2:-o4-mini}"
+THRESHOLD="${3:-3}"
+: "${OPENAI_API_KEY:?OPENAI_API_KEY is required for WebJudge}"
+
+WORKDIR="$(mktemp -d)"
+trap 'rm -rf "$WORKDIR"' EXIT
+git clone --depth 1 https://github.com/OSU-NLP-Group/Online-Mind2Web "$WORKDIR/om2w"
+pip install -q -r "$WORKDIR/om2w/requirements.txt"
+
+for MODEL_DIR in "$RESULTS_DIR"/*/; do
+  [ -d "$MODEL_DIR" ] || continue
+  MODEL_DIR="${MODEL_DIR%/}"
+  echo "== WebJudge: $MODEL_DIR =="
+  ( cd "$WORKDIR/om2w/src" && python run.py \
+      --mode WebJudge_Online_Mind2Web_eval \
+      --model "$JUDGE_MODEL" \
+      --trajectories_dir "$MODEL_DIR" \
+      --api_key "$OPENAI_API_KEY" \
+      --output_path "$MODEL_DIR" \
+      --score_threshold "$THRESHOLD" )
+  OUT="$MODEL_DIR/WebJudge_Online_Mind2Web_eval_${JUDGE_MODEL}_score_threshold_${THRESHOLD}_auto_eval_results.json"
+  # Surface a missing results file instead of silently leaving the model unscored.
+  if [ -f "$OUT" ]; then
+    cp "$OUT" "$MODEL_DIR/webjudge.jsonl"
+  else
+    echo "warning: no WebJudge output at $OUT; $MODEL_DIR left unscored" >&2
+  fi
+done
+
+echo "WebJudge complete — aggregate with: npm run aggregate"
diff --git a/packages/bench/src/aggregate.ts b/packages/bench/src/aggregate.ts
new file mode 100644
index 0000000..1dcae79
--- /dev/null
+++ b/packages/bench/src/aggregate.ts
@@ -0,0 +1,111 @@
+import { readdir, readFile, writeFile } from "node:fs/promises";
+import { join } from "node:path";
+import type { ModelSummary, TaskMetrics } from "./types";
+
+/**
+ * Roll per-task results into one ModelSummary per model. Cost/speed come from
+ * the `metrics.json` sidecars; accuracy comes from an optional WebJudge output
+ * (`webjudge.jsonl` in each model directory, one `{task_id, predicted_label}`
+ * per line) written by `scripts/run-webjudge.sh`. Accuracy is null until that
+ * file exists and holds at least one judgement.
+ *
+ * Writes `summary.json` into `outDir`, prints a table, and returns the summaries.
+ */
+export async function aggregate(outDir: string): Promise<ModelSummary[]> {
+  const summaries: ModelSummary[] = [];
+  const modelDirs = await readdir(outDir, { withFileTypes: true });
+
+  for (const entry of modelDirs) {
+    if (!entry.isDirectory()) continue;
+    const modelDir = join(outDir, entry.name);
+    const metrics = await readMetrics(modelDir);
+    if (metrics.length === 0) continue;
+
+    const judged = await readJudgements(join(modelDir, "webjudge.jsonl"));
+    const costs = metrics.map((m) => m.costUsd).filter((c): c is number => c !== null);
+    const passed = judged ? metrics.filter((m) => judged.get(m.task_id) === true).length : null;
+
+    summaries.push({
+      model: metrics[0]!.model,
+      tasks: metrics.length,
+      passed,
+      // An empty judgement file would make this 0/0 = NaN (printed as "NaN%"); report null instead.
+      accuracyPct: judged && judged.size > 0 && passed !== null ? round((passed / judged.size) * 100, 1) : null,
+      avgCostUsd: costs.length ? round(sum(costs) / costs.length, 4) : null,
+      avgSpeedSec: round(sum(metrics.map((m) => m.wallClockMs)) / metrics.length / 1000, 1),
+    });
+  }
+
+  await writeFile(join(outDir, "summary.json"), `${JSON.stringify(summaries, null, 2)}\n`);
+  printTable(summaries);
+  return summaries;
+}
+
+/** Read `metrics.json` from every task directory under `modelDir`; directories without a readable one are skipped. */
+async function readMetrics(modelDir: string): Promise<TaskMetrics[]> {
+  const out: TaskMetrics[] = [];
+  for (const entry of await readdir(modelDir, { withFileTypes: true })) {
+    if (!entry.isDirectory()) continue;
+    try {
+      out.push(JSON.parse(await readFile(join(modelDir, entry.name, "metrics.json"), "utf8")));
+    } catch {
+      // task dir without a finished metrics.json — not yet run
+    }
+  }
+  return out;
+}
+
+/**
+ * Parse a JSONL judgement file into a task_id -> pass/fail map, skipping blank lines.
+ * Returns undefined when the file cannot be read; a malformed line throws.
+ */
+async function readJudgements(path: string): Promise<Map<string, boolean> | undefined> {
+  let raw: string;
+  try {
+    raw = await readFile(path, "utf8");
+  } catch {
+    return undefined;
+  }
+  const map = new Map<string, boolean>();
+  for (const line of raw.split("\n")) {
+    if (!line.trim()) continue;
+    const row = JSON.parse(line) as { task_id: string; predicted_label: unknown };
+    map.set(row.task_id, isPass(row.predicted_label));
+  }
+  return map;
+}
+
+/** Interpret a `predicted_label`: 1, true, or "1"/"success"/"yes"/"true" (any case) pass; anything else fails. */
+function isPass(label: unknown): boolean {
+  if (typeof label === "number") return label === 1;
+  if (typeof label === "boolean") return label;
+  if (typeof label === "string") return ["1", "success", "yes", "true"].includes(label.toLowerCase());
+  return false;
+}
+
+/** Print a tab-separated accuracy/cost/speed table, using "—" for null values. */
+function printTable(summaries: ModelSummary[]): void {
+  console.log("\nmodel\taccuracy\tcost/task\tspeed");
+  for (const s of summaries) {
+    const acc = s.accuracyPct === null ? "—" : `${s.accuracyPct}%`;
+    const cost = s.avgCostUsd === null ? "—" : `$${s.avgCostUsd}`;
+    console.log(`${s.model}\t${acc}\t${cost}\t${s.avgSpeedSec}s`);
+  }
+}
+
+function sum(xs: number[]): number {
+  return xs.reduce((a, b) => a + b, 0);
+}
+
+/** Round `x` to `places` decimal places. */
+function round(x: number, places: number): number {
+  const f = 10 ** places;
+  return Math.round(x * f) / f;
+}
+
+if (process.argv[1]?.endsWith("aggregate.ts")) {
+  aggregate(process.argv[2] ?? "results").catch((err) => {
+    console.error(err);
+    process.exit(1);
+  });
+}
diff --git a/packages/bench/src/benchmark.ts b/packages/bench/src/benchmark.ts
new file mode 100644
index 0000000..3d5b054
--- /dev/null
+++ b/packages/bench/src/benchmark.ts
@@ -0,0 +1,125 @@
+import type { CuaModelRef } from "@onkernel/cua-ai";
+import { access } from "node:fs/promises";
+import { join } from "node:path";
+import { createKernelClient, DEFAULT_BROWSER_SETTINGS } from "./browser";
+import { runPool } from "./pool";
+import { modelSlug, runOne } from "./runOne";
+import { loadTasks } from "./tasks";
+
+const DEFAULT_MODELS: CuaModelRef[] = [
+  "anthropic:claude-opus-4-6",
+  "openai:gpt-5.5",
+  "google:gemini-3-flash-preview",
+];
+
+interface Options {
+  tasksPath: string;
+  outDir: string;
+  limit?: number;
+  concurrency: number;
+  models: CuaModelRef[];
+}
+
+/**
+ * Parse CLI flags (`--tasks`, `--out`, `--limit`, `--concurrency`, `--models`)
+ * on top of the defaults. Unknown arguments are ignored.
+ *
+ * @throws Error when a flag has no value, or when `--limit` / `--concurrency`
+ *   is not a usable integer. `Number()` would otherwise turn such input into 0
+ *   or NaN, and the run could silently process no tasks.
+ */
+function parseArgs(argv: string[]): Options {
+  const opts: Options = {
+    tasksPath: "tasks/online-mind2web-test.json",
+    outDir: "results",
+    concurrency: 5,
+    models: DEFAULT_MODELS,
+  };
+  for (let i = 0; i < argv.length; i++) {
+    const flag = argv[i];
+    const value = (): string => {
+      const next = argv[++i];
+      if (next === undefined || next === "") throw new Error(`${flag} requires a value`);
+      return next;
+    };
+    const integer = (min: number): number => {
+      const n = Number(value());
+      if (!Number.isInteger(n) || n < min) throw new Error(`${flag} must be an integer >= ${min}`);
+      return n;
+    };
+    switch (flag) {
+      case "--tasks":
+        opts.tasksPath = value();
+        break;
+      case "--out":
+        opts.outDir = value();
+        break;
+      case "--limit":
+        opts.limit = integer(0);
+        break;
+      case "--concurrency":
+        opts.concurrency = integer(1);
+        break;
+      case "--models": {
+        const models = value().split(",").map((s) => s.trim()).filter((s) => s.length > 0);
+        if (models.length === 0) throw new Error("--models requires at least one model ref");
+        opts.models = models as CuaModelRef[];
+        break;
+      }
+    }
+  }
+  return opts;
+}
+
+/** Resolve true when `path` is accessible, false otherwise. */
+async function exists(path: string): Promise<boolean> {
+  try {
+    await access(path);
+    return true;
+  } catch {
+    return false;
+  }
+}
+
+/**
+ * Run every task on every model, skipping tasks whose `result.json` already
+ * exists so an interrupted run can be resumed.
+ */
+async function main(): Promise<void> {
+  const opts = parseArgs(process.argv.slice(2));
+  const client = createKernelClient();
+  const tasks = await loadTasks(opts.tasksPath, opts.limit);
+  console.log(`[bench] ${tasks.length} tasks × ${opts.models.length} models, concurrency ${opts.concurrency}`);
+
+  for (const model of opts.models) {
+    const slug = modelSlug(model);
+    console.log(`[bench] === ${model} ===`);
+    let done = 0;
+    let failed = 0;
+    let skipped = 0;
+    await runPool(tasks, opts.concurrency, async (task) => {
+      const taskDir = join(opts.outDir, slug, task.task_id);
+      if (await exists(join(taskDir, "result.json"))) {
+        skipped++;
+        return;
+      }
+      try {
+        const m = await runOne(client, model, task, DEFAULT_BROWSER_SETTINGS, taskDir);
+        done++;
+        console.log(`[bench] ${slug} ${task.task_id} ok steps=${m.steps} ${(m.wallClockMs / 1000).toFixed(1)}s`);
+      } catch (err) {
+        failed++;
+        console.error(`[bench] ${slug} ${task.task_id} FAILED: ${err instanceof Error ? err.message : String(err)}`);
+      }
+    });
+    console.log(`[bench] ${slug}: done=${done} skipped=${skipped} failed=${failed}`);
+  }
+
+  console.log(`[bench] complete — results in ${opts.outDir}/`);
+  console.log("[bench] next: score with scripts/run-webjudge.sh, then aggregate with src/aggregate.ts");
+}
+
+main().catch((err) => {
+  console.error(err);
+  process.exit(1);
+});
diff --git a/packages/bench/src/browser.ts b/packages/bench/src/browser.ts
new file mode 100644
index 0000000..01f5b18
--- /dev/null
+++
b/packages/bench/src/browser.ts
@@ -0,0 +1,56 @@
+import type { KernelBrowser } from "@onkernel/cua-agent";
+import Kernel from "@onkernel/sdk";
+
+/** Browser configuration held constant across every model so the only variable is the model. */
+export interface BrowserSettings {
+  stealth: boolean;
+  viewport: { width: number; height: number };
+  timeoutSeconds: number;
+}
+
+/** Benchmark defaults: stealth on, fresh unauthenticated profile, generous timeout. */
+export const DEFAULT_BROWSER_SETTINGS: BrowserSettings = {
+  stealth: true,
+  viewport: { width: 1280, height: 800 },
+  timeoutSeconds: 600,
+};
+
+/** A provisioned browser and the client that created it; `close` deletes the browser and ignores deletion errors. */
+export interface BrowserHandle {
+  client: Kernel;
+  browser: KernelBrowser;
+  close(): Promise<void>;
+}
+
+/** Build a Kernel client from `apiKey`, falling back to `KERNEL_API_KEY`; throws when neither is set. */
+export function createKernelClient(apiKey?: string): Kernel {
+  const key = apiKey ?? process.env.KERNEL_API_KEY;
+  if (!key) throw new Error("KERNEL_API_KEY is required");
+  return new Kernel({ apiKey: key });
+}
+
+/** Provision a fresh Kernel browser under the given settings. */
+export async function provisionBrowser(client: Kernel, settings: BrowserSettings): Promise<BrowserHandle> {
+  const browser = await client.browsers.create({
+    stealth: settings.stealth,
+    viewport: settings.viewport,
+    timeout_seconds: settings.timeoutSeconds,
+  });
+  return {
+    client,
+    browser,
+    close: async () => {
+      await client.browsers.deleteByID(browser.session_id).catch(() => {});
+    },
+  };
+}
+
+/** Capture a screenshot of the session as raw bytes; undefined if the capture fails. */
+export async function captureScreenshot(client: Kernel, sessionId: string): Promise<Buffer | undefined> {
+  try {
+    const response = await client.browsers.computer.captureScreenshot(sessionId);
+    return Buffer.from(await response.arrayBuffer());
+  } catch {
+    return undefined;
+  }
+}
diff --git a/packages/bench/src/index.ts b/packages/bench/src/index.ts
index 1b7489e..da7f27d 100644
--- a/packages/bench/src/index.ts
+++ b/packages/bench/src/index.ts
@@ -1,2 +1,22 @@
 export { runTask, type RunTaskOptions } from "./runTask";
-export type { Task, TaskResult, TokenTotals } from "./types";
+export { runOne, modelSlug
} from "./runOne";
+export { loadTasks } from "./tasks";
+export { aggregate } from "./aggregate";
+export { runPool } from "./pool";
+export { recordTrajectory } from "./trajectory";
+export {
+  type BrowserSettings,
+  DEFAULT_BROWSER_SETTINGS,
+  provisionBrowser,
+  createKernelClient,
+} from "./browser";
+export type {
+  ActionStep,
+  ModelSummary,
+  Om2wResult,
+  Om2wTask,
+  Task,
+  TaskMetrics,
+  TaskResult,
+  TokenTotals,
+} from "./types";
diff --git a/packages/bench/src/pool.ts b/packages/bench/src/pool.ts
new file mode 100644
index 0000000..89aa66c
--- /dev/null
+++ b/packages/bench/src/pool.ts
@@ -0,0 +1,25 @@
+/**
+ * Run `worker` over `items` with at most `concurrency` in flight at once.
+ *
+ * `concurrency` is floored and clamped to at least 1; a NaN value (e.g. from
+ * `Number("abc")` on a CLI flag) falls back to 1 instead of silently starting
+ * zero lanes and skipping every item. Resolves once every lane has drained;
+ * rejects with the first error a worker throws.
+ */
+export async function runPool<T>(
+  items: T[],
+  concurrency: number,
+  worker: (item: T, index: number) => Promise<void>,
+): Promise<void> {
+  const width = Number.isNaN(concurrency) ? 1 : Math.max(1, Math.floor(concurrency));
+  let next = 0;
+  const lanes = Array.from({ length: Math.min(width, items.length) }, async () => {
+    while (true) {
+      // The read-and-increment runs synchronously between awaits, so lanes never claim the same index.
+      const index = next++;
+      if (index >= items.length) return;
+      await worker(items[index]!, index);
+    }
+  });
+  await Promise.all(lanes);
+}
diff --git a/packages/bench/src/runOne.ts b/packages/bench/src/runOne.ts
new file mode 100644
index 0000000..8a58480
--- /dev/null
+++ b/packages/bench/src/runOne.ts
@@ -0,0 +1,137 @@
+import { CuaAgentHarness, JsonlSessionRepo, NodeExecutionEnv } from "@onkernel/cua-agent";
+import { type CuaModelRef, getCuaEnvApiKey, type ImageContent, resolveCuaRuntimeSpec } from "@onkernel/cua-ai";
+import type Kernel from "@onkernel/sdk";
+import { mkdir, writeFile } from "node:fs/promises";
+import { tmpdir } from "node:os";
+import { join } from "node:path";
+import { type BrowserSettings, captureScreenshot, provisionBrowser } from "./browser";
+import { recordTrajectory, type TrajectoryRecording } from "./trajectory";
+import type { ActionStep, Om2wResult, Om2wTask, TaskMetrics } from "./types";
+
+/** Filesystem-safe slug for a provider-qualified model ref like `anthropic:claude-opus-4-6`. */
+export function modelSlug(model: CuaModelRef): string {
+  return model.replace(/[^a-zA-Z0-9._-]/g, "-");
+}
+
+/** Build the agent prompt, prefixing "Go to {website} and" unless the task text already names the site. */
+function buildPrompt(task: Om2wTask): string {
+  if (task.website && !task.confirmed_task.toLowerCase().includes(task.website.toLowerCase())) {
+    return `Go to ${task.website} and ${task.confirmed_task}`;
+  }
+  return task.confirmed_task;
+}
+
+/**
+ * Run one Online-Mind2Web task on one model against a fresh Kernel browser and
+ * write the official v2 trajectory (`result.json` + `trajectory/`) plus a
+ * `metrics.json` cost/speed sidecar into `taskDir`.
+ *
+ * The browser is released even when session or harness setup throws, and
+ * `wallClockMs` covers the agent run only — browser provisioning and teardown
+ * are excluded. If the prompt itself throws, the error propagates and no
+ * artifacts are written.
+ */
+export async function runOne(
+  client: Kernel,
+  model: CuaModelRef,
+  task: Om2wTask,
+  settings: BrowserSettings,
+  taskDir: string,
+): Promise<TaskMetrics> {
+  const handle = await provisionBrowser(client, settings);
+  let recording: TrajectoryRecording;
+  let wallClockMs = 0;
+  let stopReason = "completed";
+  let errorMessage: string | undefined;
+  try {
+    // Everything after provisioning lives inside this try so a setup failure cannot leak the browser.
+    const cwd = process.cwd();
+    const repo = new JsonlSessionRepo({
+      fs: new NodeExecutionEnv({ cwd }),
+      sessionsRoot: join(tmpdir(), "cua-bench", "sessions"),
+    });
+    const session = await repo.create({ cwd });
+
+    const harness = new CuaAgentHarness({
+      env: new NodeExecutionEnv({ cwd }),
+      session,
+      model,
+      browser: handle.browser,
+      client,
+      systemPrompt: ({ model: active }) => resolveCuaRuntimeSpec(active).defaultSystemPrompt,
+      getApiKeyAndHeaders: async (resolved) => {
+        const apiKey = getCuaEnvApiKey(resolved.provider);
+        return apiKey ? { apiKey } : undefined;
+      },
+    });
+
+    const trajectory = recordTrajectory(harness);
+    recording = trajectory.recording;
+    const startedAt = Date.now();
+    try {
+      const shot = await captureScreenshot(handle.client, handle.browser.session_id);
+      const images: ImageContent[] | undefined = shot
+        ? [{ type: "image", data: shot.toString("base64"), mimeType: "image/png" }]
+        : undefined;
+      const assistant = await harness.prompt(buildPrompt(task), images ? { images } : undefined);
+      stopReason = assistant.stopReason;
+      if (assistant.stopReason === "error" || assistant.stopReason === "aborted") {
+        errorMessage = assistant.errorMessage ?? `agent stopped with ${assistant.stopReason}`;
+      }
+    } finally {
+      trajectory.stop();
+      // Stop the clock before teardown so browser deletion latency does not count against the model.
+      wallClockMs = Date.now() - startedAt;
+    }
+  } finally {
+    await handle.close();
+  }
+
+  const metrics: TaskMetrics = {
+    task_id: task.task_id,
+    model,
+    wallClockMs,
+    steps: recording.turns,
+    tokens: recording.tokens,
+    costUsd: recording.costUsd,
+    stopReason,
+    errorMessage,
+  };
+  await writeArtifacts(taskDir, task, recording, metrics);
+  return metrics;
+}
+
+/**
+ * Persist one run: screenshots under `trajectory/`, then `metrics.json`, then
+ * `result.json`. `result.json` goes last because `benchmark.ts` treats its
+ * presence as "task finished" when resuming; writing it first could leave a
+ * task permanently skipped with no `metrics.json` for the aggregator.
+ */
+async function writeArtifacts(
+  taskDir: string,
+  task: Om2wTask,
+  recording: TrajectoryRecording,
+  metrics: TaskMetrics,
+): Promise<void> {
+  const trajectoryDir = join(taskDir, "trajectory");
+  await mkdir(trajectoryDir, { recursive: true });
+
+  const action_history: ActionStep[] = [];
+  for (let i = 0; i < recording.steps.length; i++) {
+    const step = recording.steps[i]!;
+    const screenshot = `${String(i).padStart(4, "0")}.png`;
+    await writeFile(join(trajectoryDir, screenshot), step.screenshot);
+    action_history.push({ step: i, screenshot, action: step.action, thought: step.thought, url: null });
+  }
+
+  const result: Om2wResult = {
+    schema_version: "online-mind2web-v2",
+    task: task.confirmed_task,
+    task_id: task.task_id,
+    agent_final_answer: recording.finalAnswer,
+    reference_length: task.reference_length,
+    action_history,
+  };
+  await writeFile(join(taskDir, "metrics.json"), `${JSON.stringify(metrics, null, 2)}\n`);
+  await writeFile(join(taskDir, "result.json"), `${JSON.stringify(result, null, 2)}\n`);
+}
diff --git a/packages/bench/src/tasks.ts b/packages/bench/src/tasks.ts
new file mode 100644
index 0000000..4a5c87a
--- /dev/null
+++ b/packages/bench/src/tasks.ts
@@ -0,0 +1,18 @@
+import { readFile } from "node:fs/promises";
+import type { Om2wTask } from "./types";
+
+/**
+ * Load Online-Mind2Web tasks from a local JSON file produced by
+ *
`scripts/fetch-tasks.py` (the dataset is gated, so it's fetched with the
+ * official `datasets` loader rather than over HTTP).
+ */
+export async function loadTasks(path: string, limit?: number): Promise<Om2wTask[]> {
+  let raw: string;
+  try {
+    raw = await readFile(path, "utf8");
+  } catch {
+    throw new Error(`task file not found at ${path} — generate it with: python scripts/fetch-tasks.py --out ${path}`);
+  }
+  const tasks = JSON.parse(raw) as Om2wTask[];
+  return typeof limit === "number" ? tasks.slice(0, limit) : tasks;
+}
diff --git a/packages/bench/src/trajectory.ts b/packages/bench/src/trajectory.ts
new file mode 100644
index 0000000..4acf128
--- /dev/null
+++ b/packages/bench/src/trajectory.ts
@@ -0,0 +1,104 @@
+import type { AgentHarnessEvent, CuaAgentHarness } from "@onkernel/cua-agent";
+import type { TokenTotals } from "./types";
+
+/** A trajectory step before it's assigned a screenshot filename. */
+export interface RecordedStep {
+  action: string;
+  thought: string | null;
+  screenshot: Buffer;
+}
+
+/** Everything accumulated for one run; mutated in place while the harness is subscribed. */
+export interface TrajectoryRecording {
+  steps: RecordedStep[];
+  finalAnswer: string | null;
+  tokens: TokenTotals;
+  costUsd: number | null;
+  turns: number;
+}
+
+/**
+ * Subscribe to a running harness and accumulate the data WebJudge needs:
+ * one step per computer action that produced a screenshot, the agent's final
+ * answer, and summed token/cost usage. Returns the live recording plus an
+ * unsubscribe handle.
+ */
+export function recordTrajectory(harness: CuaAgentHarness): {
+  recording: TrajectoryRecording;
+  stop: () => void;
+} {
+  const recording: TrajectoryRecording = {
+    steps: [],
+    finalAnswer: null,
+    tokens: { input: 0, output: 0, total: 0 },
+    costUsd: null,
+    turns: 0,
+  };
+  const pendingActions = new Map<string, string>();
+  let currentThought: string | null = null;
+
+  const stop = harness.subscribe((event: AgentHarnessEvent) => {
+    switch (event.type) {
+      case "turn_start":
+        recording.turns += 1;
+        return;
+      case "message_end": {
+        if (event.message.role !== "assistant") return;
+        const text = textOf(event.message.content);
+        if (text) {
+          currentThought = text;
+          recording.finalAnswer = text;
+        }
+        const { usage } = event.message;
+        recording.tokens.input += usage.input;
+        recording.tokens.output += usage.output;
+        recording.tokens.total += usage.totalTokens;
+        if (usage.cost.total > 0) recording.costUsd = (recording.costUsd ?? 0) + usage.cost.total;
+        return;
+      }
+      case "tool_execution_start":
+        pendingActions.set(event.toolCallId, formatAction(event.toolName, event.args));
+        return;
+      case "tool_execution_end": {
+        const action = pendingActions.get(event.toolCallId) ?? event.toolName;
+        pendingActions.delete(event.toolCallId);
+        const screenshot = screenshotOf(event.result);
+        if (screenshot) recording.steps.push({ action, thought: currentThought, screenshot });
+        return;
+      }
+      default:
+        return;
+    }
+  });
+
+  return { recording, stop };
+}
+
+/** Render a tool call as "name args": object args are JSON-encoded, other values stringified, empty args omitted. */
+function formatAction(toolName: string, args: unknown): string {
+  const rendered = args && typeof args === "object" ? JSON.stringify(args) : String(args ?? "");
+  return rendered ? `${toolName} ${rendered}` : toolName;
+}
+
+/** Decode the first base64 image part found in a tool result's `content`, if any. */
+function screenshotOf(result: unknown): Buffer | undefined {
+  const content = (result as { content?: Array<{ type?: string; data?: string }> } | undefined)?.content;
+  if (!content) return undefined;
+  for (const c of content) {
+    if (c?.type === "image" && typeof c.data === "string") return Buffer.from(c.data, "base64");
+  }
+  return undefined;
+}
+
+/** Join the text parts of a message content value (string or array of parts) with newlines. */
+function textOf(content: unknown): string {
+  if (typeof content === "string") return content;
+  if (!Array.isArray(content)) return "";
+  const parts: string[] = [];
+  for (const c of content) {
+    if (c && typeof c === "object" && (c as { type?: unknown }).type === "text" && typeof (c as { text?: unknown }).text === "string") {
+      parts.push((c as { text: string }).text);
+    }
+  }
+  return parts.join("\n");
+}
diff --git a/packages/bench/src/types.ts b/packages/bench/src/types.ts
index 55c1a59..e7b6070 100644
--- a/packages/bench/src/types.ts
+++ b/packages/bench/src/types.ts
@@ -29,3 +29,52 @@ export interface TaskResult {
   /** null when the provider doesn't report a cost. */
   costUsd: number | null;
 }
+
+/** A task from the osunlp/Online-Mind2Web dataset. */
+export interface Om2wTask {
+  task_id: string;
+  website: string;
+  confirmed_task: string;
+  reference_length: number;
+}
+
+/** One step of an Online-Mind2Web v2 trajectory. */
+export interface ActionStep {
+  step: number;
+  screenshot: string;
+  action: string;
+  thought: string | null;
+  url: string | null;
+}
+
+/** A result.json conforming to the official `online-mind2web-v2` submission schema. */
+export interface Om2wResult {
+  schema_version: "online-mind2web-v2";
+  task: string;
+  task_id: string;
+  agent_final_answer: string | null;
+  reference_length: number;
+  action_history: ActionStep[];
+}
+
+/** Per-run cost/speed sidecar, kept out of result.json so the latter stays schema-pure.
*/ +export interface TaskMetrics { + task_id: string; + model: CuaModelRef; + wallClockMs: number; + steps: number; + tokens: TokenTotals; + costUsd: number | null; + stopReason: string; + errorMessage?: string; +} + +/** Aggregated accuracy/cost/speed for one model — the numbers that fill the page. */ +export interface ModelSummary { + model: CuaModelRef; + tasks: number; + passed: number | null; + accuracyPct: number | null; + avgCostUsd: number | null; + avgSpeedSec: number; +} From aa031046905d508fc17323554c556fcda5a4ee78 Mon Sep 17 00:00:00 2001 From: jarugupj <121142710+jarugupj@users.noreply.github.com> Date: Fri, 26 Jun 2026 17:32:20 +0000 Subject: [PATCH 3/4] Deslop: drop the superseded single-task spike runOne + the benchmark CLI fully cover the spike's single-task path, so remove runTask.ts/spike.ts and their now-dead Task/TaskResult types. Eliminates the duplicated screenshot helper, textOf, browser provisioning, and usage accumulation that the spike carried alongside the harness. 
--- packages/bench/package.json | 1 - packages/bench/src/index.ts | 3 - packages/bench/src/runTask.ts | 128 ---------------------------------- packages/bench/src/spike.ts | 21 ------ packages/bench/src/types.ts | 23 ------ 5 files changed, 176 deletions(-) delete mode 100644 packages/bench/src/runTask.ts delete mode 100644 packages/bench/src/spike.ts diff --git a/packages/bench/package.json b/packages/bench/package.json index 0bcc229..bda0753 100644 --- a/packages/bench/package.json +++ b/packages/bench/package.json @@ -12,7 +12,6 @@ } }, "scripts": { - "spike": "NODE_OPTIONS=--conditions=source tsx src/spike.ts", "bench": "NODE_OPTIONS=--conditions=source tsx src/benchmark.ts", "aggregate": "NODE_OPTIONS=--conditions=source tsx src/aggregate.ts", "typecheck": "tsc -b" diff --git a/packages/bench/src/index.ts b/packages/bench/src/index.ts index da7f27d..e214a07 100644 --- a/packages/bench/src/index.ts +++ b/packages/bench/src/index.ts @@ -1,4 +1,3 @@ -export { runTask, type RunTaskOptions } from "./runTask"; export { runOne, modelSlug } from "./runOne"; export { loadTasks } from "./tasks"; export { aggregate } from "./aggregate"; @@ -15,8 +14,6 @@ export type { ModelSummary, Om2wResult, Om2wTask, - Task, TaskMetrics, - TaskResult, TokenTotals, } from "./types"; diff --git a/packages/bench/src/runTask.ts b/packages/bench/src/runTask.ts deleted file mode 100644 index 32356e2..0000000 --- a/packages/bench/src/runTask.ts +++ /dev/null @@ -1,128 +0,0 @@ -import { type AgentHarnessEvent, CuaAgentHarness, JsonlSessionRepo, NodeExecutionEnv } from "@onkernel/cua-agent"; -import { type CuaModelRef, getCuaEnvApiKey, type ImageContent, resolveCuaRuntimeSpec } from "@onkernel/cua-ai"; -import Kernel from "@onkernel/sdk"; -import { tmpdir } from "node:os"; -import { join } from "node:path"; -import type { Task, TaskResult, TokenTotals } from "./types"; - -export interface RunTaskOptions { - /** Kernel API key. Defaults to `KERNEL_API_KEY`. 
*/ - kernelApiKey?: string; - /** Kernel browser session lifetime in seconds. Defaults to 300. */ - timeoutSeconds?: number; - /** Root directory for jsonl transcripts. Defaults to a temp dir. */ - sessionsRoot?: string; -} - -/** - * Run a single benchmark task on a single model against a fresh Kernel - * browser. Returns timing and token totals; `success` and `costUsd` are - * not scored here. - */ -export async function runTask( - modelRef: CuaModelRef, - task: Task, - options: RunTaskOptions = {}, -): Promise { - const kernelApiKey = options.kernelApiKey ?? process.env.KERNEL_API_KEY; - if (!kernelApiKey) throw new Error("KERNEL_API_KEY is required to run a benchmark task"); - - const client = new Kernel({ apiKey: kernelApiKey }); - const browser = await client.browsers.create({ - stealth: true, - timeout_seconds: options.timeoutSeconds && options.timeoutSeconds > 0 ? options.timeoutSeconds : 300, - }); - - const cwd = process.cwd(); - const repo = new JsonlSessionRepo({ - fs: new NodeExecutionEnv({ cwd }), - sessionsRoot: options.sessionsRoot ?? join(tmpdir(), "cua-bench", "sessions"), - }); - const session = await repo.create({ cwd }); - - const tokens: TokenTotals = { input: 0, output: 0, total: 0 }; - let costUsd: number | null = null; - let steps = 0; - - const harness = new CuaAgentHarness({ - env: new NodeExecutionEnv({ cwd }), - session, - model: modelRef, - browser, - client, - systemPrompt: ({ model }) => resolveCuaRuntimeSpec(model).defaultSystemPrompt, - getApiKeyAndHeaders: async (resolved) => { - const apiKey = getCuaEnvApiKey(resolved.provider); - return apiKey ? 
{ apiKey } : undefined; - }, - }); - - const unsubscribe = harness.subscribe((event: AgentHarnessEvent) => { - if (event.type === "turn_start") { - steps += 1; - return; - } - if (event.type === "message_end" && event.message.role === "assistant") { - const { usage } = event.message; - tokens.input += usage.input; - tokens.output += usage.output; - tokens.total += usage.totalTokens; - if (usage.cost.total > 0) costUsd = (costUsd ?? 0) + usage.cost.total; - } - }); - - const startedAt = Date.now(); - let stopReason = "completed"; - let finalText = ""; - let errorMessage: string | undefined; - try { - const screenshot = await captureScreenshot(client, browser.session_id); - const images: ImageContent[] | undefined = screenshot - ? [{ type: "image", data: screenshot, mimeType: "image/png" }] - : undefined; - const assistant = await harness.prompt(task.prompt, images ? { images } : undefined); - stopReason = assistant.stopReason; - finalText = textOf(assistant.content); - if (assistant.stopReason === "error" || assistant.stopReason === "aborted") { - errorMessage = assistant.errorMessage ?? 
`agent stopped with ${assistant.stopReason}`; - } - } finally { - unsubscribe(); - await client.browsers.deleteByID(browser.session_id).catch(() => {}); - } - - return { - model: modelRef, - taskId: task.id, - success: null, - stopReason, - finalText, - errorMessage, - wallClockMs: Date.now() - startedAt, - steps, - tokens, - costUsd, - }; -} - -async function captureScreenshot(client: Kernel, sessionId: string): Promise { - try { - const response = await client.browsers.computer.captureScreenshot(sessionId); - const arrayBuffer = await response.arrayBuffer(); - return Buffer.from(arrayBuffer).toString("base64"); - } catch { - return undefined; - } -} - -function textOf(content: unknown): string { - if (typeof content === "string") return content; - if (!Array.isArray(content)) return ""; - const parts: string[] = []; - for (const c of content) { - if (c && typeof c === "object" && (c as { type?: unknown }).type === "text" && typeof (c as { text?: unknown }).text === "string") { - parts.push((c as { text: string }).text); - } - } - return parts.join("\n"); -} diff --git a/packages/bench/src/spike.ts b/packages/bench/src/spike.ts deleted file mode 100644 index efd28dd..0000000 --- a/packages/bench/src/spike.ts +++ /dev/null @@ -1,21 +0,0 @@ -import type { CuaModelRef } from "@onkernel/cua-ai"; -import { runTask } from "./runTask"; -import type { Task } from "./types"; - -const TASK: Task = { - id: "hn-top-story", - prompt: "Go to https://news.ycombinator.com and tell me the title of the current top story.", -}; - -const MODEL: CuaModelRef = "anthropic:claude-opus-4-6"; - -async function main(): Promise { - console.log(`[bench] running task "${TASK.id}" on ${MODEL}`); - const result = await runTask(MODEL, TASK); - console.log(JSON.stringify(result, null, 2)); -} - -main().catch((err) => { - console.error(err); - process.exit(1); -}); diff --git a/packages/bench/src/types.ts b/packages/bench/src/types.ts index e7b6070..c711463 100644 --- a/packages/bench/src/types.ts 
+++ b/packages/bench/src/types.ts @@ -1,11 +1,5 @@ import type { CuaModelRef } from "@onkernel/cua-ai"; -/** A single benchmark task to run against a model. */ -export interface Task { - id: string; - prompt: string; -} - /** Token totals summed across every model call in a run. */ export interface TokenTotals { input: number; @@ -13,23 +7,6 @@ export interface TokenTotals { total: number; } -/** Outcome of running one task on one model. */ -export interface TaskResult { - model: CuaModelRef; - taskId: string; - /** null until an accuracy judge scores the run. */ - success: boolean | null; - stopReason: string; - finalText: string; - errorMessage?: string; - wallClockMs: number; - /** Number of agent turns taken. */ - steps: number; - tokens: TokenTotals; - /** null when the provider doesn't report a cost. */ - costUsd: number | null; -} - /** A task from the osunlp/Online-Mind2Web dataset. */ export interface Om2wTask { task_id: string; From 7934b6d93edf394a8f92f439403366c7351c61eb Mon Sep 17 00:00:00 2001 From: jarugupj <121142710+jarugupj@users.noreply.github.com> Date: Fri, 26 Jun 2026 21:08:52 +0000 Subject: [PATCH 4/4] Don't persist result.json for failed bench runs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Errored/aborted runs were writing a result.json, which the resumable logic treats as a completed task and permanently skips on retry — baking empty trajectories into the WebJudge scored set. Throw on error/aborted so the run stays retryable and is recorded as failed. 
Co-Authored-By: Claude Opus 4.7 --- packages/bench/src/runOne.ts | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/packages/bench/src/runOne.ts b/packages/bench/src/runOne.ts index 8a58480..7631805 100644 --- a/packages/bench/src/runOne.ts +++ b/packages/bench/src/runOne.ts @@ -72,6 +72,13 @@ export async function runOne( await handle.close(); } + if (stopReason === "error" || stopReason === "aborted") { + // Throw instead of persisting: result.json is the resume sentinel, so writing + // one for a failed run would bake an empty trajectory into the scored set and + // permanently skip the retry. + throw new Error(errorMessage ?? `agent stopped with ${stopReason}`); + } + const wallClockMs = Date.now() - startedAt; const metrics: TaskMetrics = { task_id: task.task_id,