kernel · jarugupj · Jun 25, 2026 · Jun 25, 2026 · Jun 25, 2026 · Jun 25, 2026
diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -8,7 +8,8 @@
     "packages/ai",
     "packages/agent",
     "packages/ptywright",
-    "packages/cli"
+    "packages/cli",
+    "packages/bench"
   ],
   "scripts": {
     "build": "npm run build --workspace @onkernel/cua-ai && npm run build --workspace @onkernel/cua-agent && tsc -b && npm run build --workspace @onkernel/cua-cli && npm run build:native --workspace @onkernel/ptywright --if-present",

diff --git a/packages/bench/package.json b/packages/bench/package.json
@@ -0,0 +1,26 @@
+{
+  "name": "@onkernel/cua-bench",
+  "version": "0.0.0",
+  "description": "Benchmark runner for CUA models on Kernel cloud browsers",
+  "license": "MIT",
+  "type": "module",
+  "private": true,
+  "exports": {
+    ".": {
+      "types": "./dist-tsc/index.d.ts",
+      "source": "./src/index.ts"
+    }
+  },
+  "scripts": {
+    "spike": "NODE_OPTIONS=--conditions=source tsx src/spike.ts",
+    "typecheck": "tsc -b"
+  },
+  "dependencies": {
+    "@onkernel/cua-agent": "*",
+    "@onkernel/cua-ai": "*",
+    "@onkernel/sdk": "0.49.0"
+  },
+  "devDependencies": {
+    "tsx": "^4.21.0"
+  }
+}
diff --git a/packages/bench/src/index.ts b/packages/bench/src/index.ts
@@ -0,0 +1,2 @@
+export { runTask, type RunTaskOptions } from "./runTask";
+export type { Task, TaskResult, TokenTotals } from "./types";
diff --git a/packages/bench/src/runTask.ts b/packages/bench/src/runTask.ts
@@ -0,0 +1,130 @@
+import { type AgentHarnessEvent, CuaAgentHarness, JsonlSessionRepo, NodeExecutionEnv } from "@onkernel/cua-agent";
+import { type CuaModelRef, getCuaEnvApiKey, type ImageContent, resolveCuaRuntimeSpec } from "@onkernel/cua-ai";
+import Kernel from "@onkernel/sdk";
+import { tmpdir } from "node:os";
+import { join } from "node:path";
+import type { Task, TaskResult, TokenTotals } from "./types";
+
+/** Optional overrides accepted by `runTask`; every field may be omitted. */
+export interface RunTaskOptions {
+	/** Kernel API key. Defaults to the `KERNEL_API_KEY` environment variable; `runTask` throws when neither is set. */
+	kernelApiKey?: string;
+	/** Kernel browser session lifetime in seconds. Defaults to 300; zero or negative values also fall back to 300. */
+	timeoutSeconds?: number;
+	/** Root directory for jsonl transcripts. Defaults to `cua-bench/sessions` under the OS temp dir. */
+	sessionsRoot?: string;
+}
+
+/**
+ * Run a single benchmark task on a single model against a fresh Kernel
+ * browser, returning timing, step and token totals.
+ *
+ * `success` is always `null` here — the accuracy judge lands in a later step.
+ * `costUsd` is the sum of the positive per-turn costs found in assistant
+ * message usage and stays `null` when no turn reported one; the price-table
+ * conversion lands in a later step.
+ *
+ * Once the Kernel browser has been created it is deleted on every exit path,
+ * including failures while setting up the session repo or the harness.
+ *
+ * @param modelRef - Model the harness is driven with; echoed back as `model`.
+ * @param task - Task whose `prompt` is sent to the agent; `id` is echoed back as `taskId`.
+ * @param options - Optional overrides for the API key, browser lifetime and transcript root.
+ * @returns Outcome of the run. `wallClockMs` spans the initial screenshot and
+ *   the agent prompt, and excludes browser teardown.
+ * @throws Error when no Kernel API key is available. Rejections from browser
+ *   creation, session setup or `harness.prompt` propagate to the caller.
+ */
+export async function runTask(
+	modelRef: CuaModelRef,
+	task: Task,
+	options: RunTaskOptions = {},
+): Promise<TaskResult> {
+	const kernelApiKey = options.kernelApiKey ?? process.env.KERNEL_API_KEY;
+	if (!kernelApiKey) throw new Error("KERNEL_API_KEY is required to run a benchmark task");
+
+	const client = new Kernel({ apiKey: kernelApiKey });
+	const browser = await client.browsers.create({
+		stealth: true,
+		// Missing, zero or negative lifetimes fall back to the 300 second default.
+		timeout_seconds: options.timeoutSeconds && options.timeoutSeconds > 0 ? options.timeoutSeconds : 300,
+	});
+
+	const tokens: TokenTotals = { input: 0, output: 0, total: 0 };
+	let costUsd: number | null = null;
+	let steps = 0;
+	// Assigned once the harness exists; stays undefined if setup fails first.
+	let unsubscribe: (() => void) | undefined;
+
+	// Everything after browser creation runs under try/finally so that a
+	// failure anywhere below cannot leak the remote browser session.
+	try {
+		const cwd = process.cwd();
+		const repo = new JsonlSessionRepo({
+			fs: new NodeExecutionEnv({ cwd }),
+			sessionsRoot: options.sessionsRoot ?? join(tmpdir(), "cua-bench", "sessions"),
+		});
+		const session = await repo.create({ cwd });
+
+		const harness = new CuaAgentHarness({
+			env: new NodeExecutionEnv({ cwd }),
+			session,
+			model: modelRef,
+			browser,
+			client,
+			systemPrompt: ({ model }) => resolveCuaRuntimeSpec(model).defaultSystemPrompt,
+			getApiKeyAndHeaders: async (resolved) => {
+				const apiKey = getCuaEnvApiKey(resolved.provider);
+				return apiKey ? { apiKey } : undefined;
+			},
+		});
+
+		unsubscribe = harness.subscribe((event: AgentHarnessEvent) => {
+			// One `turn_start` event is counted as one step.
+			if (event.type === "turn_start") {
+				steps += 1;
+				return;
+			}
+			if (event.type === "message_end" && event.message.role === "assistant") {
+				const usage = event.message.usage;
+				if (!usage) return;
+				tokens.input += usage.input ?? 0;
+				tokens.output += usage.output ?? 0;
+				tokens.total += usage.totalTokens ?? 0;
+				// Only positive costs are accumulated, so `costUsd` stays null
+				// (rather than 0) when no turn carried a cost.
+				const turnCost = usage.cost?.total ?? 0;
+				if (turnCost > 0) costUsd = (costUsd ?? 0) + turnCost;
+			}
+		});
+
+		const startedAt = Date.now();
+		// The initial screenshot is best-effort; the prompt is sent without images when it fails.
+		const screenshot = await captureScreenshot(client, browser.session_id);
+		const images: ImageContent[] | undefined = screenshot
+			? [{ type: "image", data: screenshot, mimeType: "image/png" }]
+			: undefined;
+		const assistant = await harness.prompt(task.prompt, images ? { images } : undefined);
+		// Stop the clock before the `finally` teardown so that deleting the
+		// browser is not counted as part of the run.
+		const wallClockMs = Date.now() - startedAt;
+
+		const failed = assistant.stopReason === "error" || assistant.stopReason === "aborted";
+		return {
+			model: modelRef,
+			taskId: task.id,
+			success: null,
+			stopReason: assistant.stopReason ?? "completed",
+			finalText: textOf(assistant.content),
+			errorMessage: failed
+				? (assistant.errorMessage ?? `agent stopped with ${assistant.stopReason}`)
+				: undefined,
+			wallClockMs,
+			steps,
+			tokens,
+			costUsd,
+		};
+	} finally {
+		unsubscribe?.();
+		// Teardown is best-effort: a failed delete must not mask the run's own outcome.
+		await client.browsers.deleteByID(browser.session_id).catch(() => {});
+	}
+}
+
+/**
+ * Best-effort screenshot of a Kernel browser session, returned as a base64 string.
+ *
+ * Any failure — in the screenshot request or while reading the response
+ * body — is swallowed and reported as `undefined`.
+ */
+async function captureScreenshot(client: Kernel, sessionId: string): Promise<string | undefined> {
+	let encoded: string | undefined;
+	try {
+		const shot = await client.browsers.computer.captureScreenshot(sessionId);
+		const bytes = Buffer.from(await shot.arrayBuffer());
+		encoded = bytes.toString("base64");
+	} catch {
+		// Deliberately ignored: a missing screenshot is not an error for callers.
+		encoded = undefined;
+	}
+	return encoded;
+}
+
+/**
+ * Flatten message content into plain text.
+ *
+ * A string is returned unchanged. For an array, the `text` of every element
+ * shaped like `{ type: "text", text: string }` is joined with newlines, and
+ * all other elements are skipped. Anything else yields the empty string.
+ */
+function textOf(content: unknown): string {
+	if (typeof content === "string") return content;
+	if (!Array.isArray(content)) return "";
+
+	const isTextPart = (part: unknown): part is { type: "text"; text: string } => {
+		if (typeof part !== "object" || part === null) return false;
+		const candidate = part as { type?: unknown; text?: unknown };
+		return candidate.type === "text" && typeof candidate.text === "string";
+	};
+
+	return content
+		.filter(isTextPart)
+		.map((part) => part.text)
+		.join("\n");
+}
diff --git a/packages/bench/src/spike.ts b/packages/bench/src/spike.ts
@@ -0,0 +1,21 @@
+import type { CuaModelRef } from "@onkernel/cua-ai";
+import { runTask } from "./runTask";
+import type { Task } from "./types";
+
+/** Hard-coded task used by this spike script. */
+const TASK: Task = {
+	id: "hn-top-story",
+	prompt: "Go to https://news.ycombinator.com and tell me the title of the current top story.",
+};
+
+/** Model reference the spike task is run against. */
+const MODEL: CuaModelRef = "anthropic:claude-opus-4-6";
+
+/** Runs the hard-coded task on the hard-coded model and prints the result as indented JSON. */
+async function main(): Promise<void> {
+	console.log(`[bench] running task "${TASK.id}" on ${MODEL}`);
+	const outcome = await runTask(MODEL, TASK);
+	const pretty = JSON.stringify(outcome, null, 2);
+	console.log(pretty);
+}
+
+// Any rejection from the run is printed and turned into exit code 1.
+main().catch((failure) => {
+	console.error(failure);
+	process.exit(1);
+});
diff --git a/packages/bench/src/types.ts b/packages/bench/src/types.ts
@@ -0,0 +1,33 @@
+import type { CuaModelRef } from "@onkernel/cua-ai";
+
+/** A single benchmark task to run against a model. */
+export interface Task {
+	/** Identifier for the task; `runTask` copies it to `TaskResult.taskId`. */
+	id: string;
+	/** Natural-language instruction that `runTask` sends to the agent. */
+	prompt: string;
+	/** Optional starting URL; benchmark loaders (step 3) populate this. */
+	startUrl?: string;
+}
+
+/** Token totals summed across every model call in a run. */
+export interface TokenTotals {
+	/** Sum of the input token counts reported per assistant message. */
+	input: number;
+	/** Sum of the output token counts reported per assistant message. */
+	output: number;
+	/** Sum of the reported per-message total token counts; `runTask` does not recompute it from `input + output`. */
+	total: number;
+}
+
+/** Outcome of running one task on one model. */
+export interface TaskResult {
+	/** Model the task was run on. */
+	model: CuaModelRef;
+	/** `id` of the task that was run. */
+	taskId: string;
+	/** null until the accuracy judge (step 3) scores the run. */
+	success: boolean | null;
+	/** Stop reason of the final assistant message; `runTask` uses "completed" when none is reported. */
+	stopReason: string;
+	/** Text parts of the final assistant message, joined with newlines. */
+	finalText: string;
+	/** Set by `runTask` only when the stop reason is "error" or "aborted". */
+	errorMessage?: string;
+	/** Elapsed wall-clock time of the run, in milliseconds. */
+	wallClockMs: number;
+	/** Number of agent turns taken. */
+	steps: number;
+	/** Token usage summed over the run. */
+	tokens: TokenTotals;
+	/** null when the provider doesn't report cost; token×price conversion is step 4. */
+	costUsd: number | null;
+}
diff --git a/packages/bench/tsconfig.build.json b/packages/bench/tsconfig.build.json
@@ -0,0 +1,13 @@
+{
+  "extends": "../../tsconfig.base.json",
+  "compilerOptions": {
+    "outDir": "./dist-tsc",
+    "rootDir": "./src",
+    "emitDeclarationOnly": true,
+    "sourceMap": false,
+    "declarationMap": false
+  },
+  "include": ["src/**/*.ts"],
+  "exclude": ["node_modules", "dist", "**/*.d.ts", "src/**/*.d.ts"],
+  "references": [{ "path": "../ai" }, { "path": "../agent" }]
+}
diff --git a/packages/bench/tsconfig.json b/packages/bench/tsconfig.json
@@ -0,0 +1,3 @@
+{
+  "extends": "./tsconfig.build.json"
+}
diff --git a/tsconfig.json b/tsconfig.json
@@ -4,6 +4,7 @@
     { "path": "./packages/ai" },
     { "path": "./packages/agent" },
     { "path": "./packages/ptywright" },
-    { "path": "./packages/cli" }
+    { "path": "./packages/cli" },
+    { "path": "./packages/bench" }
   ]
 }
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		export { runTask, type RunTaskOptions } from "./runTask";
		export type { Task, TaskResult, TokenTotals } from "./types";