Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,522 changes: 776 additions & 746 deletions package-lock.json

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@
"packages/ai",
"packages/agent",
"packages/ptywright",
"packages/cli"
"packages/cli",
"packages/bench"
],
"scripts": {
"build": "npm run build --workspace @onkernel/cua-ai && npm run build --workspace @onkernel/cua-agent && tsc -b && npm run build --workspace @onkernel/cua-cli && npm run build:native --workspace @onkernel/ptywright --if-present",
Expand Down
26 changes: 26 additions & 0 deletions packages/bench/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
{
"name": "@onkernel/cua-bench",
"version": "0.0.0",
"description": "Benchmark runner for CUA models on Kernel cloud browsers",
"license": "MIT",
"type": "module",
"private": true,
"exports": {
".": {
"types": "./dist-tsc/index.d.ts",
"source": "./src/index.ts"
}
},
"scripts": {
"spike": "NODE_OPTIONS=--conditions=source tsx src/spike.ts",
"typecheck": "tsc -b"
},
"dependencies": {
"@onkernel/cua-agent": "*",
"@onkernel/cua-ai": "*",
"@onkernel/sdk": "0.49.0"
},
"devDependencies": {
"tsx": "^4.21.0"
}
}
2 changes: 2 additions & 0 deletions packages/bench/src/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
export { runTask, type RunTaskOptions } from "./runTask";
export type { Task, TaskResult, TokenTotals } from "./types";
130 changes: 130 additions & 0 deletions packages/bench/src/runTask.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
import { type AgentHarnessEvent, CuaAgentHarness, JsonlSessionRepo, NodeExecutionEnv } from "@onkernel/cua-agent";
import { type CuaModelRef, getCuaEnvApiKey, type ImageContent, resolveCuaRuntimeSpec } from "@onkernel/cua-ai";
import Kernel from "@onkernel/sdk";
import { tmpdir } from "node:os";
import { join } from "node:path";
import type { Task, TaskResult, TokenTotals } from "./types";

/** Optional settings accepted by `runTask`; every field has a fallback. */
export interface RunTaskOptions {
/** Kernel API key. When omitted, the `KERNEL_API_KEY` environment variable is used; `runTask` throws if neither is set. */
kernelApiKey?: string;
/** Kernel browser session lifetime in seconds. Missing, zero, or negative values fall back to 300. */
timeoutSeconds?: number;
/** Root directory for jsonl transcripts. Defaults to `cua-bench/sessions` under the OS temp dir. */
sessionsRoot?: string;
}

/**
 * Run a single benchmark task on a single model against a fresh Kernel
 * browser, returning timing, step count and token totals.
 *
 * `success` is always `null` here (scoring happens in a later step).
 * `costUsd` is the sum of the positive `usage.cost.total` values reported on
 * assistant messages, or `null` when no message reported a positive cost.
 *
 * The browser is deleted (best effort) on every exit path once it has been
 * created, including failures while setting up the session or the harness.
 *
 * @param modelRef - Model to run the task on.
 * @param task - Task whose `prompt` is sent to the agent.
 * @param options - Optional overrides; see `RunTaskOptions`.
 * @returns The run outcome. `wallClockMs` covers the initial screenshot and
 *   the agent prompt, and excludes browser teardown.
 * @throws Error when no Kernel API key is available. Errors raised while
 *   creating the browser or session, or by the agent prompt, propagate
 *   unchanged.
 */
export async function runTask(
  modelRef: CuaModelRef,
  task: Task,
  options: RunTaskOptions = {},
): Promise<TaskResult> {
  const kernelApiKey = options.kernelApiKey ?? process.env.KERNEL_API_KEY;
  if (!kernelApiKey) throw new Error("KERNEL_API_KEY is required to run a benchmark task");

  const client = new Kernel({ apiKey: kernelApiKey });
  const browser = await client.browsers.create({
    stealth: true,
    timeout_seconds: options.timeoutSeconds && options.timeoutSeconds > 0 ? options.timeoutSeconds : 300,
  });

  // From here on a remote browser exists, so everything runs under the
  // try/finally that deletes it; otherwise a setup failure would leak it.
  let unsubscribe: (() => void) | undefined;
  try {
    const cwd = process.cwd();
    const repo = new JsonlSessionRepo({
      fs: new NodeExecutionEnv({ cwd }),
      sessionsRoot: options.sessionsRoot ?? join(tmpdir(), "cua-bench", "sessions"),
    });
    const session = await repo.create({ cwd });

    const tokens: TokenTotals = { input: 0, output: 0, total: 0 };
    let costUsd: number | null = null;
    let steps = 0;

    const harness = new CuaAgentHarness({
      env: new NodeExecutionEnv({ cwd }),
      session,
      model: modelRef,
      browser,
      client,
      systemPrompt: ({ model }) => resolveCuaRuntimeSpec(model).defaultSystemPrompt,
      getApiKeyAndHeaders: async (resolved) => {
        const apiKey = getCuaEnvApiKey(resolved.provider);
        return apiKey ? { apiKey } : undefined;
      },
    });

    unsubscribe = harness.subscribe((event: AgentHarnessEvent) => {
      if (event.type === "turn_start") {
        steps += 1;
        return;
      }
      if (event.type === "message_end" && event.message.role === "assistant") {
        const usage = event.message.usage;
        if (!usage) return;
        tokens.input += usage.input ?? 0;
        tokens.output += usage.output ?? 0;
        tokens.total += usage.totalTokens ?? 0;
        const turnCost = usage.cost?.total ?? 0;
        // Stay null unless a positive cost was actually reported.
        if (turnCost > 0) costUsd = (costUsd ?? 0) + turnCost;
      }
    });

    const startedAt = Date.now();
    // The screenshot is optional: the prompt is sent without images if capture fails.
    const screenshot = await captureScreenshot(client, browser.session_id);
    const images: ImageContent[] | undefined = screenshot
      ? [{ type: "image", data: screenshot, mimeType: "image/png" }]
      : undefined;
    const assistant = await harness.prompt(task.prompt, images ? { images } : undefined);
    // Stop the clock before cleanup so teardown latency is not counted as run time.
    const wallClockMs = Date.now() - startedAt;

    const stopReason = assistant.stopReason ?? "completed";
    const finalText = textOf(assistant.content);
    let errorMessage: string | undefined;
    if (assistant.stopReason === "error" || assistant.stopReason === "aborted") {
      errorMessage = assistant.errorMessage ?? `agent stopped with ${assistant.stopReason}`;
    }

    return {
      model: modelRef,
      taskId: task.id,
      success: null,
      stopReason,
      finalText,
      errorMessage,
      wallClockMs,
      steps,
      tokens,
      costUsd,
    };
  } finally {
    unsubscribe?.();
    // Best-effort cleanup: a failed delete must not mask the run's result or error.
    await client.browsers.deleteByID(browser.session_id).catch(() => {});
  }
}

/**
 * Best-effort screenshot of a Kernel browser session.
 *
 * @param client - Kernel client used to request the screenshot.
 * @param sessionId - Id of the browser session to capture.
 * @returns The screenshot bytes encoded as base64, or `undefined` if any step
 *   (request, body read, encoding) throws.
 */
async function captureScreenshot(client: Kernel, sessionId: string): Promise<string | undefined> {
  try {
    const png = await (await client.browsers.computer.captureScreenshot(sessionId)).arrayBuffer();
    const encoded = Buffer.from(png).toString("base64");
    return encoded;
  } catch {
    // Deliberately swallowed: callers treat a missing screenshot as optional.
    return undefined;
  }
}

/**
 * Flatten message content into plain text.
 *
 * A string is returned as-is. For an array, every element shaped like
 * `{ type: "text", text: string }` contributes its `text`, joined with
 * newlines; all other elements are ignored. Any other input yields "".
 */
function textOf(content: unknown): string {
  if (typeof content === "string") return content;
  if (!Array.isArray(content)) return "";
  return content
    .flatMap((part: unknown): string[] => {
      if (!part || typeof part !== "object") return [];
      if ((part as { type?: unknown }).type !== "text") return [];
      if (typeof (part as { text?: unknown }).text !== "string") return [];
      return [(part as { text: string }).text];
    })
    .join("\n");
}
21 changes: 21 additions & 0 deletions packages/bench/src/spike.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import type { CuaModelRef } from "@onkernel/cua-ai";
import { runTask } from "./runTask";
import type { Task } from "./types";

/** The single hard-coded task this spike script runs. */
const TASK: Task = {
id: "hn-top-story",
prompt: "Go to https://news.ycombinator.com and tell me the title of the current top story.",
};

/** The single hard-coded model reference the spike task is run on. */
const MODEL: CuaModelRef = "anthropic:claude-opus-4-6";

/**
 * Spike entry point: announce the run, execute `TASK` on `MODEL`, and print
 * the resulting `TaskResult` as pretty-printed JSON.
 */
async function main(): Promise<void> {
  console.log(`[bench] running task "${TASK.id}" on ${MODEL}`);
  const outcome = await runTask(MODEL, TASK);
  const rendered = JSON.stringify(outcome, null, 2);
  console.log(rendered);
}

// Any failure is printed and turned into a non-zero exit code.
main().catch((failure) => {
  console.error(failure);
  process.exit(1);
});
33 changes: 33 additions & 0 deletions packages/bench/src/types.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import type { CuaModelRef } from "@onkernel/cua-ai";

/** A single benchmark task to run against a model. */
export interface Task {
/** Identifier for the task; `runTask` copies it into `TaskResult.taskId`. */
id: string;
/** Instruction text that `runTask` sends to the agent as its prompt. */
prompt: string;
/**
 * Optional starting URL; benchmark loaders (step 3) populate this.
 * NOTE(review): `runTask` does not read this field in the code shown here.
 */
startUrl?: string;
}

/**
 * Token totals summed across every model call in a run.
 * `runTask` accumulates these from the `usage` of each assistant message.
 */
export interface TokenTotals {
/** Sum of the reported `usage.input` values. */
input: number;
/** Sum of the reported `usage.output` values. */
output: number;
/**
 * Sum of the reported `usage.totalTokens` values. It is accumulated
 * separately, so it is not guaranteed to equal `input + output`.
 */
total: number;
}

/** Outcome of running one task on one model. */
export interface TaskResult {
/** Model reference the task was run on. */
model: CuaModelRef;
/** `id` of the task that was run. */
taskId: string;
/** null until the accuracy judge (step 3) scores the run. */
success: boolean | null;
/** Stop reason reported by the final assistant message, or "completed" if it reported none. */
stopReason: string;
/** Text parts of the final assistant message, joined with newlines; "" if there were none. */
finalText: string;
/** Set only when the run stopped with "error" or "aborted". */
errorMessage?: string;
/** Elapsed wall-clock time for the run, in milliseconds. */
wallClockMs: number;
/** Number of agent turns taken. */
steps: number;
/** Token usage summed over the run. */
tokens: TokenTotals;
/** null when the provider doesn't report cost; token×price conversion is step 4. */
costUsd: number | null;
}
13 changes: 13 additions & 0 deletions packages/bench/tsconfig.build.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"extends": "../../tsconfig.base.json",
"compilerOptions": {
"outDir": "./dist-tsc",
"rootDir": "./src",
"emitDeclarationOnly": true,
"sourceMap": false,
"declarationMap": false
},
"include": ["src/**/*.ts"],
"exclude": ["node_modules", "dist", "**/*.d.ts", "src/**/*.d.ts"],
"references": [{ "path": "../ai" }, { "path": "../agent" }]
}
3 changes: 3 additions & 0 deletions packages/bench/tsconfig.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"extends": "./tsconfig.build.json"
}
3 changes: 2 additions & 1 deletion tsconfig.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
{ "path": "./packages/ai" },
{ "path": "./packages/agent" },
{ "path": "./packages/ptywright" },
{ "path": "./packages/cli" }
{ "path": "./packages/cli" },
{ "path": "./packages/bench" }
]
}
Loading