From b930ca1e2f22e73886cd059dca688109010a895c Mon Sep 17 00:00:00 2001 From: dprevoznik <58714078+dprevoznik@users.noreply.github.com> Date: Fri, 19 Jun 2026 21:51:41 +0000 Subject: [PATCH 01/11] Add opt-in playwright_execute tool to the CUA agent and CLI Exposes a tool that runs Playwright/TypeScript directly against the browser session (via the Kernel SDK browsers.playwright.execute) for steps that are awkward as raw pointer/keyboard actions. Modeled on the existing computer_use_extra navigation tool: defined in cua-ai, executed through the translator, gated by a `playwright` option, and added to keepToolNames so providers retain it in the payload. Enable with the `--playwright` CLI flag. Returns result/stdout/stderr and appends a fresh screenshot so the screenshot loop stays coherent. Co-Authored-By: Claude Opus 4.7 --- packages/agent/src/agent.ts | 12 +++ packages/agent/src/index.ts | 1 + packages/agent/src/tools.ts | 88 +++++++++++++++++-- packages/agent/src/translator/translator.ts | 11 +++ packages/agent/test/agent.test.ts | 17 ++++ .../agent/test/tool-exhaustiveness.test.ts | 55 ++++++++++++ packages/ai/src/providers/common.ts | 27 ++++++ packages/cli/src/cli-harness.ts | 2 + packages/cli/src/cli.ts | 6 ++ packages/cli/src/harness.ts | 3 + packages/cli/src/tui/message-list.ts | 2 + 11 files changed, 216 insertions(+), 8 deletions(-) diff --git a/packages/agent/src/agent.ts b/packages/agent/src/agent.ts index 49a4a6b..3e158f2 100644 --- a/packages/agent/src/agent.ts +++ b/packages/agent/src/agent.ts @@ -12,6 +12,7 @@ import { import { type Api, CUA_NAVIGATION_TOOL_NAME, + CUA_PLAYWRIGHT_TOOL_NAME, type CuaModelRef, type CuaRuntimeSpec, type CuaSimpleStreamOptions, @@ -66,6 +67,8 @@ export type CuaAgentOptions = Omit & { extraTools?: AgentTool[]; /** Expose a helper for browser navigation and URL reads. */ computerUseExtra?: boolean; + /** Expose a tool that runs Playwright code against the browser session. 
*/ + playwright?: boolean; }; /** @@ -89,6 +92,8 @@ export type CuaAgentHarnessOptions< extraTools?: AgentTool[]; /** Expose a helper for browser navigation and URL reads. */ computerUseExtra?: boolean; + /** Expose a tool that runs Playwright code against the browser session. */ + playwright?: boolean; /** Optional payload hook composed after the provider-specific CUA payload hook. */ onPayload?: SimpleStreamOptions["onPayload"]; }; @@ -110,6 +115,7 @@ class CuaRuntimeController { model: CuaRuntimeInput; extraTools?: AgentTool[]; computerUseExtra?: boolean; + playwright?: boolean; onPayload?: SimpleStreamOptions["onPayload"]; }, ) { @@ -136,6 +142,7 @@ class CuaRuntimeController { { toolExecutors: this.runtimeSpec.toolExecutors, computerUseExtra: this.options.computerUseExtra, + playwright: this.options.playwright, }, this.translator, ), @@ -159,6 +166,7 @@ class CuaRuntimeController { return [ ...(this.options.extraTools ?? []).map((tool) => tool.name), ...(this.options.computerUseExtra ? [CUA_NAVIGATION_TOOL_NAME] : []), + ...(this.options.playwright ? 
[CUA_PLAYWRIGHT_TOOL_NAME] : []), ]; } @@ -203,6 +211,7 @@ export class CuaAgent extends Agent { prepareNextTurn, extraTools, computerUseExtra, + playwright, ...agentOptions } = options; const runtime = new CuaRuntimeController({ @@ -211,6 +220,7 @@ export class CuaAgent extends Agent { model: initialState.model, extraTools, computerUseExtra, + playwright, onPayload, }); const wrappedStreamFn: StreamFn = (model, context, streamOptions) => { @@ -326,6 +336,7 @@ export class CuaAgentHarness< model, extraTools, computerUseExtra, + playwright, systemPrompt, getApiKeyAndHeaders, onPayload, @@ -338,6 +349,7 @@ export class CuaAgentHarness< model, extraTools, computerUseExtra, + playwright, onPayload, }); const resolvedTools = runtime.tools(); diff --git a/packages/agent/src/index.ts b/packages/agent/src/index.ts index 052b5d8..75bea96 100644 --- a/packages/agent/src/index.ts +++ b/packages/agent/src/index.ts @@ -8,6 +8,7 @@ export type { ComputerToolOptions, CuaExecutorTool, NavigationDetails, + PlaywrightDetails, } from "./tools"; export { CuaAgent, CuaAgentHarness } from "./agent"; export type { CuaAgentHarnessOptions, CuaAgentOptions, CuaAgentState } from "./agent"; diff --git a/packages/agent/src/tools.ts b/packages/agent/src/tools.ts index 74becaa..73e9c63 100644 --- a/packages/agent/src/tools.ts +++ b/packages/agent/src/tools.ts @@ -2,15 +2,18 @@ import type Kernel from "@onkernel/sdk"; import type { ImageContent, TextContent, Tool } from "@earendil-works/pi-ai"; import { CUA_NAVIGATION_TOOL_NAME, + CUA_PLAYWRIGHT_TOOL_NAME, createCuaNavigationToolDefinition, + createCuaPlaywrightToolDefinition, type ComputerToolCoordinateSystem, type CuaBatchInput, type CuaNavigationInput, + type CuaPlaywrightInput, type CuaScreenshotSpec, type CuaToolExecutorSpec, type TSchema, } from "@onkernel/cua-ai"; -import { InternalComputerTranslator, type KernelBrowser } from "./translator/translator"; +import { InternalComputerTranslator, type KernelBrowser, type 
PlaywrightExecutionResult } from "./translator/translator"; import type { AgentTool, AgentToolResult } from "@earendil-works/pi-agent-core"; export interface ComputerToolOptions { @@ -20,6 +23,7 @@ export interface ComputerToolOptions { coordinateSystem?: ComputerToolCoordinateSystem; screenshot?: CuaScreenshotSpec; computerUseExtra?: boolean; + playwright?: boolean; } type ToolContent = Array; @@ -35,12 +39,19 @@ export interface NavigationDetails { url?: string; } +export interface PlaywrightDetails { + success: boolean; + statusText: string; +} + type BatchTool = AgentTool; type NavigationTool = AgentTool; +type PlaywrightTool = AgentTool; type ActionTool = AgentTool; -export type CuaExecutorTool = BatchTool | NavigationTool | ActionTool; +export type CuaExecutorTool = BatchTool | NavigationTool | PlaywrightTool | ActionTool; type NavigationExecutorSpec = { kind: "navigation"; definition: Tool }; -type ComputerExecutorSpec = CuaToolExecutorSpec | NavigationExecutorSpec; +type PlaywrightExecutorSpec = { kind: "playwright"; definition: Tool }; +type ComputerExecutorSpec = CuaToolExecutorSpec | NavigationExecutorSpec | PlaywrightExecutorSpec; export function createCuaComputerTools(args: ComputerToolOptions): CuaExecutorTool[] { return buildCuaComputerTools(args, new InternalComputerTranslator(args)); @@ -48,18 +59,20 @@ export function createCuaComputerTools(args: ComputerToolOptions): CuaExecutorTo /** Build executor tools against an existing translator (internal; not part of the package surface). 
*/ export function buildCuaComputerTools( - args: Pick, + args: Pick, translator: InternalComputerTranslator, ): CuaExecutorTool[] { - return withNavigationTool(args).map((executor) => createExecutorTool(executor, translator)); + return withExtraTools(args).map((executor) => createExecutorTool(executor, translator)); } -function withNavigationTool(args: Pick): ComputerExecutorSpec[] { +function withExtraTools(args: Pick): ComputerExecutorSpec[] { const executors: ComputerExecutorSpec[] = [...args.toolExecutors]; const existing = new Set(executors.map((executor) => executor.definition.name)); if (args.computerUseExtra && !existing.has(CUA_NAVIGATION_TOOL_NAME)) { - const definition = createCuaNavigationToolDefinition(); - executors.push({ kind: "navigation", definition }); + executors.push({ kind: "navigation", definition: createCuaNavigationToolDefinition() }); + } + if (args.playwright && !existing.has(CUA_PLAYWRIGHT_TOOL_NAME)) { + executors.push({ kind: "playwright", definition: createCuaPlaywrightToolDefinition() }); } return executors; } @@ -78,6 +91,19 @@ function createExecutorTool(executor: ComputerExecutorSpec, translator: Internal }; return tool; } + if (isPlaywrightExecutor(executor)) { + const tool: PlaywrightTool = { + name: definition.name, + label: definition.name, + description: definition.description, + parameters: definition.parameters, + executionMode: "sequential", + async execute(_toolCallId: string, params: unknown): Promise> { + return executePlaywrightTool(translator, asPlaywrightInput(params)); + }, + }; + return tool; + } const tool: ActionTool = { name: definition.name, label: definition.name, @@ -95,6 +121,10 @@ function isNavigationExecutor(executor: ComputerExecutorSpec): executor is Navig return "kind" in executor && executor.kind === "navigation"; } +function isPlaywrightExecutor(executor: ComputerExecutorSpec): executor is PlaywrightExecutorSpec { + return "kind" in executor && executor.kind === "playwright"; +} + async function 
executeBatchTool(translator: InternalComputerTranslator, params: CuaBatchInput): Promise> { const content: ToolContent = []; const readResults: BatchDetails["readResults"] = []; @@ -149,6 +179,41 @@ async function executeNavigationTool(translator: InternalComputerTranslator, par } } +async function executePlaywrightTool(translator: InternalComputerTranslator, params: CuaPlaywrightInput): Promise> { + let execution: PlaywrightExecutionResult; + try { + execution = await translator.executePlaywright(params.code, params.timeout_sec); + } catch (err) { + throw new Error(`playwright_execute failed: ${errorMessage(err)}`, { cause: err }); + } + + const content: ToolContent = []; + if (execution.result !== undefined) { + content.push({ type: "text", text: `result: ${formatPlaywrightResult(execution.result)}` }); + } + if (execution.stdout?.trim()) { + content.push({ type: "text", text: `stdout:\n${execution.stdout.trimEnd()}` }); + } + if (execution.stderr?.trim()) { + content.push({ type: "text", text: `stderr:\n${execution.stderr.trimEnd()}` }); + } + if (!execution.success) { + content.push({ type: "text", text: `error: ${execution.error ?? "playwright execution reported failure"}` }); + } + + const statusText = execution.success ? "Playwright executed successfully." : `Playwright execution failed: ${execution.error ?? "unknown error"}`; + if (content.length === 0) content.push({ type: "text", text: statusText }); + + const screenshot = await translator.screenshot(); + content.push({ type: "image", data: screenshot.data.toString("base64"), mimeType: screenshot.mimeType }); + + return { content, details: { success: execution.success, statusText } }; +} + +function formatPlaywrightResult(result: unknown): string { + return typeof result === "string" ? result : JSON.stringify(result); +} + function errorMessage(err: unknown): string { return err instanceof Error ? 
err.message : String(err); } @@ -163,3 +228,10 @@ function asNavigationInput(value: unknown): CuaNavigationInput { } throw new Error("invalid computer_use_extra parameters"); } + +function asPlaywrightInput(value: unknown): CuaPlaywrightInput { + if (value && typeof value === "object" && typeof (value as { code?: unknown }).code === "string") { + return value as CuaPlaywrightInput; + } + throw new Error("invalid playwright_execute parameters"); +} diff --git a/packages/agent/src/translator/translator.ts b/packages/agent/src/translator/translator.ts index 495b19a..73343af 100644 --- a/packages/agent/src/translator/translator.ts +++ b/packages/agent/src/translator/translator.ts @@ -85,6 +85,14 @@ export class InternalComputerTranslator { return { x: Math.trunc(pos.x), y: Math.trunc(pos.y) }; } + async executePlaywright(code: string, timeoutSec?: number): Promise { + const timeout = typeof timeoutSec === "number" && Number.isFinite(timeoutSec) && timeoutSec > 0 ? Math.trunc(timeoutSec) : undefined; + return this.client.browsers.playwright.execute(this.sessionId, { + code, + ...(timeout !== undefined ? 
{ timeout_sec: timeout } : {}), + }); + } + async executeBatch(actions: CuaAction[]): Promise { const result: BatchExecutionResult = { readResults: [] }; const pending: KernelBatchAction[] = []; @@ -228,6 +236,9 @@ export class InternalComputerTranslator { type KernelBatchAction = Parameters[1]["actions"][number]; +export type PlaywrightExecutionResult = + Awaited>; + const CLICK_BUTTONS: ReadonlySet = new Set(["left", "right", "middle", "back", "forward"]); const DRAG_BUTTONS: ReadonlySet = new Set(["left", "right", "middle"]); diff --git a/packages/agent/test/agent.test.ts b/packages/agent/test/agent.test.ts index 73e83ee..dfc7525 100644 --- a/packages/agent/test/agent.test.ts +++ b/packages/agent/test/agent.test.ts @@ -144,6 +144,23 @@ describe("CuaAgent", () => { ]); }); + it("synthesizes a playwright_execute tool when requested", () => { + const runtime = resolveCuaRuntimeSpec("openai:gpt-5.5"); + const agent = new CuaAgent({ + browser, + client, + playwright: true, + initialState: { + model: "openai:gpt-5.5", + }, + }); + + expect(agent.state.tools.map((tool) => tool.name)).toEqual([ + ...runtime.toolExecutors.map((tool) => tool.definition.name), + "playwright_execute", + ]); + }); + it("refreshes CUA runtime state when state.model changes", () => { const runtime = resolveCuaRuntimeSpec("google:gemini-3-flash-preview"); const agent = new CuaAgent({ diff --git a/packages/agent/test/tool-exhaustiveness.test.ts b/packages/agent/test/tool-exhaustiveness.test.ts index 2a75273..ac905bc 100644 --- a/packages/agent/test/tool-exhaustiveness.test.ts +++ b/packages/agent/test/tool-exhaustiveness.test.ts @@ -85,4 +85,59 @@ describe("Cua tool executor coverage", () => { ]); expect(result.content.at(-1)).toMatchObject({ type: "image", mimeType: "image/png" }); }); + + it("runs the playwright_execute tool and appends a screenshot", async () => { + const calls: Array<{ id: string; body: { code: string; timeout_sec?: number } }> = []; + const runtime = 
resolveCuaRuntimeSpec("openai:gpt-5.5"); + const tools = createCuaComputerTools({ + browser, + client: { + browsers: { + playwright: { + execute: async (id: string, body: { code: string; timeout_sec?: number }) => { + calls.push({ id, body }); + return { success: true, result: "Example Domain", stdout: "logged\n", stderr: "" }; + }, + }, + computer: { captureScreenshot: async () => new Response(tinyPng) }, + }, + } as unknown as Kernel, + toolExecutors: runtime.toolExecutors, + playwright: true, + }); + const playwright = tools.find((tool) => tool.name === "playwright_execute"); + expect(playwright).toBeDefined(); + + const result = await playwright!.execute("call_1", { code: "return await page.title();", timeout_sec: 30 }); + + expect(calls).toEqual([{ id: "browser_123", body: { code: "return await page.title();", timeout_sec: 30 } }]); + expect(result.content[0]).toMatchObject({ type: "text", text: "result: Example Domain" }); + expect(result.content.some((block) => block.type === "text" && block.text === "stdout:\nlogged")).toBe(true); + expect(result.content.at(-1)).toMatchObject({ type: "image", mimeType: "image/png" }); + expect(result.details).toMatchObject({ success: true }); + }); + + it("surfaces playwright_execute failures as tool content without throwing", async () => { + const runtime = resolveCuaRuntimeSpec("openai:gpt-5.5"); + const tools = createCuaComputerTools({ + browser, + client: { + browsers: { + playwright: { execute: async () => ({ success: false, error: "boom", stderr: "stack" }) }, + computer: { captureScreenshot: async () => new Response(tinyPng) }, + }, + } as unknown as Kernel, + toolExecutors: runtime.toolExecutors, + playwright: true, + }); + const playwright = tools.find((tool) => tool.name === "playwright_execute"); + expect(playwright).toBeDefined(); + + const result = await playwright!.execute("call_1", { code: "await page.click('#missing')" }); + + expect(result.content.some((block) => block.type === "text" && 
block.text.includes("error: boom"))).toBe(true); + expect(result.content.some((block) => block.type === "text" && block.text === "stderr:\nstack")).toBe(true); + expect(result.content.at(-1)).toMatchObject({ type: "image", mimeType: "image/png" }); + expect(result.details).toMatchObject({ success: false }); + }); }); diff --git a/packages/ai/src/providers/common.ts b/packages/ai/src/providers/common.ts index b31b479..6fe2f21 100644 --- a/packages/ai/src/providers/common.ts +++ b/packages/ai/src/providers/common.ts @@ -297,10 +297,22 @@ export const CuaNavigationSchema = Type.Object( { additionalProperties: false }, ); +export const CuaPlaywrightSchema = Type.Object( + { + code: Type.String({ + description: + "Playwright/TypeScript to run against the live browser. `page`, `context`, and `browser` are in scope; end with a `return` to send a JSON-serializable value back. Example: \"await page.goto('https://example.com'); return await page.title();\"", + }), + timeout_sec: Type.Optional(Type.Number({ description: "Maximum execution time in seconds. Defaults to 60." })), + }, + { additionalProperties: false }, +); + export interface CuaBatchInput { actions: CuaAction[]; } export type CuaNavigationInput = Static; +export type CuaPlaywrightInput = Static; /** Tool schema plus execution adapter for a browser computer-use tool. 
*/ export interface CuaToolExecutorSpec { @@ -317,6 +329,7 @@ export interface CuaToolExecutorSpec { */ export const CUA_BATCH_TOOL_NAME = "computer_batch"; export const CUA_NAVIGATION_TOOL_NAME = "computer_use_extra"; +export const CUA_PLAYWRIGHT_TOOL_NAME = "playwright_execute"; export const CUA_BATCH_TOOL_DESCRIPTION = [ "Execute multiple computer actions in sequence, including ordered read steps like url(), cursor_position(), and screenshot().", @@ -326,6 +339,12 @@ export const CUA_BATCH_TOOL_DESCRIPTION = [ export const CUA_NAVIGATION_TOOL_DESCRIPTION = "High-level browser navigation helpers for goto, back, forward, and url."; +export const CUA_PLAYWRIGHT_TOOL_DESCRIPTION = [ + "Run Playwright/TypeScript directly against the live browser session for steps that are awkward as raw pointer/keyboard actions: precise DOM reads, form fills, data extraction, and waiting on selectors.", + "`page`, `context`, and `browser` are in scope and the code may `return` a JSON-serializable value, which comes back as the result.", + "Capture page state with a follow-up screenshot action rather than calling page.screenshot() inside the code.", +].join("\n"); + export interface ComputerToolsOptions { actions?: readonly CuaActionType[]; } @@ -425,6 +444,14 @@ export function createCuaNavigationToolDefinition(): Tool { }; } +export function createCuaPlaywrightToolDefinition(): Tool { + return { + name: CUA_PLAYWRIGHT_TOOL_NAME, + description: CUA_PLAYWRIGHT_TOOL_DESCRIPTION, + parameters: CuaPlaywrightSchema, + }; +} + export interface CuaScreenshotTransformSpec { width: number; height: number; diff --git a/packages/cli/src/cli-harness.ts b/packages/cli/src/cli-harness.ts index 7c7d806..dc13007 100644 --- a/packages/cli/src/cli-harness.ts +++ b/packages/cli/src/cli-harness.ts @@ -176,6 +176,7 @@ export interface HarnessCliFlags { debugTui: boolean; jsonlIncludeDeltas: boolean; jsonlIncludeImages: boolean; + playwright: boolean; model?: string; thinking?: string; browserProfile?: 
string; @@ -413,6 +414,7 @@ async function setupHarnessRuntime( skills, contextFiles, thinkingLevel, + playwright: flags.playwright, modelBaseUrl: baseUrlOverride, }); diff --git a/packages/cli/src/cli.ts b/packages/cli/src/cli.ts index b3a249c..5c20070 100644 --- a/packages/cli/src/cli.ts +++ b/packages/cli/src/cli.ts @@ -45,6 +45,8 @@ Options: --profile-no-save-changes Do not persist changes back to the profile --browser-timeout Browser inactivity timeout in seconds (default 300) --max-steps Max turns for action subcommands (default 3) + --playwright Add the playwright_execute tool so the model can run + Playwright code against the browser session --out Output file for screenshot subcommand -o, --output Output format for --print: text (default) | jsonl --jsonl-include-deltas Include assistant_text_delta events (default off) @@ -98,6 +100,7 @@ interface CliFlags { debugTui: boolean; jsonlIncludeDeltas: boolean; jsonlIncludeImages: boolean; + playwright: boolean; model?: string; thinking?: string; browserProfile?: string; @@ -146,6 +149,7 @@ function parseCliArgs(argv: string[]): CliFlags { output: { type: "string", short: "o" }, "jsonl-include-deltas": { type: "boolean", default: false }, "jsonl-include-images": { type: "boolean", default: false }, + playwright: { type: "boolean", default: false }, }, allowPositionals: true, strict: true, @@ -192,6 +196,7 @@ function parseCliArgs(argv: string[]): CliFlags { output: parsed.values.output as string | undefined, jsonlIncludeDeltas: !!parsed.values["jsonl-include-deltas"], jsonlIncludeImages: !!parsed.values["jsonl-include-images"], + playwright: !!parsed.values.playwright, positionals: parsed.positionals, }; } @@ -207,6 +212,7 @@ function toHarnessFlags(flags: CliFlags): HarnessCliFlags { debugTui: flags.debugTui, jsonlIncludeDeltas: flags.jsonlIncludeDeltas, jsonlIncludeImages: flags.jsonlIncludeImages, + playwright: flags.playwright, model: flags.model, thinking: flags.thinking, browserProfile: flags.browserProfile, 
diff --git a/packages/cli/src/harness.ts b/packages/cli/src/harness.ts index f68ced8..a748811 100644 --- a/packages/cli/src/harness.ts +++ b/packages/cli/src/harness.ts @@ -31,6 +31,8 @@ export interface BuildCuaHarnessOptions { /** Context files (AGENTS.md, CLAUDE.md, …) appended to the system prompt. */ contextFiles?: ContextFile[]; thinkingLevel?: ThinkingLevel; + /** Expose the playwright_execute tool that runs Playwright code against the browser session. */ + playwright?: boolean; /** Override the default coding-tools extraTools (bash/read/edit/write/grep/find/ls). */ extraTools?: CuaAgentHarnessOptions["extraTools"]; /** Override env-var API-key resolution (mainly for tests). */ @@ -60,6 +62,7 @@ export function buildCuaHarness(opts: BuildCuaHarnessOptions): CuaAgentHarness { browser: opts.browser, client: opts.client, extraTools, + playwright: opts.playwright, resources: { skills }, thinkingLevel: opts.thinkingLevel, systemPrompt: ({ model: activeModel, resources }) => { diff --git a/packages/cli/src/tui/message-list.ts b/packages/cli/src/tui/message-list.ts index 422c84d..95b9b32 100644 --- a/packages/cli/src/tui/message-list.ts +++ b/packages/cli/src/tui/message-list.ts @@ -86,6 +86,8 @@ function formatToolCall(name: string, args: unknown): string { if (action === "goto" && typeof obj.url === "string") return `goto(${obj.url})`; return action; } + case "playwright_execute": + return colors.dim(typeof obj.code === "string" ? truncate(obj.code.replace(/\s+/g, " ").trim(), 80) : ""); case "bash": return colors.dim(typeof obj.command === "string" ? truncate(obj.command, 80) : ""); case "read": From 9fa28705695ec289395f860c7811cfc89aa296f3 Mon Sep 17 00:00:00 2001 From: dprevoznik <58714078+dprevoznik@users.noreply.github.com> Date: Sat, 20 Jun 2026 14:41:25 +0000 Subject: [PATCH 02/11] Address self-review feedback on playwright_execute - Drop misleading "Defaults to 60" from timeout_sec description; the actual default lives in the Kernel SDK, not here. 
- Expose result/stdout/stderr/error on PlaywrightDetails so library consumers can branch on the structured execution result without re-parsing tool content text. - Guard formatPlaywrightResult against non-JSON-serializable returns (e.g. BigInt, circular refs) so a successful Playwright run never becomes a tool-level error. - Sync package-lock.json to match the cua-cli 0.1.1 bump in a7cdc07. Co-Authored-By: Claude Opus 4.7 --- package-lock.json | 2 +- packages/agent/src/tools.ts | 18 ++++++++++++++++-- packages/ai/src/providers/common.ts | 2 +- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/package-lock.json b/package-lock.json index d1618ca..b2cf080 100644 --- a/package-lock.json +++ b/package-lock.json @@ -6102,7 +6102,7 @@ }, "packages/cli": { "name": "@onkernel/cua-cli", - "version": "0.1.0", + "version": "0.1.1", "license": "MIT", "dependencies": { "@earendil-works/pi-coding-agent": "0.79.1", diff --git a/packages/agent/src/tools.ts b/packages/agent/src/tools.ts index 73e9c63..acc07f4 100644 --- a/packages/agent/src/tools.ts +++ b/packages/agent/src/tools.ts @@ -42,6 +42,10 @@ export interface NavigationDetails { export interface PlaywrightDetails { success: boolean; statusText: string; + result?: unknown; + stdout?: string; + stderr?: string; + error?: string; } type BatchTool = AgentTool; @@ -207,11 +211,21 @@ async function executePlaywrightTool(translator: InternalComputerTranslator, par const screenshot = await translator.screenshot(); content.push({ type: "image", data: screenshot.data.toString("base64"), mimeType: screenshot.mimeType }); - return { content, details: { success: execution.success, statusText } }; + const details: PlaywrightDetails = { success: execution.success, statusText }; + if (execution.result !== undefined) details.result = execution.result; + if (execution.stdout) details.stdout = execution.stdout; + if (execution.stderr) details.stderr = execution.stderr; + if (execution.error) details.error = execution.error; + 
return { content, details }; } function formatPlaywrightResult(result: unknown): string { - return typeof result === "string" ? result : JSON.stringify(result); + if (typeof result === "string") return result; + try { + return JSON.stringify(result); + } catch { + return String(result); + } } function errorMessage(err: unknown): string { diff --git a/packages/ai/src/providers/common.ts b/packages/ai/src/providers/common.ts index 6fe2f21..c392db5 100644 --- a/packages/ai/src/providers/common.ts +++ b/packages/ai/src/providers/common.ts @@ -303,7 +303,7 @@ export const CuaPlaywrightSchema = Type.Object( description: "Playwright/TypeScript to run against the live browser. `page`, `context`, and `browser` are in scope; end with a `return` to send a JSON-serializable value back. Example: \"await page.goto('https://example.com'); return await page.title();\"", }), - timeout_sec: Type.Optional(Type.Number({ description: "Maximum execution time in seconds. Defaults to 60." })), + timeout_sec: Type.Optional(Type.Number({ description: "Optional execution timeout in seconds." })), }, { additionalProperties: false }, ); From 49e6f155262283b9355fbbc002823dd394fd7605 Mon Sep 17 00:00:00 2001 From: dprevoznik <58714078+dprevoznik@users.noreply.github.com> Date: Sat, 20 Jun 2026 14:43:42 +0000 Subject: [PATCH 03/11] Document fresh-context-per-call in playwright_execute description Locals don't persist across calls but the browser session does. Without this, a model could write code in call N assuming variables from call N-1 are still in scope. 
Co-Authored-By: Claude Opus 4.7 --- packages/ai/src/providers/common.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/packages/ai/src/providers/common.ts b/packages/ai/src/providers/common.ts index c392db5..158fe99 100644 --- a/packages/ai/src/providers/common.ts +++ b/packages/ai/src/providers/common.ts @@ -342,6 +342,7 @@ export const CUA_NAVIGATION_TOOL_DESCRIPTION = "High-level browser navigation he export const CUA_PLAYWRIGHT_TOOL_DESCRIPTION = [ "Run Playwright/TypeScript directly against the live browser session for steps that are awkward as raw pointer/keyboard actions: precise DOM reads, form fills, data extraction, and waiting on selectors.", "`page`, `context`, and `browser` are in scope and the code may `return` a JSON-serializable value, which comes back as the result.", + "Each call runs in a fresh JS context — local variables do not persist across calls, but the browser session does (navigation, cookies, DOM state carry over via `page`/`context`/`browser`).", "Capture page state with a follow-up screenshot action rather than calling page.screenshot() inside the code.", ].join("\n"); From e96be94b9962ad90c88c9cc0d98bc2be84cde9cb Mon Sep 17 00:00:00 2001 From: dprevoznik <58714078+dprevoznik@users.noreply.github.com> Date: Sat, 20 Jun 2026 14:47:25 +0000 Subject: [PATCH 04/11] Restore documented timeout default and add the 300s max MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Earlier review feedback dropped "Defaults to 60" out of a worry that the default lived in the SDK and could drift. The kernel.sh docs put both the default (60s) and the cap (300s) on the server, so the description is the authoritative place to surface them — the model can't choose a sensible timeout without that anchor. 
Co-Authored-By: Claude Opus 4.7 --- packages/ai/src/providers/common.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/ai/src/providers/common.ts b/packages/ai/src/providers/common.ts index 158fe99..4fbbace 100644 --- a/packages/ai/src/providers/common.ts +++ b/packages/ai/src/providers/common.ts @@ -303,7 +303,7 @@ export const CuaPlaywrightSchema = Type.Object( description: "Playwright/TypeScript to run against the live browser. `page`, `context`, and `browser` are in scope; end with a `return` to send a JSON-serializable value back. Example: \"await page.goto('https://example.com'); return await page.title();\"", }), - timeout_sec: Type.Optional(Type.Number({ description: "Optional execution timeout in seconds." })), + timeout_sec: Type.Optional(Type.Number({ description: "Optional execution timeout in seconds. Default 60, max 300." })), }, { additionalProperties: false }, ); From 5ddf0e50b9dbb6e47725d3af977403bc44586048 Mon Sep 17 00:00:00 2001 From: dprevoznik <58714078+dprevoznik@users.noreply.github.com> Date: Sat, 20 Jun 2026 14:52:38 +0000 Subject: [PATCH 05/11] Clamp playwright_execute timeout to the documented 300s server max Schema description tells the model "max 300" but nothing enforced it. A model that ignored the bound would have hit a confusing SDK-level failure depending on server behavior; this clamp keeps the client honest to the documented contract. 
Co-Authored-By: Claude Opus 4.7 --- packages/agent/src/translator/translator.ts | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/packages/agent/src/translator/translator.ts b/packages/agent/src/translator/translator.ts index 73343af..87dcabc 100644 --- a/packages/agent/src/translator/translator.ts +++ b/packages/agent/src/translator/translator.ts @@ -86,7 +86,10 @@ export class InternalComputerTranslator { } async executePlaywright(code: string, timeoutSec?: number): Promise { - const timeout = typeof timeoutSec === "number" && Number.isFinite(timeoutSec) && timeoutSec > 0 ? Math.trunc(timeoutSec) : undefined; + const timeout = + typeof timeoutSec === "number" && Number.isFinite(timeoutSec) && timeoutSec > 0 + ? Math.min(Math.trunc(timeoutSec), PLAYWRIGHT_MAX_TIMEOUT_SEC) + : undefined; return this.client.browsers.playwright.execute(this.sessionId, { code, ...(timeout !== undefined ? { timeout_sec: timeout } : {}), @@ -239,6 +242,8 @@ type KernelBatchAction = export type PlaywrightExecutionResult = Awaited>; +const PLAYWRIGHT_MAX_TIMEOUT_SEC = 300; + const CLICK_BUTTONS: ReadonlySet = new Set(["left", "right", "middle", "back", "forward"]); const DRAG_BUTTONS: ReadonlySet = new Set(["left", "right", "middle"]); From 849f6a4cfc7e05f0a207114f2c6068ec6e2c77c6 Mon Sep 17 00:00:00 2001 From: dprevoznik <58714078+dprevoznik@users.noreply.github.com> Date: Sat, 20 Jun 2026 18:18:45 +0000 Subject: [PATCH 06/11] Document playwright_execute in package READMEs - packages/agent: list playwright option alongside computerUseExtra and add a paragraph explaining the tool's behavior and tested-models scope. - packages/ai: list the new tool-definition factory, schema, constants, and CuaPlaywrightInput type in the API surface index. - packages/cli: document --playwright with a short explainer. 
Co-Authored-By: Claude Opus 4.7 --- packages/agent/README.md | 14 ++++++++++++++ packages/ai/README.md | 5 ++++- packages/cli/README.md | 10 ++++++++++ 3 files changed, 28 insertions(+), 1 deletion(-) diff --git a/packages/agent/README.md b/packages/agent/README.md index 03b1f20..f839093 100644 --- a/packages/agent/README.md +++ b/packages/agent/README.md @@ -98,6 +98,8 @@ Both classes mirror pi constructor shapes and behavior, with minimal additions: - CUA model refs (`"provider:model"`) accepted where pi expects a concrete model - `extraTools` to add your own pi tools alongside the built-in browser tools - `computerUseExtra: true` to let the model use a small navigation helper +- `playwright: true` to let the model run Playwright/TypeScript against the + live browser session If auth callbacks are omitted, both classes default to CUA env var conventions: - OpenAI: `OPENAI_API_KEY` @@ -124,6 +126,18 @@ URL or go back. `computerUseExtra: true` adds `computer_use_extra`, a provider-neutral escape hatch exposing `goto`, `back`, `forward`, and `url` so navigation works uniformly regardless of which model is driving. +Some steps are awkward as raw pointer/keyboard actions: precise DOM reads, +form fills, data extraction, or waiting on a specific selector. +`playwright: true` adds `playwright_execute`, which runs Playwright/TypeScript +directly against the live browser session. `page`, `context`, and `browser` +are in scope and the code may `return` a JSON-serializable value. Each call +runs in a fresh JS context (locals don't persist across calls) but the +browser session does carry over. A fresh screenshot is appended after every +call so the loop stays coherent. Playwright-level failures come back as tool +content (so the model can adapt) rather than thrown errors. Tested against +Anthropic and OpenAI computer-use models; CUA-specialized providers +(`tzafon`, `yutori`) may not emit calls for non-native tools. 
+ ### Model Switching `CuaAgent` follows pi `Agent` semantics: assign `agent.state.model` to a diff --git a/packages/ai/README.md b/packages/ai/README.md index 5bf1586..6d0da7a 100644 --- a/packages/ai/README.md +++ b/packages/ai/README.md @@ -309,12 +309,15 @@ definitions and executors; it is forwarded to the provider module's - `createCuaNavigationToolDefinition()`, `CuaNavigationSchema`, `CUA_NAVIGATION_TOOL_NAME` (`"computer_use_extra"`), `CUA_NAVIGATION_TOOL_DESCRIPTION` +- `createCuaPlaywrightToolDefinition()`, `CuaPlaywrightSchema`, + `CUA_PLAYWRIGHT_TOOL_NAME` (`"playwright_execute"`), + `CUA_PLAYWRIGHT_TOOL_DESCRIPTION` - `canonicalToolCallName(action)`, `canonicalToolCallArguments(action)` — map a normalized `CuaAction` back to its tool-call name/arguments - `normalizeGotoUrl(value)` — prefix bare hostnames with `https://` - Types: `CuaAction` (plus the 16 per-action interfaces), `CuaActionType`, `CuaMouseButton`, `CuaDragMouseButton`, `CuaBatchInput`, - `CuaNavigationInput`, `CuaToolExecutorSpec`, `ComputerToolsOptions`, + `CuaNavigationInput`, `CuaPlaywrightInput`, `CuaToolExecutorSpec`, `ComputerToolsOptions`, `ComputerToolCoordinateSystem` ### Provider registration diff --git a/packages/cli/README.md b/packages/cli/README.md index 3ae0fe2..90bda14 100644 --- a/packages/cli/README.md +++ b/packages/cli/README.md @@ -102,6 +102,16 @@ Configuration is by environment variable. There is no config file. Use `--thinking ` (`off | minimal | low | medium | high | xhigh`, default `low`) for providers that support reasoning effort. +## Playwright escape hatch + +Pass `--playwright` to expose the `playwright_execute` tool, letting the +model run Playwright/TypeScript directly against the live browser session +for steps that are awkward as raw pointer/keyboard actions (precise DOM +reads, form fills, data extraction, waiting on selectors). `page`, +`context`, and `browser` are in scope; the code may `return` a +JSON-serializable value. Off by default. 
Tested with Anthropic and OpenAI +computer-use models. + ## Output formats `--print` defaults to streaming text. Pass `-o jsonl` for one From 11afbbfc9a03617bc4589605f6eebb00ec77ba63 Mon Sep 17 00:00:00 2001 From: dprevoznik <58714078+dprevoznik@users.noreply.github.com> Date: Sat, 20 Jun 2026 18:46:08 +0000 Subject: [PATCH 07/11] Deslop: drop dead defensive checks around playwright_execute MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit formatPlaywrightResult's JSON.stringify try/catch guarded against non-serializable values, but execution.result came from the SDK after a JSON round trip through the wire — anything that survived that is already JSON-safe, so the catch arm is unreachable. The executePlaywright timeout chain checked typeof === "number" (dead, the parameter is TS-typed number | undefined) and Number.isFinite (redundant — timeoutSec > 0 already rejects NaN, and Math.min handles Infinity). Co-Authored-By: Claude Opus 4.7 --- packages/agent/src/tools.ts | 7 +------ packages/agent/src/translator/translator.ts | 2 +- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/packages/agent/src/tools.ts b/packages/agent/src/tools.ts index acc07f4..f05425f 100644 --- a/packages/agent/src/tools.ts +++ b/packages/agent/src/tools.ts @@ -220,12 +220,7 @@ async function executePlaywrightTool(translator: InternalComputerTranslator, par } function formatPlaywrightResult(result: unknown): string { - if (typeof result === "string") return result; - try { - return JSON.stringify(result); - } catch { - return String(result); - } + return typeof result === "string" ? 
result : JSON.stringify(result); } function errorMessage(err: unknown): string { diff --git a/packages/agent/src/translator/translator.ts b/packages/agent/src/translator/translator.ts index 87dcabc..5342d5a 100644 --- a/packages/agent/src/translator/translator.ts +++ b/packages/agent/src/translator/translator.ts @@ -87,7 +87,7 @@ export class InternalComputerTranslator { async executePlaywright(code: string, timeoutSec?: number): Promise { const timeout = - typeof timeoutSec === "number" && Number.isFinite(timeoutSec) && timeoutSec > 0 + timeoutSec !== undefined && timeoutSec > 0 ? Math.min(Math.trunc(timeoutSec), PLAYWRIGHT_MAX_TIMEOUT_SEC) : undefined; return this.client.browsers.playwright.execute(this.sessionId, { From 302fc14c424bc22f1362d25a6d08ef3e5ef548d2 Mon Sep 17 00:00:00 2001 From: dprevoznik <58714078+dprevoznik@users.noreply.github.com> Date: Sat, 20 Jun 2026 19:00:46 +0000 Subject: [PATCH 08/11] Update playwright_execute docs after Tzafon and Yutori e2e verification MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Empirical results show CUA-specialized providers (Tzafon, Yutori) do emit playwright_execute calls — earlier docs were overly cautious. Yutori in particular demonstrates the failure-as-content design well: it iterated through two wrong-API attempts (page.querySelector, bare document) before reading the stderr/error blocks and landing on page.evaluate(), which throwing would have prevented. Co-Authored-By: Claude Opus 4.7 --- packages/agent/README.md | 6 +++--- packages/cli/README.md | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/packages/agent/README.md b/packages/agent/README.md index f839093..f6922bd 100644 --- a/packages/agent/README.md +++ b/packages/agent/README.md @@ -134,9 +134,9 @@ are in scope and the code may `return` a JSON-serializable value. Each call runs in a fresh JS context (locals don't persist across calls) but the browser session does carry over. 
A fresh screenshot is appended after every call so the loop stays coherent. Playwright-level failures come back as tool -content (so the model can adapt) rather than thrown errors. Tested against -Anthropic and OpenAI computer-use models; CUA-specialized providers -(`tzafon`, `yutori`) may not emit calls for non-native tools. +content (so the model can adapt) rather than thrown errors. Verified e2e +against Anthropic, Tzafon, and Yutori CUA models; OpenAI and Google are +unit-tested. ### Model Switching diff --git a/packages/cli/README.md b/packages/cli/README.md index 90bda14..0269493 100644 --- a/packages/cli/README.md +++ b/packages/cli/README.md @@ -109,8 +109,8 @@ model run Playwright/TypeScript directly against the live browser session for steps that are awkward as raw pointer/keyboard actions (precise DOM reads, form fills, data extraction, waiting on selectors). `page`, `context`, and `browser` are in scope; the code may `return` a -JSON-serializable value. Off by default. Tested with Anthropic and OpenAI -computer-use models. +JSON-serializable value. Off by default. Verified e2e with Anthropic, +Tzafon, and Yutori CUA models. ## Output formats From 565fe015f5045f2f8b9888bbbdf5c1a086e065ac Mon Sep 17 00:00:00 2001 From: dprevoznik <58714078+dprevoznik@users.noreply.github.com> Date: Sat, 20 Jun 2026 20:17:46 +0000 Subject: [PATCH 09/11] Wrap playwright_execute screenshot in the same try as the execution Matches executeBatchTool's shape: the trailing translator.screenshot() lives inside the same try/catch as the underlying work, so any failure in the pipeline produces a single wrapped tool error rather than diverging based on which step failed. 
Co-Authored-By: Claude Opus 4.7 --- packages/agent/src/tools.ts | 59 ++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 30 deletions(-) diff --git a/packages/agent/src/tools.ts b/packages/agent/src/tools.ts index f05425f..8375270 100644 --- a/packages/agent/src/tools.ts +++ b/packages/agent/src/tools.ts @@ -13,7 +13,7 @@ import { type CuaToolExecutorSpec, type TSchema, } from "@onkernel/cua-ai"; -import { InternalComputerTranslator, type KernelBrowser, type PlaywrightExecutionResult } from "./translator/translator"; +import { InternalComputerTranslator, type KernelBrowser } from "./translator/translator"; import type { AgentTool, AgentToolResult } from "@earendil-works/pi-agent-core"; export interface ComputerToolOptions { @@ -184,39 +184,38 @@ async function executeNavigationTool(translator: InternalComputerTranslator, par } async function executePlaywrightTool(translator: InternalComputerTranslator, params: CuaPlaywrightInput): Promise> { - let execution: PlaywrightExecutionResult; try { - execution = await translator.executePlaywright(params.code, params.timeout_sec); - } catch (err) { - throw new Error(`playwright_execute failed: ${errorMessage(err)}`, { cause: err }); - } - - const content: ToolContent = []; - if (execution.result !== undefined) { - content.push({ type: "text", text: `result: ${formatPlaywrightResult(execution.result)}` }); - } - if (execution.stdout?.trim()) { - content.push({ type: "text", text: `stdout:\n${execution.stdout.trimEnd()}` }); - } - if (execution.stderr?.trim()) { - content.push({ type: "text", text: `stderr:\n${execution.stderr.trimEnd()}` }); - } - if (!execution.success) { - content.push({ type: "text", text: `error: ${execution.error ?? "playwright execution reported failure"}` }); - } + const execution = await translator.executePlaywright(params.code, params.timeout_sec); - const statusText = execution.success ? "Playwright executed successfully." : `Playwright execution failed: ${execution.error ?? 
"unknown error"}`; - if (content.length === 0) content.push({ type: "text", text: statusText }); + const content: ToolContent = []; + if (execution.result !== undefined) { + content.push({ type: "text", text: `result: ${formatPlaywrightResult(execution.result)}` }); + } + if (execution.stdout?.trim()) { + content.push({ type: "text", text: `stdout:\n${execution.stdout.trimEnd()}` }); + } + if (execution.stderr?.trim()) { + content.push({ type: "text", text: `stderr:\n${execution.stderr.trimEnd()}` }); + } + if (!execution.success) { + content.push({ type: "text", text: `error: ${execution.error ?? "playwright execution reported failure"}` }); + } - const screenshot = await translator.screenshot(); - content.push({ type: "image", data: screenshot.data.toString("base64"), mimeType: screenshot.mimeType }); + const statusText = execution.success ? "Playwright executed successfully." : `Playwright execution failed: ${execution.error ?? "unknown error"}`; + if (content.length === 0) content.push({ type: "text", text: statusText }); - const details: PlaywrightDetails = { success: execution.success, statusText }; - if (execution.result !== undefined) details.result = execution.result; - if (execution.stdout) details.stdout = execution.stdout; - if (execution.stderr) details.stderr = execution.stderr; - if (execution.error) details.error = execution.error; - return { content, details }; + const screenshot = await translator.screenshot(); + content.push({ type: "image", data: screenshot.data.toString("base64"), mimeType: screenshot.mimeType }); + + const details: PlaywrightDetails = { success: execution.success, statusText }; + if (execution.result !== undefined) details.result = execution.result; + if (execution.stdout) details.stdout = execution.stdout; + if (execution.stderr) details.stderr = execution.stderr; + if (execution.error) details.error = execution.error; + return { content, details }; + } catch (err) { + throw new Error(`playwright_execute failed: 
${errorMessage(err)}`, { cause: err }); + } } function formatPlaywrightResult(result: unknown): string { From d746e8c6cd2abbe2fb3ee4b7a1b228190568fdc5 Mon Sep 17 00:00:00 2001 From: dprevoznik <58714078+dprevoznik@users.noreply.github.com> Date: Sat, 20 Jun 2026 20:26:02 +0000 Subject: [PATCH 10/11] Bugbot fixes: sub-second timeout floor and PlaywrightDetails TSDoc - executePlaywright: timeout_sec values below 1s previously truncated to 0 and were forwarded to the SDK, which differs from omitting the field. Only forward the truncated value when it is at least 1s (it is not clamped up to 1s); anything sub-second is omitted and falls back to "use server default". - Document PlaywrightDetails fields so library consumers know what each one means without reading the executor source. 
+ */ export interface PlaywrightDetails { success: boolean; statusText: string; diff --git a/packages/agent/src/translator/translator.ts b/packages/agent/src/translator/translator.ts index 5342d5a..26d309c 100644 --- a/packages/agent/src/translator/translator.ts +++ b/packages/agent/src/translator/translator.ts @@ -86,10 +86,10 @@ export class InternalComputerTranslator { } async executePlaywright(code: string, timeoutSec?: number): Promise { - const timeout = - timeoutSec !== undefined && timeoutSec > 0 - ? Math.min(Math.trunc(timeoutSec), PLAYWRIGHT_MAX_TIMEOUT_SEC) - : undefined; + const truncated = timeoutSec !== undefined ? Math.trunc(timeoutSec) : undefined; + const timeout = truncated !== undefined && truncated >= 1 + ? Math.min(truncated, PLAYWRIGHT_MAX_TIMEOUT_SEC) + : undefined; return this.client.browsers.playwright.execute(this.sessionId, { code, ...(timeout !== undefined ? { timeout_sec: timeout } : {}), From f855cf105d518990ac44ae821de1d8d15911c57c Mon Sep 17 00:00:00 2001 From: dprevoznik <58714078+dprevoznik@users.noreply.github.com> Date: Tue, 23 Jun 2026 20:06:35 +0000 Subject: [PATCH 11/11] Drop auto-appended screenshot from playwright_execute; clarify stdout/stderr details MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit playwright_execute is frequently a pure read where forcing a screenshot wastes image tokens and latency. Let the model request one on a follow-up turn. The existing content.length === 0 → statusText fallback keeps content non-empty for side-effect-only calls. Also tighten the PlaywrightDetails TSDoc for stdout/stderr to reflect that details captures raw daemon output (potentially whitespace-only), while the model-facing content blocks only surface trimmed non-empty output. - packages/agent/src/tools.ts: drop screenshot append in executePlaywrightTool; update PlaywrightDetails TSDoc for stdout/stderr. 
- packages/agent/README.md and packages/ai/src/providers/common.ts: reword to make explicit no screenshot is returned automatically. - packages/agent/test/tool-exhaustiveness.test.ts: flip the trailing-image assertions to assert no image is appended; drop the unused captureScreenshot mocks; add a side-effect-only case that hits the statusText fallback. --- packages/agent/README.md | 7 +++-- packages/agent/src/tools.ts | 7 ++--- .../agent/test/tool-exhaustiveness.test.ts | 31 ++++++++++++++++--- packages/ai/src/providers/common.ts | 2 +- 4 files changed, 33 insertions(+), 14 deletions(-) diff --git a/packages/agent/README.md b/packages/agent/README.md index f6922bd..cb60d4f 100644 --- a/packages/agent/README.md +++ b/packages/agent/README.md @@ -132,9 +132,10 @@ form fills, data extraction, or waiting on a specific selector. directly against the live browser session. `page`, `context`, and `browser` are in scope and the code may `return` a JSON-serializable value. Each call runs in a fresh JS context (locals don't persist across calls) but the -browser session does carry over. A fresh screenshot is appended after every -call so the loop stays coherent. Playwright-level failures come back as tool -content (so the model can adapt) rather than thrown errors. Verified e2e +browser session does carry over. No screenshot is returned automatically; +request one on a follow-up turn when the model needs to see the page. +Playwright-level failures come back as tool content (so the model can adapt) +rather than thrown errors. Verified e2e against Anthropic, Tzafon, and Yutori CUA models; OpenAI and Google are unit-tested. diff --git a/packages/agent/src/tools.ts b/packages/agent/src/tools.ts index fecfec2..a326116 100644 --- a/packages/agent/src/tools.ts +++ b/packages/agent/src/tools.ts @@ -49,8 +49,8 @@ export interface NavigationDetails { * that case the failure is also surfaced as tool content for the model. 
* - `statusText` — short human-readable status (success or failure summary). * - `result` — present only when the code returned a JSON-serializable value. - * - `stdout`/`stderr` — present only when the daemon captured output on that - * stream during execution. + * - `stdout`/`stderr` — raw daemon output, present whenever the daemon + * reported a non-empty value on that stream (may be whitespace-only). * - `error` — present only when `success` is `false`; the error message from * the daemon. */ @@ -219,9 +219,6 @@ async function executePlaywrightTool(translator: InternalComputerTranslator, par const statusText = execution.success ? "Playwright executed successfully." : `Playwright execution failed: ${execution.error ?? "unknown error"}`; if (content.length === 0) content.push({ type: "text", text: statusText }); - const screenshot = await translator.screenshot(); - content.push({ type: "image", data: screenshot.data.toString("base64"), mimeType: screenshot.mimeType }); - const details: PlaywrightDetails = { success: execution.success, statusText }; if (execution.result !== undefined) details.result = execution.result; if (execution.stdout) details.stdout = execution.stdout; diff --git a/packages/agent/test/tool-exhaustiveness.test.ts b/packages/agent/test/tool-exhaustiveness.test.ts index ac905bc..af8c1ba 100644 --- a/packages/agent/test/tool-exhaustiveness.test.ts +++ b/packages/agent/test/tool-exhaustiveness.test.ts @@ -86,7 +86,7 @@ describe("Cua tool executor coverage", () => { expect(result.content.at(-1)).toMatchObject({ type: "image", mimeType: "image/png" }); }); - it("runs the playwright_execute tool and appends a screenshot", async () => { + it("runs the playwright_execute tool and returns result + stdout as tool content", async () => { const calls: Array<{ id: string; body: { code: string; timeout_sec?: number } }> = []; const runtime = resolveCuaRuntimeSpec("openai:gpt-5.5"); const tools = createCuaComputerTools({ @@ -99,7 +99,6 @@ describe("Cua tool 
executor coverage", () => { return { success: true, result: "Example Domain", stdout: "logged\n", stderr: "" }; }, }, - computer: { captureScreenshot: async () => new Response(tinyPng) }, }, } as unknown as Kernel, toolExecutors: runtime.toolExecutors, @@ -113,10 +112,33 @@ describe("Cua tool executor coverage", () => { expect(calls).toEqual([{ id: "browser_123", body: { code: "return await page.title();", timeout_sec: 30 } }]); expect(result.content[0]).toMatchObject({ type: "text", text: "result: Example Domain" }); expect(result.content.some((block) => block.type === "text" && block.text === "stdout:\nlogged")).toBe(true); - expect(result.content.at(-1)).toMatchObject({ type: "image", mimeType: "image/png" }); + expect(result.content.every((block) => block.type !== "image")).toBe(true); expect(result.details).toMatchObject({ success: true }); }); + it("falls back to statusText for side-effect-only playwright_execute calls", async () => { + const runtime = resolveCuaRuntimeSpec("openai:gpt-5.5"); + const tools = createCuaComputerTools({ + browser, + client: { + browsers: { + playwright: { execute: async () => ({ success: true }) }, + }, + } as unknown as Kernel, + toolExecutors: runtime.toolExecutors, + playwright: true, + }); + const playwright = tools.find((tool) => tool.name === "playwright_execute"); + expect(playwright).toBeDefined(); + + const result = await playwright!.execute("call_1", { code: "await page.click('#submit')" }); + + expect(result.content).toEqual([ + { type: "text", text: "Playwright executed successfully." }, + ]); + expect(result.details).toMatchObject({ success: true, statusText: "Playwright executed successfully." 
}); + }); + it("surfaces playwright_execute failures as tool content without throwing", async () => { const runtime = resolveCuaRuntimeSpec("openai:gpt-5.5"); const tools = createCuaComputerTools({ @@ -124,7 +146,6 @@ describe("Cua tool executor coverage", () => { client: { browsers: { playwright: { execute: async () => ({ success: false, error: "boom", stderr: "stack" }) }, - computer: { captureScreenshot: async () => new Response(tinyPng) }, }, } as unknown as Kernel, toolExecutors: runtime.toolExecutors, @@ -137,7 +158,7 @@ describe("Cua tool executor coverage", () => { expect(result.content.some((block) => block.type === "text" && block.text.includes("error: boom"))).toBe(true); expect(result.content.some((block) => block.type === "text" && block.text === "stderr:\nstack")).toBe(true); - expect(result.content.at(-1)).toMatchObject({ type: "image", mimeType: "image/png" }); + expect(result.content.every((block) => block.type !== "image")).toBe(true); expect(result.details).toMatchObject({ success: false }); }); }); diff --git a/packages/ai/src/providers/common.ts b/packages/ai/src/providers/common.ts index 4fbbace..e829a6c 100644 --- a/packages/ai/src/providers/common.ts +++ b/packages/ai/src/providers/common.ts @@ -343,7 +343,7 @@ export const CUA_PLAYWRIGHT_TOOL_DESCRIPTION = [ "Run Playwright/TypeScript directly against the live browser session for steps that are awkward as raw pointer/keyboard actions: precise DOM reads, form fills, data extraction, and waiting on selectors.", "`page`, `context`, and `browser` are in scope and the code may `return` a JSON-serializable value, which comes back as the result.", "Each call runs in a fresh JS context — local variables do not persist across calls, but the browser session does (navigation, cookies, DOM state carry over via `page`/`context`/`browser`).", - "Capture page state with a follow-up screenshot action rather than calling page.screenshot() inside the code.", + "No screenshot is returned automatically; 
request one with a follow-up screenshot action when you need to see the page, rather than calling page.screenshot() inside the code.", ].join("\n"); export interface ComputerToolsOptions {