From ad69072a5c88b87b9775db5590fbfca570b4255c Mon Sep 17 00:00:00 2001 From: dprevoznik <58714078+dprevoznik@users.noreply.github.com> Date: Tue, 23 Jun 2026 21:43:08 +0000 Subject: [PATCH] Drop auto-appended screenshot from computer_use_extra MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The navigation tool used to append a fresh screenshot after every call. Mirror playwright_execute and let the model request a screenshot on a follow-up turn when it actually needs to see the page — avoids wasted image tokens + latency on side-effect-only navs. Co-Authored-By: Claude Opus 4.7 --- packages/agent/README.md | 4 +++- packages/agent/src/tools.ts | 6 +---- .../agent/test/tool-exhaustiveness.test.ts | 24 +++++++++++++++++++ 3 files changed, 28 insertions(+), 6 deletions(-) diff --git a/packages/agent/README.md b/packages/agent/README.md index cb60d4f..1832ccc 100644 --- a/packages/agent/README.md +++ b/packages/agent/README.md @@ -124,7 +124,9 @@ Not every provider's native computer-use vocabulary includes browser navigation — some models can click and type but have no direct way to open a URL or go back. `computerUseExtra: true` adds `computer_use_extra`, a provider-neutral escape hatch exposing `goto`, `back`, `forward`, and `url` -so navigation works uniformly regardless of which model is driving. +so navigation works uniformly regardless of which model is driving. No +screenshot is returned automatically; request one on a follow-up turn when +the model needs to see the page. Some steps are awkward as raw pointer/keyboard actions: precise DOM reads, form fills, data extraction, or waiting on a specific selector. diff --git a/packages/agent/src/tools.ts b/packages/agent/src/tools.ts index a326116..0f725dd 100644 --- a/packages/agent/src/tools.ts +++ b/packages/agent/src/tools.ts @@ -185,12 +185,8 @@ async function executeNavigationTool(translator: InternalComputerTranslator, par } else { await translator.executeBatch([{ type: action }]); } - const screenshot = await translator.screenshot(); return { - content: [ - { type: "text", text: statusText }, - { type: "image", data: screenshot.data.toString("base64"), mimeType: screenshot.mimeType }, - ], + content: [{ type: "text", text: statusText }], details: { action, statusText, ...(url ? { url } : {}) }, }; } catch (err) { diff --git a/packages/agent/test/tool-exhaustiveness.test.ts b/packages/agent/test/tool-exhaustiveness.test.ts index af8c1ba..c2cbfb2 100644 --- a/packages/agent/test/tool-exhaustiveness.test.ts +++ b/packages/agent/test/tool-exhaustiveness.test.ts @@ -86,6 +86,30 @@ describe("Cua tool executor coverage", () => { expect(result.content.at(-1)).toMatchObject({ type: "image", mimeType: "image/png" }); }); + it("runs computer_use_extra without auto-appending a screenshot", async () => { + const runtime = resolveCuaRuntimeSpec("openai:gpt-5.5"); + const tools = createCuaComputerTools({ + browser, + client: { + browsers: { + computer: { + batch: async () => undefined, + captureScreenshot: async () => new Response(tinyPng), + }, + }, + } as unknown as Kernel, + toolExecutors: runtime.toolExecutors, + computerUseExtra: true, + }); + const nav = tools.find((tool) => tool.name === "computer_use_extra"); + expect(nav).toBeDefined(); + + const result = await nav!.execute("call_1", { action: "back" }); + + expect(result.content).toEqual([{ type: "text", text: "back executed successfully." }]); + expect(result.details).toMatchObject({ action: "back", statusText: "back executed successfully." }); + }); + it("runs the playwright_execute tool and returns result + stdout as tool content", async () => { const calls: Array<{ id: string; body: { code: string; timeout_sec?: number } }> = []; const runtime = resolveCuaRuntimeSpec("openai:gpt-5.5");