From ad69072a5c88b87b9775db5590fbfca570b4255c Mon Sep 17 00:00:00 2001
From: dprevoznik <58714078+dprevoznik@users.noreply.github.com>
Date: Tue, 23 Jun 2026 21:43:08 +0000
Subject: [PATCH] Drop auto-appended screenshot from computer_use_extra
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The navigation tool used to append a fresh screenshot after every call.
Mirror playwright_execute instead: return only the status text and let the
model request a screenshot on a follow-up turn when it actually needs to
see the page. This avoids wasted image tokens and added latency on
navigations that are performed only for their side effects.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 packages/agent/README.md                      |  4 +++-
 packages/agent/src/tools.ts                   |  6 +----
 .../agent/test/tool-exhaustiveness.test.ts    | 24 +++++++++++++++++++
 3 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/packages/agent/README.md b/packages/agent/README.md
index cb60d4f..1832ccc 100644
--- a/packages/agent/README.md
+++ b/packages/agent/README.md
@@ -124,7 +124,9 @@ Not every provider's native computer-use vocabulary includes browser
 navigation — some models can click and type but have no direct way to open a
 URL or go back. `computerUseExtra: true` adds `computer_use_extra`, a
 provider-neutral escape hatch exposing `goto`, `back`, `forward`, and `url`
-so navigation works uniformly regardless of which model is driving.
+so navigation works uniformly regardless of which model is driving. No
+screenshot is attached to the result; the model must request one on a
+follow-up turn when it needs to see the page.
 
 Some steps are awkward as raw pointer/keyboard actions: precise DOM reads,
 form fills, data extraction, or waiting on a specific selector.
diff --git a/packages/agent/src/tools.ts b/packages/agent/src/tools.ts
index a326116..0f725dd 100644
--- a/packages/agent/src/tools.ts
+++ b/packages/agent/src/tools.ts
@@ -185,12 +185,8 @@ async function executeNavigationTool(translator: InternalComputerTranslator, par
 		} else {
 			await translator.executeBatch([{ type: action }]);
 		}
-		const screenshot = await translator.screenshot();
 		return {
-			content: [
-				{ type: "text", text: statusText },
-				{ type: "image", data: screenshot.data.toString("base64"), mimeType: screenshot.mimeType },
-			],
+			content: [{ type: "text", text: statusText }],
 			details: { action, statusText, ...(url ? { url } : {}) },
 		};
 	} catch (err) {
diff --git a/packages/agent/test/tool-exhaustiveness.test.ts b/packages/agent/test/tool-exhaustiveness.test.ts
index af8c1ba..c2cbfb2 100644
--- a/packages/agent/test/tool-exhaustiveness.test.ts
+++ b/packages/agent/test/tool-exhaustiveness.test.ts
@@ -86,6 +86,30 @@ describe("Cua tool executor coverage", () => {
 		expect(result.content.at(-1)).toMatchObject({ type: "image", mimeType: "image/png" });
 	});
 
+	it("runs computer_use_extra without auto-appending a screenshot", async () => {
+		const runtime = resolveCuaRuntimeSpec("openai:gpt-5.5");
+		const tools = createCuaComputerTools({
+			browser,
+			client: {
+				browsers: {
+					computer: {
+						batch: async () => undefined, // no-op stub
+						captureScreenshot: async () => new Response(tinyPng), // stubbed, but its image must not appear in the result
+					},
+				},
+			} as unknown as Kernel, // partial mock: only the two methods above are stubbed
+			toolExecutors: runtime.toolExecutors,
+			computerUseExtra: true,
+		});
+		const nav = tools.find((tool) => tool.name === "computer_use_extra");
+		expect(nav).toBeDefined();
+
+		const result = await nav!.execute("call_1", { action: "back" });
+
+		expect(result.content).toEqual([{ type: "text", text: "back executed successfully." }]); // exactly one text item, no image entry
+		expect(result.details).toMatchObject({ action: "back", statusText: "back executed successfully." });
+	});
+
 	it("runs the playwright_execute tool and returns result + stdout as tool content", async () => {
 		const calls: Array<{ id: string; body: { code: string; timeout_sec?: number } }> = [];
 		const runtime = resolveCuaRuntimeSpec("openai:gpt-5.5");